"src/targets/vscode:/vscode.git/clone" did not exist on "7604ecf5b9cd04f9df4219d6f1ef998951ed7449"
Commit 764b3a75 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
#!/bin/bash
# Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0.
# 2016 LeSpeech (Author: Xingyu Na)
# This script prepares the data directory for the thchs30 recipe.
# It reads the corpus and generates wav.scp and transcriptions.
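# Usage: local/thchs-30_data_prep.sh <corpus-dir> <data-dir>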
corpus_dir=$1
data=$2
echo "**** Creating THCHS-30 data folder ****"
mkdir -p $data/{train,dev,test}
#create wav.scp, utt2spk.scp, spk2utt.scp, text
(
for x in train dev test; do
echo "cleaning $data/$x"
part=$data/$x
rm -rf $part/{wav.scp,utt2spk,spk2utt,text}
echo "preparing scps and text in $part"
# Updated "for loop" to fix a compatibility issue on Mac. Contributed by Xi Chen, 03/06/2018.
for nn in `find $corpus_dir/$x -name "*.wav" | sort -u | xargs -I {} basename {} .wav`; do
spkid=`echo $nn | awk -F"_" '{print "" $1}'`
spk_char=`echo $spkid | sed 's/\([A-Z]\).*/\1/'`
spk_num=`echo $spkid | sed 's/[A-Z]\([0-9]\)/\1/'`
spkid=$(printf '%s%.2d' "$spk_char" "$spk_num")
utt_num=`echo $nn | awk -F"_" '{print $2}'`
uttid=$(printf '%s%.2d_%.3d' "$spk_char" "$spk_num" "$utt_num")
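# e.g. nn=A2_31 -> spk_char=A, spk_num=2, spkid=A02, uttid=A02_031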
echo $uttid $corpus_dir/$x/$nn.wav >> $part/wav.scp
echo $uttid $spkid >> $part/utt2spk
echo $uttid `sed -n 1p $corpus_dir/data/$nn.wav.trn` | sed 's/ l =//' >> $part/text
done
sort $part/wav.scp -o $part/wav.scp
sort $part/utt2spk -o $part/utt2spk
sort $part/text -o $part/text
tools/utt2spk_to_spk2utt.pl $part/utt2spk > $part/spk2utt
done
) || exit 1
tools/validate_data_dir.sh --no-feats $data/train || exit 1;
tools/validate_data_dir.sh --no-feats $data/dev || exit 1;
tools/validate_data_dir.sh --no-feats $data/test || exit 1;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# Copyright 2016 Tsinghua University (author: Dong Wang)
# Apache 2.0
# Adapted from librispeech recipe local/download_and_untar.sh
remove_archive=false
if [ "$1" == --remove-archive ]; then
remove_archive=true
shift
fi
if [ $# -ne 3 ]; then
echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
echo "e.g.: $0 /nfs/public/materials/data/thchs30-openslr www.openslr.org/resources/18 data_thchs30"
echo "With --remove-archive it will remove the archive after successfully un-tarring it."
echo "<corpus-part> can be one of: data_thchs30, test-noise, resource"
exit 1;
fi
data=$1
url=$2
part=$3
if [ ! -d "$data" ]; then
echo "$0: no such directory $data, make it"
mkdir -p $data
fi
part_ok=false
list="data_thchs30 test-noise resource"
for x in $list; do
if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
exit 1;
fi
if [ -z "$url" ]; then
echo "$0: empty URL base."
exit 1;
fi
if [ -f $data/$part/.complete ]; then
echo "$0: data part $part was already successfully extracted, nothing to do."
exit 0;
fi
sizes="6453425169 1971460210 24813708"
if [ -f $data/$part.tgz ]; then
size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
size_ok=false
for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
if ! $size_ok; then
echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
echo "does not equal the size of one of the archives."
rm $data/$part.tgz
else
echo "$data/$part.tgz exists and appears to be complete."
fi
fi
if [ ! -f $data/$part.tgz ]; then
if ! which wget >/dev/null; then
echo "$0: wget is not installed."
exit 1;
fi
full_url=$url/$part.tgz
echo "$0: downloading data from $full_url. This may take some time, please be patient."
cd $data
pwd
echo " wget --no-check-certificate $full_url"
if ! wget --no-check-certificate $full_url; then
echo "$0: error executing wget $full_url"
exit 1;
fi
fi
cd $data
if ! tar -xvzf $part.tgz; then
echo "$0: error un-tarring archive $data/$part.tgz"
exit 1;
fi
touch $data/$part/.complete
echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
if $remove_archive; then
echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
rm $data/$part.tgz
fi
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=6
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, range from 0 to num_nodes -1
# The first node/machine sets node_rank 0, the second one sets node_rank 1
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
# data
dbase=/ssd/nfs06/di.wu/open_source
aidatatang_url=www.openslr.org/resources/62
aishell_url=www.openslr.org/resources/33
magicdata_url=www.openslr.org/resources/68
primewords_url=www.openslr.org/resources/47
stcmds_url=www.openslr.org/resources/38
thchs_url=www.openslr.org/resources/18
nj=16
train_set=train
dev_set=dev
has_aishell2=false # The AISHELL2 train set is not publicly downloadable
# With this option true, the script assumes you have it in
# $dbase
has_tal=false # TAL data needs to be downloaded from Baidu SkyDrive
# With this option true, the script assumes you have
# TAL/TAL_ASR and TAL/TAL_ASR_mix in $dbase
data_type=raw # raw or shard
num_utts_per_shard=1000
shards_dir= # specify if you prefer to store the shards somewhere else
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
train_config=conf/train_conformer.yaml
# English modeling unit
# Optional 1. bpe 2. char
en_modeling_unit=bpe
dict=data/dict_$en_modeling_unit/lang_char.txt
cmvn=true
dir=exp/conformer
checkpoint=
# Using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="ctc_greedy_search ctc_prefix_beam_search"
decode_modes="$decode_modes attention attention_rescoring"
. tools/parse_options.sh || exit 1;
test_sets="aishell aidatatang magicdata thchs"
if $has_aishell2; then
test_sets="$test_sets aishell2"
fi
if $has_tal; then
test_sets="$test_sets tal_asr"
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# download all training data
local/aidatatang_download_and_untar.sh $dbase/aidatatang $aidatatang_url \
aidatatang_200zh || exit 1;
local/aishell_download_and_untar.sh $dbase/aishell $aishell_url \
data_aishell || exit 1;
local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \
train_set || exit 1;
local/primewords_download_and_untar.sh $dbase/primewords $primewords_url \
|| exit 1;
local/stcmds_download_and_untar.sh $dbase/stcmds $stcmds_url || exit 1;
local/thchs_download_and_untar.sh $dbase/thchs $thchs_url data_thchs30 || \
exit 1;
# download all test data
local/thchs_download_and_untar.sh $dbase/thchs $thchs_url test-noise \
|| exit 1;
local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \
dev_set || exit 1;
local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \
test_set || exit 1;
# TAL data needs to be downloaded from Baidu SkyDrive.
# The AISHELL-2 database is free for academic research, but commercial use
# requires permission. You need to request the data from the AISHELL company.
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/aidatatang_data_prep.sh $dbase/aidatatang/aidatatang_200zh \
data/aidatatang || exit 1;
local/aishell_data_prep.sh $dbase/aishell/data_aishell data/aishell \
|| exit 1;
local/thchs-30_data_prep.sh $dbase/thchs/data_thchs30 data/thchs || exit 1;
local/magicdata_data_prep.sh $dbase/magicdata/ data/magicdata || exit 1;
local/primewords_data_prep.sh $dbase/primewords data/primewords || exit 1;
local/stcmds_data_prep.sh $dbase/stcmds data/stcmds || exit 1;
if $has_tal; then
local/tal_data_prep.sh $dbase/TAL/TAL_ASR data/tal_asr || exit 1;
local/tal_mix_data_prep.sh $dbase/TAL/TAL_ASR_mix data/tal_mix || exit 1;
fi
if $has_aishell2; then
local/aishell2_data_prep.sh $dbase/aishell2/IOS data/aishell2/train \
|| exit 1;
local/aishell2_data_prep.sh $dbase/aishell2/IOS/dev data/aishell2/dev \
|| exit 1;
local/aishell2_data_prep.sh $dbase/aishell2/IOS/test data/aishell2/test \
|| exit 1;
fi
# Merge all data sets.
train_sets=aidatatang,aishell,magicdata,primewords,stcmds,thchs
dev_sets=aidatatang,aishell,magicdata,thchs
if $has_aishell2; then
train_sets=$train_sets,aishell2
dev_sets=$dev_sets,aishell2
fi
if $has_tal; then
train_sets=$train_sets,tal_asr,tal_mix
dev_sets=$dev_sets,tal_asr
fi
unrolled_train_sets=$(eval echo data/{$train_sets}/train)
unrolled_dev_sets=$(eval echo data/{$dev_sets}/dev)
tools/combine_data.sh data/train $unrolled_train_sets || exit 1;
tools/combine_data.sh data/dev $unrolled_dev_sets || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# For wav feature, just copy the data. Fbank extraction is done in training
mkdir -p data_${en_modeling_unit}
for x in ${train_set} ${dev_set}; do
cp -r data/$x data_${en_modeling_unit}
done
for x in ${test_sets}; do
cp -r data/$x/test data_${en_modeling_unit}/test_${x}
done
# Unify the data format for char and bpe modeling.
# Here we use ▁ as the delimiter between English words.
# Warning: it is the "▁" symbol, not the "_" symbol.
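# e.g. "i love 音乐" -> "I▁LOVE音乐" (uppercase, join adjacent English words with ▁, drop remaining spaces)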
for x in train dev; do
cp data_${en_modeling_unit}/${x}/text data_${en_modeling_unit}/${x}/text.org
paste -d " " <(cut -f 1 -d" " data_${en_modeling_unit}/${x}/text.org) \
<(cut -f 2- -d" " data_${en_modeling_unit}/${x}/text.org \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \
| sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
> data_${en_modeling_unit}/${x}/text
sed -i 's/\xEF\xBB\xBF//' data_${en_modeling_unit}/${x}/text
done
for x in ${test_sets}; do
cp data_${en_modeling_unit}/test_${x}/text \
data_${en_modeling_unit}/test_${x}/text.org
paste -d " " <(cut -f 1 -d" " data_${en_modeling_unit}/test_${x}/text.org) \
<(cut -f 2- -d" " data_${en_modeling_unit}/test_${x}/text.org \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \
| sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
> data_${en_modeling_unit}/test_${x}/text
sed -i 's/\xEF\xBB\xBF//' data_${en_modeling_unit}/test_${x}/text
done
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Compute cmvn"
# Here we use all the training data, you can sample some data to save time
if $cmvn; then
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data_${en_modeling_unit}/$train_set/global_cmvn
fi
fi
# This BPE model was trained on the LibriSpeech training set.
bpecode=conf/train_960_unigram5000.model
trans_type_ops=
enable_bpe=
if [ $en_modeling_unit = "bpe" ]; then
trans_type_ops="--trans_type cn_char_en_bpe"
enable_bpe=true
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 -m ${bpecode} \
data_${en_modeling_unit}/${train_set}/text ${trans_type_ops} \
| cut -f 2- -d" " | tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' \
| grep -v '·' | grep -v '“' | grep -v "”" | grep -v "\[" | grep -v "\]" \
| grep -v "…" | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "Prepare data, prepare required format"
feat_test_sets=""
for x in ${test_sets}; do
feat_test_sets=${feat_test_sets}" "test_${x}
done
for x in ${dev_set} ${train_set} ${feat_test_sets}; do
if [ $data_type == "shard" ]; then
sdir=${shards_dir:+$shards_dir/}shards_${en_modeling_unit}
mkdir -p $sdir
tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 data_${en_modeling_unit}/$x/wav.scp \
data_${en_modeling_unit}/$x/text $(realpath $sdir/$x) \
data_${en_modeling_unit}/$x/data.list
else
tools/make_raw_list.py data_${en_modeling_unit}/$x/wav.scp \
data_${en_modeling_unit}/$x/text data_${en_modeling_unit}/$x/data.list
fi
done
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# Better remove it manually before you start run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data_${en_modeling_unit}/$train_set/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data data_${en_modeling_unit}/$train_set/data.list \
--cv_data data_${en_modeling_unit}/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
${enable_bpe:+--bpe_model $bpecode} \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Test model, please specify the model you want to test by --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=16
ctc_weight=0.5
idx=0
for mode in ${decode_modes}; do
{
for x in ${test_sets}; do
{
test_name=test_${mode}${decoding_chunk_size:+_chunk$decoding_chunk_size}
test_dir=$dir/$test_name/${x}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data_${en_modeling_unit}/test_${x}/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
${enable_bpe:+--bpe_model $bpecode} \
--result_file $test_dir/text_${en_modeling_unit} \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cat $test_dir/text_${en_modeling_unit} | sed -e "s/▁/ /g" \
> $test_dir/text
cat data_${en_modeling_unit}/test_${x}/text | sed -e "s/▁/ /g" \
> data_${en_modeling_unit}/test_${x}/text.tmp
python tools/compute-wer.py --char=1 --v=1 \
data_${en_modeling_unit}/test_${x}/text.tmp $test_dir/text \
> $test_dir/wer
rm data_${en_modeling_unit}/test_${x}/text.tmp
}
done
} &
((idx+=1))
done
wait
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip \
--output_quant_file $dir/final_quant.zip
fi
../../../tools
../../../wenet
# w2v-conformer based end-to-end model for the OpenASR2021 challenge
This is an example of using an unsupervised pretrained w2v-conformer model to fine-tune on the [OpenASR2021](https://www.nist.gov/itl/iad/mig/openasr-challenge) constrained-plus tasks.
We pretrain conformer encoders with the wav2vec 2.0 pre-training method, which we call ch-w2v-conformer. The original pre-training works take raw waveforms as input; unlike them, we use MFCC features as inputs.
The ch-w2v-conformer model is pretrained on the following datasets:
- ISML datasets (6 languages, 70k hours): an internal dataset containing 40k hours of Chinese, Cantonese, Tibetan, Inner Mongolian, Inner Kazakh, and Uighur.
- Babel datasets (17 languages, 2k hours): Assamese, Bengali, Cantonese, Cebuano, Georgian, Haitian, Kazakh, Kurmanji, Lao, Pashto, Swahili, Tagalog, Tamil, Tok, Turkish, Vietnamese, Zulu.
After pretraining, we build an ASR system based on the CTC-attention structure. In very low-resource tasks, we find that building too many randomly initialized network structures on top of the pretrained conformer encoder destroys its transfer performance, so we only build a single-layer transformer decoder for joint training.
Pretrained model link: https://huggingface.co/emiyasstar/ch-w2v-conformer
## constrained-plus Task Performance
* Languages: Cantonese, Mongolian, Kazakh
* Config: conf/train_conformer_large_10h.yaml
* Feature info: using MFCC features, with dither 1.0, without CMVN
* Training info: lr 0.001, batch size 10, 4 gpus on V100, acc_grad 1, 80 epochs
* Decoding info: ctc_weight 0.5, average_num 35

Dev set results, trained with only the 10-hour training set:
## w2v-Conformer
| decoding_method | Cantonese (CER) | Mongolian (WER) |
|:-------------------:|:----:|:----:|
| ctc_greedy_search | 31.46 | 53.64 |
| ctc_prefix_search | 31.47 | 53.50 |
| attention_rescoring | 31.45 | 52.96 |
## Conformer (train from scratch)
| decoding_method | Cantonese (CER) | Mongolian (WER) |
|:-------------------:|:----:|:----:|
| ctc_greedy_search | 61.43 | 89.38 |
| ctc_prefix_search | 61.37 | 89.53 |
| attention_rescoring | 60.61 | 89.60 |
# A giant configuration file for all the BABEL languages,
# as well as some training configurations for training HMM-GMM systems
# for obtaining phoneme-level alignments, if you really want to do that.
# All paths starting with /export/* are set for the JHU/CLSP grid and should
# be changed appropriately for other users.
# Cantonese
train_data_dir_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build
train_data_list_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build/ct_train_openasr21_uniq
train_data_dir_101_FLP=/export/babel/data/101-cantonese/release-current/conversational/training
train_data_list_101_FLP=./conf/lists/101-cantonese/train.FullLP.list
dev10h_data_dir_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/dev
dev10h_data_list_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/dev/ct_dev_openasr21_uniq
lexicon_file_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build/reference_materials/lexicon.txt
lexiconFlags_101="--romanized --oov <unk>"
# Kazakh
train_data_dir_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/training
train_data_list_302=./conf/lists/302-kazakh/sub-train.list
train_data_dir_302_FLP=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/training
train_data_list_302_FLP=./conf/lists/302-kazakh/training.list
dev10h_data_dir_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/dev
dev10h_data_list_302=./conf/lists/302-kazakh/dev.list
lexicon_file_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/reference_materials/lexicon.sub-train.txt
lexiconFlags_302="--romanized --oov <unk>"
# Mongolian
train_data_dir_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build
train_data_list_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build/mn_train_openasr21
dev10h_data_dir_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/dev
dev10h_data_list_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/dev/mn_dev_openasr21
lexicon_file_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build/reference_materials/lexicon.txt
lexiconFlags_401="--romanized --oov <unk>"
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 24      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    attention_dropout_rate: 0.0
    input_layer: conv2d6 # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    macaron_style: True
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    cnn_module_norm: 'layer_norm'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 1024
    num_blocks: 1
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.7
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

raw_wav: True

# dataset related
dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    feats_type: mfcc
    mfcc_conf:
        num_mel_bins: 40
        frame_shift: 10
        frame_length: 25
        num_ceps: 40
        low_freq: 20
        high_freq: -400
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 10

grad_clip: 5
accum_grad: 1
max_epoch: 100
log_interval: 100

optim: adam
optim_conf:
    lr: 0.0004
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 15000
#!/bin/bash
# Dumps pipe-style wav entries (plus segments) to real audio files.
nj=1
. tools/parse_options.sh || exit 1;
inscp=$1
segments=$2
outscp=$3
data=$(dirname ${inscp})
if [ $# -eq 4 ]; then
logdir=$4
else
logdir=${data}/log
fi
mkdir -p ${logdir}
sox=`which sox`
[ ! -x $sox ] && echo "Could not find the sox program at $sox" && exit 1;
paste -d " " <(cut -f 1 -d " " $inscp) <(cut -f 2- -d " " $inscp | tr -t " " "#") \
> $data/wav_ori.scp
tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp
sed -i 's/ /,/g' $data/wav_segments.scp
sed -i 's/#/ /g' $data/wav_segments.scp
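# wav_segments.scp lines are now comma-separated (<uttid>,<pipe command>,<start>,<end>);
# the "#" placeholders inserted above kept the spaces inside the pipe command intact.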
rm -f $logdir/wav_*.slice
rm -f $logdir/*.log
split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_
for slice in `ls $logdir/wav_*.slice`; do
{
name=`basename -s .slice $slice`
mkdir -p ${data}/wavs/${name}
cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \
-v logdir=$logdir -v name=$name '{
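# Each input line is <uttid>,<pipe command>,<start>,<end>. We append
# "sox - <out.wav> trim <start> <duration>" to the pipe command and run it,
# e.g. "sph2pipe ... x.sph | sox - data/wavs/wav_00/utt1.wav trim 7.05 7.02".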
during=$4-$3
cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during;
system(cmd)
printf("%s %s/%s.wav\n", $1, data, $1);
}' | \
sort > ${data}/wavs_${name}.scp || exit 1;
} &
done
wait
cat ${data}/wavs_*.scp > $outscp
rm ${data}/wavs_*.scp
rm -f $data/{segments,wav_segments.scp,reco2file_and_channel,reco2dur}
tools/fix_data_dir.sh $data
#!/bin/bash
# This script replaces the command readlink -f (which is not portable).
# It turns a pathname into an absolute pathname, including following soft links.
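# e.g. "local/make_absolute.sh ./data/raw_train_data" prints something like
# /home/user/recipe/data/raw_train_data (a hypothetical absolute path).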
target_file=$1
cd $(dirname $target_file)
target_file=$(basename $target_file)
# Iterate down a (possible) chain of symlinks
while [ -L "$target_file" ]; do
target_file=$(readlink $target_file)
cd $(dirname $target_file)
target_file=$(basename $target_file)
done
# Compute the canonicalized name by finding the physical path
# for the directory we're in and appending the target file.
phys_dir=$(pwd -P)
result=$phys_dir/$target_file
echo $result
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0.
#Begin configuration
ignore_missing_txt=false #If the reference transcript txt is missing, \
#shall we ignore it or treat it as a fatal error?
#End configuration
echo "$0 $@" # Print the command line for logging
help_message="$0: create subset of the input directory (specified as the first directory).
The subset is specified by the second parameter.
The directory in which the subset should be created is the third parameter
Example:
$0 <source-corpus-dir> <subset-descriptor-list-file> <target-corpus-subset-dir>"
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [[ "$#" -ne "3" ]] ; then
echo -e "FATAL: wrong number of script parameters!\n\n"
printf "$help_message\n\n"
exit 1;
fi
input_data_dir=$1
input_data_list=$2
output_data_dir=$3
if [[ ! -d "$input_data_dir" ]] ; then
echo "FATAL: input data directory does not exist";
exit 1;
fi
if [[ ! -f "$input_data_list" ]] ; then
echo "FATAL: input data list file does not exist!";
exit 1;
fi
mkdir -p $output_data_dir/transcription
mkdir -p $output_data_dir/audio
abs_src_dir=`local/make_absolute.sh $input_data_dir`
abs_tgt_dir=`local/make_absolute.sh $output_data_dir`
echo "Making subset..."
for file_basename in `cat $input_data_list`; do
echo $file_basename
if [[ -e $abs_src_dir/audio/$file_basename.sph ]] ; then
ln -sf $abs_src_dir/audio/$file_basename.sph $abs_tgt_dir/audio || exit 1
else
if [[ -e $abs_src_dir/audio/$file_basename.wav ]] ; then
ln -sf $abs_src_dir/audio/$file_basename.wav $abs_tgt_dir/audio || exit 1
else
echo "File $abs_src_dir/audio/$file_basename.sph|wav does not exist!"
exit 1
fi
fi
if [[ -e $abs_src_dir/transcription/$file_basename.txt ]] ; then
ln -sf $abs_src_dir/transcription/$file_basename.txt $abs_tgt_dir/transcription || exit 1
else
echo "File $abs_src_dir/transcription/$file_basename.txt does not exist!"
if ! $ignore_missing_txt ; then
exit 1;
fi
fi
done
#!/usr/bin/env perl
use Getopt::Long;
########################################################################
#
# Script to prepare the Babel acoustic training data for Kaldi.
#
# - Place transcripts in a file named "text"
# Each line contains: utteranceID word1 word2 ...
#
# - Place the utterance-to-speaker map in a file named "utt2spk"
# Each line contains: utteranceID speakerID
# speakerID MUST be a prefix of the utteranceID
# Kaldi code does not require it, but some training scripts do.
#
# - Place the utterance-to-segment map in a file named "segments"
# Each line contains: utteranceID recordingID startTime endTime
#
# - Place the recordingID-to-waveformFile map in "wav.scp"
# Each line contains: recordingID Input_pipe_for_reading_waveform|
#
# - Place the speaker-utterance map in a file named "spk2utt"
# Each line contains: speakerID utteranceID_1 utteranceID_2 ...
# This is the inverse of the utt2spk mapping
#
# Note 1: the utteranceIDs in the first 3 files must match exactly, and
# the recordingIDSs in the last 2 files must match exactly.
#
# Note 2: Babel data formats and file-naming conventions are assumed.
#
# - The transcriptions and waveforms are in subdirectories named
# audio/<filename>.sph
# transcription/<filename>.txt
# There is 1 pair of files per recording, with extensions as above
#
# - The audio is in NIST sphere format, so sph2pipe may be used, e.g.
# BABEL_BP_101_11694_20111204_205320_inLine \
# /export/babel/sanjeev/kaldi-trunk/tools/sph2pipe_v2.5/sph2pipe \
# -f wav -p -c 1 \
# BABEL_BP_101_11694_20111204_205320_inLine.sph|
#
# - The filename contains speaker information, e.g.
# BABEL_BP_101_37210_20111102_170037_O1_scripted.sph -> 37210_A
# BABEL_BP_101_37210_20111102_172955_inLine.sph -> 37210_A
# BABEL_BP_101_37210_20111102_172955_outLine.sph -> 37210_B
# Specifically, the inLine speaker is the same as scripted
#
# - The transcription file has time marks in square brackets, e.g.
# [0.0]
# <no-speech> 喂 <no-speech>
# [7.05]
# 啊 听 听唔听到 啊 <no-speech> 你 而家 仲未 上课 系 嘛 <no-speech>
# [14.07]
#
# - If a vocabulary is provided, map all OOV tokens to an OOV symbol,
# and write out an OOV list with counts to a file named "oovCounts"
#
# If one or more word-fragment markers are provided, this script
# checks if an OOV token can be made in-vocabulary by stripping off
# the markers one by one from either end of the token.
#
# The default settings are
#
$vocabFile = ""; # No vocab file; nothing is mapped to OOV
$OOV_symbol = "<unk>"; # Default OOV symbol
$fragMarkers = ""; # No characters are word-fragment markers
#
# - Babel transcriptions contain 4 kinds of untranscribed words
#
# (()) designates unintelligible words
# <foreign> designates a word in another language
# <prompt> designates a sequence of pre-recorded words
# <overlap> designates two simultaneous foreground speakers
#
# This script maps them to OOV. They are not included in oovCounts
#
# - Babel transcriptions also contain a few non-linguistics tokens
#
# <lipsmack> map to a vocal noise symbol
# <breath> map to a vocal noise symbol
# <cough> map to a vocal noise symbol
# <laugh> map to a vocal noise symbol
#
# <click> map to a nonvocal noise symbol
# <ring> map to a nonvocal noise symbol
# <dtmf> map to a nonvocal noise symbol
# <int> map to a nonvocal noise symbol
#
# <no-speech> designates silence > 1 sec.
#
$vocalNoise = "<v-noise>";
$nVoclNoise = "<noise>";
$silence = "<silence>";
$icu_transform="";
#
########################################################################
GetOptions("fragmentMarkers=s" => \$fragMarkers,
"oov=s" => \$OOV_symbol,
"vocab=s" => \$vocabFile,
"icu-transform=s" => \$icu_transform
);
if ($#ARGV == 1) {
$inDir = $ARGV[0];
$outDir = $ARGV[1];
print STDERR ("$0: $inDir $outDir\n");
if($vocabFile) {
print STDERR ("\tLimiting transcriptions to words in $vocabFile\n");
print STDERR ("\tMapping OOV tokens to \"$OOV_symbol\"\n");
print STDERR ("\tif they remain OOV even after removing [$fragMarkers] from either end\n") if ($fragMarkers);
}
print STDERR ("$0 ADVICE: Use full path for the Input Directory\n") unless ($inDir=~m:^/:);
} else {
print STDERR ("Usage: $0 [--options] InputDir OutputDir\n");
print STDERR ("\t--vocab <file> File containing the permitted vocabulary\n");
print STDERR ("\t--oov <symbol> Use this symbol for OOV words (default <unk>)\n");
print STDERR ("\t--fragmentMarkers <chars> Remove these from ends of words to minimize OOVs (default none)\n");
exit(1);
}
########################################################################
# Read and save the vocabulary and map anything not in the vocab <unk>
########################################################################
if ($vocabFile) {
open (VOCAB, $vocabFile)
|| die "Unable to open vocabulary file $vocabFile";
$numWords = 0;
while (<VOCAB>) {
next unless (m:^([^\s]+):);
$numWords++ unless (exists $inVocab{$1}); # Don't count word repetitions
$inVocab{$1} = 1; # commonly found in lexicons
}
close(VOCAB);
print STDERR ("Read $numWords unique words from $vocabFile\n");
}
########################################################################
# First read segmentation information from all the transcription files
########################################################################
$TranscriptionDir = "$inDir/transcription";
if (-d $TranscriptionDir) {
@TranscriptionFiles = `ls ${TranscriptionDir}/*.txt`;
if ($#TranscriptionFiles >= 0) {
printf STDERR ("$0: Found %d .txt files in $TranscriptionDir\n", ($#TranscriptionFiles +1));
$numFiles = $numUtterances = $numWords = $numOOV = $numSilence = 0;
while ($filename = shift @TranscriptionFiles) {
$fileID = $filename; # To capture the base file name
$fileID =~ s:.+/::; # remove path prefix
$fileID =~ s:\.txt\s*$::; # remove file extension
# For each transcription file, extract and save segmentation data
$numUtterancesThisFile = 0;
$prevTimeMark = -1.0;
$text = "";
if ( $icu_transform ) {
$inputspec="uconv -f utf8 -t utf8 -x \"$icu_transform\" $filename |";
} else {
$inputspec=$filename;
}
open (TRANSCRIPT, $inputspec) || die "Unable to open $filename";
while ($line=<TRANSCRIPT>) {
chomp $line;
if ($line =~ m:^\[([0-9]+\.*[0-9]*)\]$:) {
$thisTimeMark = $1;
if ($thisTimeMark < $prevTimeMark) {
print STDERR ("$0 ERROR: Found segment with negative duration in $filename\n");
print STDERR ("\tStart time = $prevTimeMark, End time = $thisTimeMark\n");
print STDERR ("\tThis could be a sign of something seriously wrong!\n");
print STDERR ("\tFix the file by hand or remove it from the directory, and retry.\n");
exit(1);
}
if ($prevTimeMark<0) {
# Record the first timemark and continue
$prevTimeMark = $thisTimeMark;
next;
}
##################################################
# Create an utteranceID using fileID & start time
# - Assume Babel file naming conventions
# - Remove prefix: program_phase_language
# - inLine = scripted = spkr A, outLine = B
# - Move A/B so that utteranceIDs sort by spkr
# - Assume utterance start time < 10000 sec.
##################################################
$utteranceID = $fileID;
$utteranceID =~ s:[^_]+_[^_]+_[^_]+_::;
$utteranceID =~ s:([^_]+)_(.+)_(inLine|scripted):${1}_A_${2}:;
$utteranceID =~ s:([^_]+)_(.+)_outLine:${1}_B_${2}:;
$utteranceID .= sprintf ("_%06i", (100*$prevTimeMark));
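# e.g. fileID BABEL_BP_101_37210_20111102_172955_outLine with start time
# 7.05 sec yields utteranceID 37210_B_20111102_172955_000705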
##################################################
# Then save segmentation, transcription, speakerID
##################################################
if (exists $transcription{$utteranceID}) {
# utteranceIDs should be unique, but this one is not!
# Either time marks in the transcription file are bad,
# or something went wrong in generating the utteranceID
print STDERR ("$0 WARNING: Skipping duplicate utterance $utteranceID\n");
}
elsif ($text eq "") {
# Could be due to text filtering done below
# Output information to STDOUT to enable > /dev/null
print STDOUT ("$0: Skipping empty transcription $utteranceID\n");
} else {
$transcription{$utteranceID} = $text;
$startTime{$utteranceID} = $prevTimeMark;
$endTime{$utteranceID} = $thisTimeMark;
if ($utteranceID =~ m:([^_]+_[AB]).*:) {
$speakerID{$utteranceID} = $1;
} else {
# default: one speaker per audio file
$speakerID{$utteranceID} = $fileID;
}
$baseFileID{$utteranceID} = $fileID;
$numUtterancesThisFile++;
$numUtterances++;
$text = "";
}
$prevTimeMark = $thisTimeMark;
} else {
@tokens = split(/\s+/, $line);
$text = "";
while ($w = shift(@tokens)) {
# First, some Babel-specific transcription filtering
if (($w eq "<sta>")||($w eq "<male-to-female>")||($w eq "<female-to-male>")||($w eq "~")) {
next;
} elsif (($w eq "<lipsmack>")||($w eq "<breath>")||($w eq "<cough>")||($w eq "<laugh>")) {
$text .= " $vocalNoise";
$numWords++;
} elsif (($w eq "<click>")||($w eq "<ring>")||($w eq "<dtmf>")||($w eq "<int>")){
$text .= " $nVoclNoise";
$numWords++;
} elsif (($w eq "(())")||($w eq "<foreign>")||($w eq "<overlap>")||($w eq "<prompt>")) {
$text .= " $OOV_symbol";
$oovCount{$w}++;
$numOOV++;
$numWords++;
} elsif ($w eq "<no-speech>") {
$text .= " $silence";
$numSilence++;
} else {
# This is just a regular spoken word
if ($vocabFile && (! $inVocab{$w}) && $fragMarkers) {
# $w is a potential OOV token
# Remove fragMarkers to see if $w becomes in-vocabulary
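# e.g. with fragMarkers "-*~", the token "-wor-" is tried as "-wor",
# then "wor", stopping as soon as an in-vocabulary form is found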
while ($w =~ m:^(\S+[$fragMarkers]|[$fragMarkers]\S+)$:) {
if ($w =~ m:^(\S+)[$fragMarkers]$:) {
$w = $1;
last if ($inVocab{$w});
} elsif ($w =~m:^[$fragMarkers](\S+)$:) {
$w = $1;
last if ($inVocab{$w});
} else {
die "Logically, the program should never reach here!";
}
}
}
# If still an OOV, replace $w by $OOV_symbol
if ($vocabFile && (! $inVocab{$w})) {
# $w is definitely an OOV token
if (exists $oovCount{$w}) {
$oovCount{$w}++;
} else {
$oovCount{$w} = 1;
}
$w = $OOV_symbol;
$numOOV++;
}
$text .= " $w";
$numWords++;
}
}
$text =~ s:^\s+::; # Remove leading white space, if any
# Transcriptions must contain real words to be useful in training
$text =~ s:^(($OOV_symbol|$vocalNoise|$nVoclNoise|$silence)[ ]{0,1})+$::;
}
}
close(TRANSCRIPT);
if ($numUtterancesThisFile>0) {
$lastTimeMarkInFile{$fileID} = $prevTimeMark;
$numUtterancesInFile{$fileID} = $numUtterancesThisFile;
$numUtterancesThisFile = 0;
}
$numFiles++;
}
print STDERR ("$0: Recorded $numUtterances non-empty utterances from $numFiles files\n");
} else {
print STDERR ("$0 ERROR: No .txt files found $TranscriptionDir\n");
exit(1);
}
} else {
print STDERR ("$0 ERROR: No directory named $TranscriptionDir\n");
exit(1);
}
########################################################################
# Then verify existence of corresponding audio files and their durations
########################################################################
$AudioDir = "$inDir/audio";
if (-d $AudioDir) {
@AudioFiles = `ls ${AudioDir}/*.sph`;
if ($#AudioFiles >= 0) {
printf STDERR ("$0: Found %d .sph files in $AudioDir\n", ($#AudioFiles +1));
$numFiles = 0;
while ($filename = shift @AudioFiles) {
$fileID = $filename;
$fileID =~ s:.+/::; # remove path prefix
$fileID =~ s:\.sph\s*::; # remove file extension
if (exists $numUtterancesInFile{$fileID}) {
# Some portion of this file has training transcriptions
@Info = `head $filename`;
$SampleCount = -1;
$SampleRate = 8000; #default
while ($#Info>=0) {
$line = shift @Info;
$SampleCount = $1 if ($line =~ m:sample_count -i (\d+):);
$SampleRate = $1 if ($line =~ m:sample_rate -i (\d+):);
}
if ($SampleCount<0) {
# Unable to extract a valid duration from the sphere header
print STDERR ("Unable to extract duration: skipping file $filename");
} else {
$waveformName{$fileID} = $filename; chomp $waveformName{$fileID};
$duration{$fileID} = $SampleCount/$SampleRate;
$numFiles++;
}
} else {
# Could be due to text filtering resulting in an empty transcription
# Output information to STDOUT to enable > /dev/null
print STDOUT ("$0: No transcriptions for audio file ${fileID}.sph\n");
}
}
print STDERR ("$0: Recorded durations from headers of $numFiles .sph files\n");
} else {
print STDERR ("$0 NOTICE: No .sph files in $AudioDir\n");
}
@AudioFiles = `ls ${AudioDir}/*.wav`;
if ($#AudioFiles >= 0) {
$soxi=`which soxi` or die "Could not find soxi binary -- do you have sox installed?\n";
chomp $soxi;
printf STDERR ("$0: Found %d .wav files in $AudioDir\n", ($#AudioFiles +1));
print STDERR "Soxi found: $soxi\n";
$numFiles = 0;
while ($filename = shift @AudioFiles) {
$fileID = $filename;
$fileID =~ s:.+/::; # remove path prefix
$fileID =~ s:\.wav\s*::; # remove file extension
if (exists $numUtterancesInFile{$fileID}) {
# Some portion of this file has training transcriptions
$duration = `$soxi -D $filename`;
if ($duration <=0) {
# Unable to extract a valid duration via soxi
print STDERR ("Unable to extract duration: skipping file $filename");
} else {
if (exists $waveformName{$fileID} ) {
print STDERR ("$0 ERROR: duplicate fileID \"$fileID\" for files \"$filename\" and \"" . $waveformName{$fileID} ."\"\n");
exit(1);
}
$waveformName{$fileID} = $filename; chomp $waveformName{$fileID};
$duration{$fileID} = $duration;
$numFiles++;
}
} else {
# Could be due to text filtering resulting in an empty transcription
# Output information to STDOUT to enable > /dev/null
print STDOUT ("$0: No transcriptions for audio file ${fileID}.sph\n");
}
}
print STDERR ("$0: Recorded durations from headers of $numFiles .sph files\n");
} else {
print STDERR ("$0 NOTICE: No .wav files in $AudioDir\n");
}
if (scalar(keys %waveformName) == 0) {
print STDERR ("$0 ERROR: No audio files found!\n");
exit(1);
}
} else {
print STDERR ("$0 ERROR: No directory named $AudioDir\n");
exit(1);
}
########################################################################
# Now all the needed information is available. Write out the 4 files.
########################################################################
unless (-d $outDir) {
print STDERR ("$0: Creating output directory $outDir\n");
die "Failed to create output directory" if (`mkdir -p $outDir`); # i.e. if the exit status is not zero.
}
print STDERR ("$0: Writing 5 output files to $outDir\n");
$textFileName = "$outDir/text";
open (TEXT, "> $textFileName") || die "$0 ERROR: Unable to write text file $textFileName\n";
$utt2spkFileName = "$outDir/utt2spk";
open (UTT2SPK, "> $utt2spkFileName") || die "$0 ERROR: Unable to write utt2spk file $utt2spkFileName\n";
$segmentsFileName = "$outDir/segments";
open (SEGMENTS, "> $segmentsFileName") || die "$0 ERROR: Unable to write segments file $segmentsFileName\n";
$scpFileName = "$outDir/wav.scp";
open (SCP, "| sort -u > $scpFileName") || die "$0 ERROR: Unable to write wav.scp file $scpFileName\n";
my $binary=$ENV{SPH2PIPE};
$SPHBINARY ="$binary -f wav -p -c 1";
my $SOXBINARY =`which sox` or die "Could not find the sox command"; chomp $SOXBINARY;
$SOXFLAGS ="-r 8000 -c 1 -b 16 -t wav - downsample";
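# A resulting wav.scp entry therefore looks like (path hypothetical):
#   BABEL_BP_101_11694_20111204_205320_inLine /path/to/sph2pipe -f wav -p -c 1 /path/to/audio.sph |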
$spk2uttFileName = "$outDir/spk2utt";
open (SPK2UTT, "> $spk2uttFileName") || die "$0 ERROR: Unable to write spk2utt file $spk2uttFileName\n";
$oovFileName = "$outDir/oovCounts";
open (OOV, "| sort -nrk2 > $oovFileName") || die "$0 ERROR: Unable to write oov file $oovFileName\n";
$numUtterances = $numSpeakers = $numWaveforms = 0;
$totalSpeech = $totalSpeechSq = 0.0;
foreach $utteranceID (sort keys %transcription) {
$fileID = $baseFileID{$utteranceID};
if (exists $waveformName{$fileID}) {
# There are matching transcriptions and audio
$numUtterances++;
$totalSpeech += ($endTime{$utteranceID} - $startTime{$utteranceID});
$totalSpeechSq += (($endTime{$utteranceID} - $startTime{$utteranceID})
*($endTime{$utteranceID} - $startTime{$utteranceID}));
print TEXT ("$utteranceID $transcription{$utteranceID}\n");
print UTT2SPK ("$utteranceID $speakerID{$utteranceID}\n");
print SEGMENTS ("$utteranceID $fileID $startTime{$utteranceID} $endTime{$utteranceID}\n");
if (exists $uttList{$speakerID{$utteranceID}}) {
$uttList{$speakerID{$utteranceID}} .= " $utteranceID";
} else {
$numSpeakers++;
$uttList{$speakerID{$utteranceID}} = "$utteranceID";
}
next if (exists $scpEntry{$fileID});
$numWaveforms++;
if ($waveformName{$fileID} =~ /.*\.sph/ ) {
$scpEntry{$fileID} = "$SPHBINARY $waveformName{$fileID} |";
} else {
$scpEntry{$fileID} = "$SOXBINARY $waveformName{$fileID} $SOXFLAGS |";
}
} else {
print STDERR ("$0 WARNING: No audio file for transcription $utteranceID\n");
}
}
foreach $fileID (sort keys %scpEntry) {
print SCP ("$fileID $scpEntry{$fileID}\n");
}
foreach $speakerID (sort keys %uttList) {
print SPK2UTT ("$speakerID $uttList{$speakerID}\n");
}
foreach $w (sort keys %oovCount) {
print OOV ("$w\t$oovCount{$w}\n");
}
exit(1) unless (close(TEXT) && close(UTT2SPK) && close(SEGMENTS) && close(SCP) && close(SPK2UTT) && close(OOV));
print STDERR ("$0: Summary\n");
print STDERR ("\tWrote $numUtterances lines each to text, utt2spk and segments\n");
print STDERR ("\tWrote $numWaveforms lines to wav.scp\n");
print STDERR ("\tWrote $numSpeakers lines to spk2utt\n");
print STDERR ("\tHmmm ... $numSpeakers distinct speakers in this corpus? Unusual!\n")
if (($numSpeakers<($numUtterances/500.0)) || ($numSpeakers>($numUtterances/2.0)));
print STDERR ("\tTotal # words = $numWords (including $numOOV OOVs) + $numSilence $silence\n")
if ($vocabFile);
printf STDERR ("\tAmount of speech = %.2f hours (including some due to $silence)\n", $totalSpeech/3600.0);
if ($numUtterances>0) {
printf STDERR ("\tAverage utterance length = %.2f sec +/- %.2f sec, and %.2f words\n",
$totalSpeech /= $numUtterances,
sqrt(($totalSpeechSq/$numUtterances)-($totalSpeech*$totalSpeech)),
$numWords/$numUtterances);
}
exit(0);
########################################################################
# Done!
########################################################################
#!/bin/bash
# Copyright 2018 Johns Hopkins University (Matthew Wiesner)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# This is not necessarily the top-level run.sh as it is in other directories; see README.txt first.
. ./conf/lang.conf
. ./path.sh
. ./cmd.sh
sph2pipe_version="v2.5"
if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then
echo "Download sph2pipe_${sph2pipe_version} ......"
wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \
wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \
tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools
cd tools/sph2pipe_${sph2pipe_version}/ && \
gcc -o sph2pipe *.c -lm
cd -
fi
sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe
[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
export SPH2PIPE=$sph2pipe
sox=`which sox`
[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1;
FLP=false
. ./utils/parse_options.sh
if [ $# -ne 1 ]; then
echo >&2 "Usage: ./local/prepare_data.sh [opts] <lang_id>"
echo >&2 " --FLP : Use FLP training data (instead of LLP ~10h)"
exit 1
fi
l=$1
l_suffix=${l}
if $FLP; then
l_suffix=${l_suffix}_FLP
fi
#Preparing train directories
if [ ! -f data/raw_train_data/.done ]; then
echo ---------------------------------------------------------------------
echo "Subsetting the TRAIN set"
echo ---------------------------------------------------------------------
train_data_dir=train_data_dir_${l_suffix}
train_data_list=train_data_list_${l_suffix}
local/make_corpus_subset.sh "${!train_data_dir}" "${!train_data_list}" ./data/raw_train_data
train_data_dir=`utils/make_absolute.sh ./data/raw_train_data`
touch data/raw_train_data/.done
fi
#exit 0
#Preparing dev10 directories
if [ ! -f data/raw_dev10h_data/.done ]; then
echo ---------------------------------------------------------------------
echo "Subsetting the Dev set"
echo ---------------------------------------------------------------------
dev10h_data_dir=dev10h_data_dir_${l}
dev10h_data_list=dev10h_data_list_${l}
local/make_corpus_subset.sh "${!dev10h_data_dir}" "${!dev10h_data_list}" ./data/raw_dev10h_data
dev10h_data_dir=`utils/make_absolute.sh ./data/raw_dev10h_data`
touch data/raw_dev10h_data/.done
fi
dev10h_data_dir=`utils/make_absolute.sh ./data/raw_dev10h_data`
train_data_dir=`utils/make_absolute.sh ./data/raw_train_data`
lexicon_file=lexicon_file_${l_suffix}
if [[ ! -f data/train/wav.scp || data/train/wav.scp -ot "$train_data_dir" ]]; then
echo ---------------------------------------------------------------------
echo "Preparing acoustic training lists in data/train on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/train.tmp
local/prepare_acoustic_training_data.pl \
--fragmentMarkers \-\*\~ \
$train_data_dir data/train.tmp > data/train.tmp/skipped_utts.log
fi
if [[ ! -f data/dev10h.pem/wav.scp || data/dev10h.pem/wav.scp -ot "$dev10h_data_dir" ]]; then
echo ---------------------------------------------------------------------
echo "Preparing acoustic training lists in data/train on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/dev10h.pem
local/prepare_acoustic_training_data.pl \
--fragmentMarkers \-\*\~ \
$dev10h_data_dir data/dev10h.pem > data/dev10h.pem/skipped_utts.log
fi
###########################################################################
# Prepend language ID to all utterances to disambiguate between speakers
# of different languages sharing the same speaker id.
#
# The individual lang directories can be used for alignments, while a
# combined directory will be used for training. This probably has minimal
# impact on performance as only words repeated across languages will pose
# problems and even amongst these, the main concern is the <hes> marker.
###########################################################################
num_utts=$(cat data/train.tmp/segments | wc -l)
dev_utts=$((num_utts / 10))
./utils/subset_data_dir.sh data/train.tmp ${dev_utts} data/train_dev
awk '{print $1}' data/train_dev/utt2spk > data/train_dev.list
awk '{print $1}' data/train.tmp/utt2spk | grep -vf data/train_dev.list > data/train.list
./utils/subset_data_dir.sh --utt-list data/train.list data/train.tmp data/train
echo "Prepend ${l} to data dir"
./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \
data/train data/train_${l}
./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \
data/train_dev data/dev_${l}
./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \
data/dev10h.pem data/eval_${l}
#!/bin/bash
# Copyright 2018 Johns Hopkins University (Matthew Wiesner)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
. ./path.sh
. ./cmd.sh
. ./conf/lang.conf
#langs="101 102 103 104 105 106 202 203 204 205 206 207 301 302 303 304 305 306 401 402 403"
langs="101"
recog="101"
FLP=false
garbage_utterance_tags="<silence> <v-noise> <noise> <unk> <hes>"
. ./tools/parse_options.sh
set -e
set -o pipefail
all_langs=""
for l in `cat <(echo ${langs}) <(echo ${recog}) | tr " " "\n" | sort -u`; do
all_langs="${l} ${all_langs}"
done
all_langs=${all_langs%% }
# Save top-level directory
cwd=$(local/make_absolute.sh `pwd`)
echo "Stage 0: Setup Language Specific Directories"
echo "cwd"
echo $cwd
echo " --------------------------------------------"
echo "Languagues: ${all_langs}"
# Basic directory prep
for l in ${all_langs}; do
[ -d data/${l} ] || mkdir -p data/${l}
cd data/${l}
ln -sf ${cwd}/local .
for f in ${cwd}/{tools,conf}; do
link=`make_absolute.sh $f`
ln -sf $link .
done
cp ${cwd}/cmd.sh .
cp ${cwd}/path.sh .
sed -i 's/\.\.\/\.\.\/\.\./\.\.\/\.\.\/\.\.\/\.\.\/\.\./g' path.sh
cd ${cwd}
done
# Prepare language specific data
for l in ${all_langs}; do
(
cd data/${l}
./local/prepare_data.sh --FLP ${FLP} ${l}
cd ${cwd}
) &
done
wait
# Combine all language specific training directories and generate a single
# lang directory by combining all language specific dictionaries
train_dirs=""
dev_dirs=""
eval_dirs=""
for l in ${langs}; do
train_dirs="data/${l}/data/train_${l} ${train_dirs}"
done
for l in ${recog}; do
dev_dirs="data/${l}/data/dev_${l} ${dev_dirs}"
done
./tools/combine_data.sh data/train ${train_dirs}
./tools/combine_data.sh data/dev ${dev_dirs}
for l in ${recog}; do
ln -s ${cwd}/data/${l}/data/eval_${l} ${cwd}/data/eval_${l}
done
# Delete utterances with garbage meta tags
for tag in $garbage_utterance_tags; do
sed -i "s/${tag}//g" data/train/text
sed -i "s/${tag}//g" data/dev/text
sed -i "s/${tag}//g" data/eval_${l}/text
done
sed -i "/_.*[0-9][ ]*$/d" data/train/text
sed -i "/_.*[0-9][ ]*$/d" data/dev/text
sed -i "/_.*[0-9][ ]*$/d" data/eval_${l}/text
sed -i 's/[ ][ ]*/ /g' data/train/text
sed -i 's/[ ][ ]*/ /g' data/dev/text
sed -i 's/[ ][ ]*/ /g' data/eval_${l}/text
./tools/fix_data_dir.sh data/train
./tools/fix_data_dir.sh data/dev
./tools/fix_data_dir.sh data/eval_${l}
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 Tencent Inc. (Author: Kai Tang).
# Apache 2.0
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# data
data=data
data_url=www.openslr.org/resources/33
nj=4
# langid: 101 Cantonese, 302 Kazakh, 401 Mongolian
langs="101"
recog="101"
token_type=char
# bpemode (unigram or bpe)
nbpe=4500
bpemode=unigram
# data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
# `shard` is used for large dataset which is over 1k hours, and `shard` is
# faster on reading data and training.
data_type=raw
num_utts_per_shard=1000
if [ "${token_type}" = bpe ]; then
dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe}
elif [ "${token_type}" = char ]; then
dict=data/lang_char/lang_char.txt
bpe_model=
else
echo "Error: not supported token_type"
exit 0
fi
train_set=train_sp
train_dev=dev
recog_set=eval_$recog
# pretrained w2v-conformer encoder
enc_init=pretrain/conformer.pt
# Re-init the last pretrained encoder layer: https://arxiv.org/pdf/2107.04734.pdf
enc_init_mods='encoder.encoders.0,encoder.encoders.1,encoder.encoders.2,encoder.encoders.3,encoder.encoders.4,encoder.encoders.5,encoder.encoders.6,encoder.encoders.7,encoder.encoders.8,encoder.encoders.9,encoder.encoders.10,encoder.encoders.11,encoder.encoders.12,encoder.encoders.13,encoder.encoders.14,encoder.encoders.15,encoder.encoders.16,encoder.encoders.17,encoder.encoders.18,encoder.encoders.19,encoder.encoders.20,encoder.encoders.21,encoder.encoders.22,encoder.embed'
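# Note: with num_blocks 24, this list covers encoder.encoders.0-22 plus the input
# embedding, so the last block (encoder.encoders.23) is randomly re-initialized.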
train_config=conf/train_conformer_large_10h.yaml
checkpoint=
cmvn=false
dir=exp/${langs}_finetune_10h
# Using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=35
. utils/parse_options.sh || exit 1;
#Babel style data preparation
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Setting up individual languages"
./local/setup_languages.sh --langs "${langs}" --recog "${recog}"
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# Data preparation
for x in ${train_set} ${train_dev} ${recog_set}; do
# Remove the space in text
if [ "${token_type}" = char ]; then
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
> data/${x}/text
rm data/${x}/text.org
fi
done
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
if [ "${token_type}" = bpe ]; then
# We borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " data/${train_set}/text | sort > data/lang_char/input.txt
tools/spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
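# e.g. spm_encode splits "HELLO WORLD" into pieces such as "▁HELLO ▁WOR LD"
# (the actual segmentation depends on the learned unigram model)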
elif [ "${token_type}" = char ]; then
tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
fi
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 1: format scp "
#dumps such pipe-style-wav to real audio file
for x in ${train_set} ${train_dev} ${recog_set}; do
cp data/${x}/wav.scp data/${x}/wav.scp.org
bash local/dump_wav.sh --nj 26 data/$x/wav.scp.org data/$x/segments data/$x/wav.scp
rm data/$x/wav.scp.org
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "Prepare data, prepare required format"
# For wav feature, just copy the data. mfcc/fbank extraction is done in training
for x in ${train_set} ${train_dev} ${recog_set}; do
if [ $data_type == "shard" ]; then
tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 data/$x/wav.scp data/$x/text \
$(realpath data/$x/shards) data/$x/data.list
else
tools/make_raw_list.py data/$x/wav.scp data/$x/text \
data/$x/data.list
fi
done
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
${bpemodel:+--bpe_model ${bpemodel}.model} \
--train_data data/$train_set/data.list \
--cv_data data/$train_dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
${enc_init:+--enc_init $enc_init} \
--enc_init_mods $enc_init_mods \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $num_gpus \
--ddp.rank $i \
--ddp.dist_backend $dist_backend \
--num_workers 6 \
$cmvn_opts
} &
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Test model, please specify the model you want to test by --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
for mode in ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring; do
for rtask in ${recog_set}; do
{
test_dir=$dir/test_${rtask}_${mode}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/$rtask/data.list \
--checkpoint $decode_checkpoint \
--beam_size 5 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
${bpemodel:+--bpe_model ${bpemodel}.model} \
--ctc_weight $ctc_weight \
--result_file $test_dir/text_ori \
$cmvn_opts \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
if [ "${token_type}" = bpe ]; then
tools/spm_decode --model=${bpemodel}.model --input_format=piece < $test_dir/text_ori | sed -e "s/▁/ /g" > $test_dir/text
python tools/compute-wer.py --char=0 --v=1 \
data/$rtask/text $test_dir/text > $test_dir/wer
elif [ "${token_type}" = char ]; then
python tools/compute-wer.py --char=1 --v=1 \
data/$rtask/text $test_dir/text_ori > $test_dir/wer
fi
} &
done
done
wait
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
../../../tools
../../../wenet/
# Performance Record
## Conformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.001, warmup_steps 25000, batch size 16, 1 gpu, acc_grad 4, 240 epochs
* Decoding info: average_num 10
| decoding mode | eval2000 (wer) |
|:----------------------:|:----------------:|
| ctc_greedy_search | 32.39% |
| ctc_prefix_beam_search | 32.39% |
| attention | 31.28% |
| attention_rescoring | 31.36% |