Commit 764b3a75 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
#!/usr/bin/env bash
#
# Copyright 2014 Nickolay V. Shmyrev
# 2014 Brno University of Technology (Author: Karel Vesely)
# 2016 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from one directory above this script.
#
# Builds Kaldi-style data directories (stm, text, segments, utt2spk, spk2utt,
# wav.scp, reco2file_and_channel, glm) for the TEDLIUM release-3 dev/test/train
# sets found under db/TEDLIUM_release-3/<data_type>/.
# Arguments: $1 - corpus layout directory name (e.g. "legacy").
. ./path.sh
export LC_ALL=C
# NOTE(review): sph2pipe is assigned here but never referenced below —
# confirm whether this assignment is still needed.
sph2pipe=sph2pipe
data_type=$1
# Prepare: test, train,
for set in dev test train; do
  dir=data/$set.orig
  mkdir -p $dir
  # Merge transcripts into a single 'stm' file, do some mappings:
  # - <F0_M> -> <o,f0,male> : map dev stm labels to be coherent with train + test,
  # - <F0_F> -> <o,f0,female> : --||--
  # - (2) -> null : remove pronunciation variants in transcripts, keep in dictionary
  # - <sil> -> null : remove marked <sil>, it is modelled implicitly (in kaldi)
  # - (...) -> null : remove utterance names from end-lines of train
  # - it 's -> it's : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py)
  { # Add STM header, so sclite can prepare the '.lur' file
    echo ';;
;; LABEL "o" "Overall" "Overall results"
;; LABEL "f0" "f0" "Wideband channel"
;; LABEL "f2" "f2" "Telephone channel"
;; LABEL "male" "Male" "Male Talkers"
;; LABEL "female" "Female" "Female Talkers"
;;'
    # Process the STMs: sort by recording/channel/start-time, apply the
    # mappings listed above, then force the channel field ($2) to "A".
    cat db/TEDLIUM_release-3/${data_type}/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \
      sed -e 's:<F0_M>:<o,f0,male>:' \
        -e 's:<F0_F>:<o,f0,female>:' \
        -e 's:([0-9])::g' \
        -e 's:<sil>::g' \
        -e 's:([^ ]*)$::' | \
      awk '{ $2 = "A"; print $0; }'
  } | local/join_suffix.py > data/$set.orig/stm
  # Prepare 'text' file
  # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary
  # Utterance ids become "<rec>-<start>-<end>" with times in 10ms frames.
  cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \
    awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100);
for (i=7;i<=NF;i++) { printf(" %s", $i); }
printf("\n");
}' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1
  # Prepare 'segments', 'utt2spk', 'spk2utt'
  cat $dir/text | cut -d" " -f 1 | awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}' > $dir/segments
  cat $dir/segments | awk '{print $1, $2}' > $dir/utt2spk
  cat $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt
  # Prepare 'wav.scp', 'reco2file_and_channel'
  cat $dir/spk2utt | awk -v data_type=$data_type -v set=$set -v pwd=$PWD '{ printf("%s %s/db/TEDLIUM_release-3/%s/%s/sph/%s.sph\n", $1, pwd, data_type, set, $1); }' > $dir/wav.scp
  cat $dir/wav.scp | awk '{ print $1, $1, "A"; }' > $dir/reco2file_and_channel
  # Create empty 'glm' file
  echo ';; empty.glm
[FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token
' > data/$set.orig/glm
  # The training set seems to not have enough silence padding in the segmentations,
  # especially at the beginning of segments. Extend the times.
  if [ $set == "train" ]; then
    mv data/$set.orig/segments data/$set.orig/segments.temp
    utils/data/extend_segment_times.py --start-padding=0.15 \
      --end-padding=0.1 <data/$set.orig/segments.temp >data/$set.orig/segments || exit 1
    rm data/$set.orig/segments.temp
  fi
  # Check that data dirs are okay!
  utils/validate_data_dir.sh --no-feats $dir || exit 1
done
# Environment setup: put the wenet runtime build binaries, its bundled kaldi
# tools, and the locally-built openfst on PATH.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Top-level recipe: data prep -> dict/BPE -> raw lists -> DDP training ->
# decoding -> model export.  Stages are selected via --stage/--stop_stage
# (parsed by tools/parse_options.sh below).
. ./path.sh || exit 1;
# Use this to control how many gpus you use. It's 1-gpu training if you
# specify just 1 gpu, otherwise it is multi-gpu training based on DDP in
# PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want run multi-machine training
num_nodes=1
# The rank of each node or machine, range from 0 to num_nodes -1
# The first node/machine sets node_rank 0, the second one sets node_rank 1
# the third one set node_rank 2, and so on. Default 0
node_rank=0
nj=16 # number of parallel jobs (not used further below in this script)
feat_dir=raw_wav # staging dir for per-set wav/text/data.list files
data_type=raw # raw or shard
num_utts_per_shard=1000
data_cat=legacy # corpus layout passed to local/prepare_data.sh
train_set=train
train_config=conf/train_conformer.yaml
cmvn=true # whether to compute/apply global CMVN
dir=exp/conformer # experiment/output directory
checkpoint= # optional checkpoint to resume training from
# bpemode (unigram or bpe)
nbpe=500
bpemode=unigram
# use average_checkpoint will get better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=10
decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
. tools/parse_options.sh || exit 1;
# Stage -1: optional corpus download.
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  echo "stage -1: Data Download"
  local/download_data.sh # make soft link by yourself if you already have the dataset
fi
# Stage 0: Kaldi-style data dirs, then cap each pseudo-speaker at 180s.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # Data preparation
  local/prepare_data.sh $data_cat
  for dset in dev test train; do
    utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 \
      data/${dset}.orig data/${dset}
  done
fi
# Stage 1: stage data under $feat_dir and compute global CMVN stats.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  # For wav feature, just copy the data. Fbank extraction is done in training
  mkdir -p $feat_dir
  for x in ${train_set} dev test; do
    cp -r data/$x $feat_dir
  done
  tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
    --in_scp data/${train_set}/wav.scp \
    --out_cmvn $feat_dir/$train_set/global_cmvn
fi
dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
# Stage 2: train a sentencepiece model and build the unit dictionary.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  ### Task dependent. You have to check non-linguistic symbols used in the corpus.
  echo "stage 2: Dictionary and Json Data Preparation"
  mkdir -p data/lang_char/
  echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
  echo "<unk> 1" >> ${dict} # <unk> must be 1
  # we borrowed these code and scripts which are related bpe from ESPnet.
  cut -f 2- -d" " data/${train_set}/text > data/lang_char/input.txt
  tools/spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} \
    --model_type=${bpemode} --model_prefix=${bpemodel} \
    --input_sentence_size=100000000
  # BPE pieces get ids 2..N+1, after <blank>=0 and <unk>=1.
  tools/spm_encode --model=${bpemodel}.model \
    --output_format=piece < data/lang_char/input.txt | \
    tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
  num_token=$(cat $dict | wc -l)
  echo "<sos/eos> $num_token" >> $dict # <eos>
  wc -l ${dict}
fi
# Stage 3: build wenet "raw" data lists (requires a segments file).
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare data, prepare required format"
  if [ ! -f $feat_dir/$train_set/segments ]; then
    echo "$0: No such file segments" && exit 1;
  else
    for x in dev test ${train_set}; do
      tools/make_raw_list.py --segments $feat_dir/$x/segments \
        $feat_dir/$x/wav.scp $feat_dir/$x/text $feat_dir/$x/data.list
    done
  fi
fi
# Stage 4: DDP training — one background process per visible GPU.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  # Training
  mkdir -p $dir
  INIT_FILE=$dir/ddp_init
  # You had better rm it manually before you start run.sh on first node.
  # rm -f $INIT_FILE # delete old one before starting
  init_method=file://$(readlink -f $INIT_FILE)
  echo "$0: init method is $init_method"
  # The number of gpus running on each node/machine.
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  # Use "nccl" if it works, otherwise use "gloo"
  dist_backend="nccl"
  # The total number of processes/gpus, so that the master knows
  # how many workers to wait for.
  # More details about ddp can be found in
  # https://pytorch.org/tutorials/intermediate/dist_tuto.html
  # BUGFIX/modernization: use POSIX $(( )) arithmetic instead of the
  # deprecated `expr` and $[ ] forms.
  world_size=$((num_gpus * num_nodes))
  echo "total gpus is: $world_size"
  cmvn_opts=
  $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir
  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
  # train.py will write $train_config to $dir/train.yaml with model input
  # and output dimension, train.yaml will be used for inference or model
  # export later
  for ((i = 0; i < num_gpus; ++i)); do
    {
      # Pick the i-th entry of CUDA_VISIBLE_DEVICES (cut fields are 1-based).
      gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
      # Rank of each gpu/process used for knowing whether it is
      # the master of a worker.
      rank=$((node_rank * num_gpus + i))
      python wenet/bin/train.py --gpu $gpu_id \
        --config $train_config \
        --data_type $data_type \
        --symbol_table $dict \
        --bpe_model $bpemodel.model \
        --train_data $feat_dir/$train_set/data.list \
        --cv_data $feat_dir/dev/data.list \
        ${checkpoint:+--checkpoint $checkpoint} \
        --model_dir $dir \
        --ddp.init_method $init_method \
        --ddp.world_size $world_size \
        --ddp.rank $rank \
        --ddp.dist_backend $dist_backend \
        --num_workers 8 \
        $cmvn_opts \
        --pin_memory
    } &
  done
  wait
fi
# Stage 5: optionally average checkpoints, then decode the test set with
# every mode in $decode_modes (one background job per mode) and score WER.
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  # Test model, please specify the model you want to test by --checkpoint
  if [ ${average_checkpoint} == true ]; then
    decode_checkpoint=$dir/avg_${average_num}.pt
    echo "do model average and final checkpoint is $decode_checkpoint"
    python wenet/bin/average_model.py \
      --dst_model $decode_checkpoint \
      --src_path $dir \
      --num ${average_num} \
      --val_best
  fi
  # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
  # -1 for full chunk
  decoding_chunk_size=
  ctc_weight=0.5
  reverse_weight=0.0
  for mode in ${decode_modes}; do
    {
      test_dir=$dir/test_${mode}
      mkdir -p $test_dir
      python wenet/bin/recognize.py --gpu 0 \
        --mode $mode \
        --config $dir/train.yaml \
        --data_type $data_type \
        --test_data $feat_dir/test/data.list \
        --checkpoint $decode_checkpoint \
        --beam_size 10 \
        --batch_size 1 \
        --penalty 0.0 \
        --dict $dict \
        --bpe_model $bpemodel.model \
        --ctc_weight $ctc_weight \
        --reverse_weight $reverse_weight \
        --result_file $test_dir/text \
        ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
      python tools/compute-wer.py --char=1 --v=1 \
        $feat_dir/test/text $test_dir/text > $test_dir/wer
    } &
  done
  wait
fi
# Stage 6: export the (averaged) model as TorchScript for the runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  # Export the best model you want
  python wenet/bin/export_jit.py \
    --config $dir/train.yaml \
    --checkpoint $dir/avg_${average_num}.pt \
    --output_file $dir/final.zip
fi
../../../tools/
\ No newline at end of file
../../../wenet/
\ No newline at end of file
# Performance Record
## Conformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.002, warmup_steps 5000, batch size 16, 1 gpu, acc_grad 4, 120 epochs
* Decoding info: average_num 20
* trans_type: phn
| decoding mode | test (wer) |
| :--------------------: | :---------: |
| ctc_greedy_search | 16.70% |
| ctc_prefix_beam_search | 16.60% |
| attention | 22.37% |
| attention_rescoring | 16.60% |
## transformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.002, warmup_steps 5000, batch size 16, 1 gpu, acc_grad 4, 120 epochs
* Decoding info: average_num 20
* trans_type: phn
| decoding mode | test (wer) |
| :--------------------: | :---------: |
| ctc_greedy_search | 17.78% |
| ctc_prefix_beam_search | 17.46% |
| attention | 21.77% |
| attention_rescoring | 17.06% |
\ No newline at end of file
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
split_with_space: true
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 4 # gradient accumulation steps
max_epoch: 120
log_interval: 10
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000 # 20000
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.2
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.2
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
split_with_space: true
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 4
max_epoch: 120
log_interval: 10
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000
faks0
fdac1
fjem0
mgwt0
mjar0
mmdb1
mmdm2
mpdf0
fcmh0
fkms0
mbdg0
mbwm0
mcsh0
fadg0
fdms0
fedw0
mgjf0
mglb0
mrtk0
mtaa0
mtdt0
mthc0
mwjg0
fnmr0
frew0
fsem0
mbns0
mmjr0
mdls0
mdlf0
mdvc0
mers0
fmah0
fdrw0
mrcs0
mrjm4
fcal1
mmwh0
fjsj0
majc0
mjsw0
mreb0
fgjd0
fjmg0
mroa0
mteb0
mjfc0
mrjr0
fmml0
mrws1
aa aa aa
ae ae ae
ah ah ah
ao ao aa
aw aw aw
ax ax ah
ax-h ax ah
axr er er
ay ay ay
b b b
bcl vcl sil
ch ch ch
d d d
dcl vcl sil
dh dh dh
dx dx dx
eh eh eh
el el l
em m m
en en n
eng ng ng
epi epi sil
er er er
ey ey ey
f f f
g g g
gcl vcl sil
h# sil sil
hh hh hh
hv hh hh
ih ih ih
ix ix ih
iy iy iy
jh jh jh
k k k
kcl cl sil
l l l
m m m
n n n
ng ng ng
nx n n
ow ow ow
oy oy oy
p p p
pau sil sil
pcl cl sil
q
r r r
s s s
sh sh sh
t t t
tcl cl sil
th th th
uh uh uh
uw uw uw
ux uw uw
v v v
w w w
y y y
z z z
zh zh sh
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
def sph2pipe_wav(in_wav, tmp_out_wav, out_wav):
    """Rewrite a Kaldi wav.scp whose entries are sph2pipe pipe commands.

    Each input line is expected to look like (fields space-separated):
        <utt-id> <sph2pipe> -f wav <path/to/file.sph> [|]
    For every line two outputs are produced:
      * ``tmp_out_wav``: fields 2-5 of the input plus the destination path,
        i.e. a runnable "<sph2pipe> -f wav <in.sph> <out>" command;
      * ``out_wav``: "<utt-id> <out>" — the rewritten wav.scp entry.
    The destination path mirrors the source path with the 4th-from-last
    directory component suffixed with '_pipe'; missing directories are
    created on demand.
    """
    with open(in_wav, 'r', encoding='utf-8') as in_f, \
            open(tmp_out_wav, 'w', encoding='utf-8') as tmp_out_f, \
            open(out_wav, 'w', encoding='utf-8') as out_f:
        for line in in_f:
            # Robustness: skip blank lines (previously an IndexError) and
            # tolerate runs of whitespace via split() with no argument.
            parts = line.strip().split()
            if not parts:
                continue
            src_path = parts[4]  # sph file path is the 5th field
            comps = src_path.split('/')
            comps[-4] = comps[-4] + '_pipe'
            # exist_ok avoids the racy os.path.exists/makedirs pair.
            os.makedirs('/'.join(comps[:-1]), exist_ok=True)
            dst_path = '/'.join(comps)
            tmp_out_f.write(' '.join(parts[1:5]) + ' ' + dst_path + '\n')
            out_f.write(parts[0] + ' ' + dst_path + '\n')
if __name__ == '__main__':
    # Usage: sph2pipe_process.py <in_wav_scp> <tmp_out_wav_scp> <out_wav_scp>
    if len(sys.argv) != 4:
        print('wrong input parameter')
        raise NotImplementedError(len(sys.argv))
    in_wav = sys.argv[1]       # original wav.scp containing sph2pipe commands
    tmp_out_wav = sys.argv[2]  # output: one conversion command per line
    out_wav = sys.argv[3]      # output: rewritten wav.scp (utt-id -> wav path)
    sph2pipe_wav(in_wav, tmp_out_wav, out_wav)
mdab0
mwbt0
felc0
mtas1
mwew0
fpas0
mjmp0
mlnt0
fpkt0
mlll0
mtls0
fjlm0
mbpm0
mklt0
fnlp0
mcmj0
mjdh0
fmgd0
mgrt0
mnjm0
fdhc0
mjln0
mpam0
fmld0
#!/usr/bin/env bash
# Copyright 2013 (Authors: Bagher BabaAli, Daniel Povey, Arnab Ghoshal)
# 2014 Brno University of Technology (Author: Karel Vesely)
# 2019 IIIT-Bangalore (Shreekantha Nadig)
# Apache 2.0.
#
# TIMIT data preparation.
# Usage: $0 <TIMIT-corpus-dir> [phn|char]
# Writes per-set scp/transcript/speaker files into data/local/data.
create_glm_stm=false # set true to also produce sclite STM/GLM files
if [ $# -le 0 ]; then
  echo "Argument should be the Timit directory, see ../run.sh for example."
  exit 1;
fi
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
conf=`pwd`/conf
# Optional second argument selects the transcript type: "phn" (default)
# or "char".  Any other value is a usage error.
if [ "$2" ]; then
  if [[ $2 = "char" || $2 = "phn" ]]; then
    trans_type=$2
  else
    echo "Transcript type must be one of [phn, char]" >&2
    echo "$2" >&2
    # BUGFIX: previously this fell through with trans_type unset and the
    # script only failed (with exit 0!) much later inside the main loop.
    exit 1
  fi
else
  trans_type=phn
fi
. ./path.sh
# Fetch and build sph2pipe locally when it is not already present under
# tools/ (primary mirror openslr, fallback sourceforge).
sph2pipe_version="v2.5"
if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then
  echo "Download sph2pipe_${sph2pipe_version} ......"
  wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \
  wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \
  tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools
  cd tools/sph2pipe_${sph2pipe_version}/ && \
  gcc -o sph2pipe *.c -lm
  cd -
fi
# Prefer an installed sph2pipe; fall back to the locally built binary.
sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe
if ! command -v "${sph2pipe}" &> /dev/null; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
  exit 1;
fi
# NOTE(review): error_exit is not defined anywhere in this script — if a
# list is missing, bash reports "command not found" and keeps going.
# Confirm where error_exit is supposed to be sourced from.
[ -f $local/test_spk.list ] || error_exit "$PROG: Eval-set speaker list not found.";
[ -f $local/dev_spk.list ] || error_exit "$PROG: dev-set speaker list not found.";
# First check if the train & test directories exist (these can either be upper-
# or lower-cased
if [ ! -d $1/TRAIN -o ! -d $1/TEST ] && [ ! -d $1/train -o ! -d $1/test ]; then
  echo "timit_data_prep.sh: Spot check of command line argument failed"
  echo "Command line argument must be absolute pathname to TIMIT directory"
  echo "with name like /export/corpora5/LDC/LDC93S1/timit/TIMIT"
  exit 1;
fi
# Now check what case the directory structure is
uppercased=false
train_dir=train
test_dir=test
if [ -d $1/TRAIN ]; then
  uppercased=true
  train_dir=TRAIN
  test_dir=TEST
fi
# Scratch dir, removed automatically on exit.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT
# Get the list of speakers. The list of speakers in the 24-speaker core test
# set and the 50-speaker development set must be supplied to the script. All
# speakers in the 'train' directory are used for training.
# Speaker lists are case-folded to match the corpus directory case.
if $uppercased; then
  tr '[:lower:]' '[:upper:]' < $local/dev_spk.list > $tmpdir/dev_spk
  tr '[:lower:]' '[:upper:]' < $local/test_spk.list > $tmpdir/test_spk
  ls -d "$1"/TRAIN/DR*/* | sed -e "s:^.*/::" > $tmpdir/train_spk
else
  tr '[:upper:]' '[:lower:]' < $local/dev_spk.list > $tmpdir/dev_spk
  tr '[:upper:]' '[:lower:]' < $local/test_spk.list > $tmpdir/test_spk
  ls -d "$1"/train/dr*/* | sed -e "s:^.*/::" > $tmpdir/train_spk
fi
cd $dir
# Main loop: for each set build file lists, transcripts, wav.scp, speaker
# maps and (optionally) sclite STM/GLM files.
for x in train dev test; do
  # First, find the list of audio files (use only si & sx utterances).
  # Note: train & test sets are under different directories, but doing find on
  # both and grepping for the speakers will work correctly.
  find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.WAV' \
    | grep -f $tmpdir/${x}_spk > ${x}_sph.flist
  sed -e 's:.*/\(.*\)/\(.*\).WAV$:\1_\2:i' ${x}_sph.flist \
    > $tmpdir/${x}_sph.uttids
  paste $tmpdir/${x}_sph.uttids ${x}_sph.flist \
    | sort -k1,1 > ${x}_sph.scp
  cat ${x}_sph.scp | awk '{print $1}' > ${x}.uttids
  # Now, Convert the transcripts into our format (no normalization yet)
  # Get the transcripts: each line of the output contains an utterance
  # ID followed by the transcript.
  if [ $trans_type = "phn" ]
  then
    echo "phone transcript!"
    find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.PHN' \
      | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_phn.flist
    sed -e 's:.*/\(.*\)/\(.*\).PHN$:\1_\2:i' $tmpdir/${x}_phn.flist \
      > $tmpdir/${x}_phn.uttids
    # NOTE(review): error_exit is not defined in this script; a missing
    # transcription file would print "command not found" and continue.
    while read line; do
      [ -f $line ] || error_exit "Cannot find transcription file '$line'";
      cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;'
    done < $tmpdir/${x}_phn.flist > $tmpdir/${x}_phn.trans
    paste $tmpdir/${x}_phn.uttids $tmpdir/${x}_phn.trans \
      | sort -k1,1 > ${x}.trans
  elif [ $trans_type = "char" ]
  then
    echo "char transcript!"
    find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.WRD' \
      | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_wrd.flist
    sed -e 's:.*/\(.*\)/\(.*\).WRD$:\1_\2:i' $tmpdir/${x}_wrd.flist \
      > $tmpdir/${x}_wrd.uttids
    while read line; do
      [ -f $line ] || error_exit "Cannot find transcription file '$line'";
      cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;' | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z A-Z]//g'
    done < $tmpdir/${x}_wrd.flist > $tmpdir/${x}_wrd.trans
    paste $tmpdir/${x}_wrd.uttids $tmpdir/${x}_wrd.trans \
      | sort -k1,1 > ${x}.trans
  else
    echo "WRONG!"
    echo $trans_type
    # BUGFIX: was 'exit 0', which reported success on an invalid
    # transcript type.
    exit 1;
  fi
  # Do normalization steps.
  cat ${x}.trans | $local/timit_norm_trans.pl -i - -m $local/phones.60-48-39.map -to 39 | sort > $x.text || exit 1;
  # cat ${x}.trans | sort > $x.text || exit 1;
  # Create wav.scp
  awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
  # Make the utt2spk and spk2utt files.
  cut -f1 -d'_' $x.uttids | paste -d' ' $x.uttids - > $x.utt2spk
  cat $x.utt2spk | $local/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
  # Prepare gender mapping (first letter of the speaker id: f/m).
  cat $x.spk2utt | awk '{print $1}' | perl -ane 'chop; m:^.:; $g = lc($&); print "$_ $g\n";' > $x.spk2gender
  if "${create_glm_stm}"; then
    # Prepare STM file for sclite:
    wav-to-duration --read-entire-file=true scp:${x}_wav.scp ark,t:${x}_dur.ark || exit 1
    awk -v dur=${x}_dur.ark \
'BEGIN{
while(getline < dur) { durH[$1]=$2; }
print ";; LABEL \"O\" \"Overall\" \"Overall\"";
print ";; LABEL \"F\" \"Female\" \"Female speakers\"";
print ";; LABEL \"M\" \"Male\" \"Male speakers\"";
}
{ wav=$1; spk=wav; sub(/_.*/,"",spk); $1=""; ref=$0;
gender=(substr(spk,0,1) == "f" ? "F" : "M");
printf("%s 1 %s 0.0 %f <O,%s> %s\n", wav, spk, durH[wav], gender, ref);
}
' ${x}.text >${x}.stm || exit 1
    # Create dummy GLM file for sclite:
    echo ';; empty.glm
[FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token
' > ${x}.glm
  fi
done
echo "Data preparation succeeded"
\ No newline at end of file
#!/usr/bin/env bash
# Copyright 2013 (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/train, etc.
. ./path.sh || exit 1;
echo "Preparing train, dev and test data"
srcdir=data/local/data
for x in train dev test; do
  mkdir -p data/$x
  # cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
  # Rewrite wav.scp to point at real wav files; tmp_wav.scp receives one
  # sph2pipe conversion command per line.
  local/sph2pipe_process.py $srcdir/${x}_wav.scp data/${x}/tmp_wav.scp data/${x}/wav.scp || exit 1;
  # Execute each conversion command.  BUGFIX: read -r keeps backslashes
  # intact and echo is quoted; $line itself is intentionally unquoted so
  # the command word-splits into program + arguments.
  while read -r line
  do
    echo "$line"
    $line
  done < data/${x}/tmp_wav.scp
  rm data/${x}/tmp_wav.scp
  cp $srcdir/$x.text data/$x/text || exit 1;
  cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
  cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
  tools/filter_scp.pl data/$x/spk2utt $srcdir/$x.spk2gender > data/$x/spk2gender || exit 1;
  [ -e $srcdir/${x}.stm ] && cp $srcdir/${x}.stm data/$x/stm
  [ -e $srcdir/${x}.glm ] && cp $srcdir/${x}.glm data/$x/glm
  # tools/validate_data_dir.sh --no-feats data/$x || exit 1
done
\ No newline at end of file
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script normalizes the TIMIT phonetic transcripts that have been
# extracted in a format where each line contains an utterance ID followed by
# the transcript, e.g.:
# fcke0_si1111 h# hh ah dx ux w iy dcl d ix f ay n ih q h#
my $usage = "Usage: timit_norm_trans.pl -i transcript -m phone_map -from [60|48] -to [48|39] > normalized\n
Normalizes phonetic transcriptions for TIMIT, by mapping the phones to a
smaller set defined by the -m option. This script assumes that the mapping is
done in the \"standard\" fashion, i.e. to 48 or 39 phones. The input is
assumed to have 60 phones (+1 for glottal stop, which is deleted), but that can
be changed using the -from option. The input format is assumed to be utterance
ID followed by transcript on the same line.\n";
use strict;
use Getopt::Long;
die "$usage" unless(@ARGV >= 1);
my ($in_trans, $phone_map, $num_phones_out);
my $num_phones_in = 60;  # input inventory size: 60 (default) or 48
GetOptions ("i=s" => \$in_trans, # Input transcription
            "m=s" => \$phone_map, # File containing phone mappings
            "from=i" => \$num_phones_in, # Input #phones: must be 60 or 48
            "to=i" => \$num_phones_out ); # Output #phones: must be 48 or 39
die $usage unless(defined($in_trans) && defined($phone_map) &&
                  defined($num_phones_out));
# Validate the requested inventory sizes and that the mapping shrinks them.
if ($num_phones_in != 60 && $num_phones_in != 48) {
  die "Can only used 60 or 48 for -from (used $num_phones_in)."
}
if ($num_phones_out != 48 && $num_phones_out != 39) {
  die "Can only used 48 or 39 for -to (used $num_phones_out)."
}
unless ($num_phones_out < $num_phones_in) {
  die "Argument to -from ($num_phones_in) must be greater than that to -to ($num_phones_out)."
}
open(M, "<$phone_map") or die "Cannot open mappings file '$phone_map': $!";
my (%phonemap, %seen_phones);
my $num_seen_phones = 0;  # count of distinct target phones seen in the map
# Read the phone mapping table.  Each line has three columns:
# <60-phone> <48-phone> <39-phone>; the source/target columns are picked
# according to -from/-to.  The glottal stop 'q' is dropped entirely.
while (<M>) {
  chomp;
  next if ($_ =~ /^q\s*.*$/); # Ignore glottal stops.
  m:^(\S+)\s+(\S+)\s+(\S+)$: or die "Bad line: $_";
  my $mapped_from = ($num_phones_in == 60)? $1 : $2;   # source column
  my $mapped_to = ($num_phones_out == 48)? $2 : $3;    # target column
  if (!defined($seen_phones{$mapped_to})) {
    $seen_phones{$mapped_to} = 1;
    $num_seen_phones += 1;
  }
  $phonemap{$mapped_from} = $mapped_to;
}
# Sanity check: the target inventory size must match the -to argument.
if ($num_seen_phones != $num_phones_out) {
  die "Trying to map to $num_phones_out phones, but seen only $num_seen_phones";
}
open(T, "<$in_trans") or die "Cannot open transcription file '$in_trans': $!";
# For each "<utt-id> <phone ...>" line: drop glottal stops, trim, and print
# the utterance id followed by each phone mapped through %phonemap
# (unmapped phones pass through unchanged).
while (<T>) {
  chomp;
  m:^(\S+)\s+(.+): or die "Bad line: $_";
  my ($utt_id, $trans) = ($1, $2);
  $trans =~ s/q//g;        # Remove glottal stops.
  $trans =~ s/^\s+//;      # Normalize spaces
  $trans =~ s/\s+$//;
  print $utt_id;
  foreach my $ph (split(/\s+/, $trans)) {
    if (exists $phonemap{$ph}) {
      print " $phonemap{$ph}";
    } else {
      print " $ph";
    }
  }
  print "\n";
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# converts an utt2spk file to a spk2utt file.
# Takes input from the stdin or from a file argument;
# output goes to the standard out.
if ( @ARGV > 1 ) {
die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}
# Read "utt spk" pairs (stdin or file argument) and print one line per
# speaker — "spk utt1 utt2 ..." — speakers in order of first appearance.
while (<>) {
  my @pair = split(" ", $_);
  @pair == 2 || die "Invalid line in utt2spk file: $_";
  my ($utt, $spk) = @pair;
  unless ($first_seen{$spk}) {
    $first_seen{$spk} = 1;
    push @speaker_order, $spk;
  }
  push @{$utts_of{$spk}}, $utt;
}
foreach my $spk (@speaker_order) {
  my $utt_list = join(' ', @{$utts_of{$spk}});
  print "$spk $utt_list\n";
}
#!/usr/bin/env bash
cmd="$@"
no_feats=false
no_wav=false
no_text=false
no_spk_sort=false
for x in `seq 4`; do
if [ "$1" == "--no-feats" ]; then
no_feats=true
shift;
fi
if [ "$1" == "--no-text" ]; then
no_text=true
shift;
fi
if [ "$1" == "--no-wav" ]; then
no_wav=true
shift;
fi
if [ "$1" == "--no-spk-sort" ]; then
no_spk_sort=true
shift;
fi
done
if [ $# -ne 1 ]; then
echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] <data-dir>"
echo "The --no-xxx options mean that the script does not require "
echo "xxx.scp to be present, but it will check it if it is present."
echo "--no-spk-sort means that the script does not require the utt2spk to be "
echo "sorted by the speaker-id in addition to being sorted by utterance-id."
echo "By default, utt2spk is expected to be sorted by both, which can be "
echo "achieved by making the speaker-id prefixes of the utterance-ids"
echo "e.g.: $0 data/train"
exit 1;
fi
data=$1
if [ ! -d $data ]; then
echo "$0: no such directory $data"
exit 1;
fi
if [ -f $data/images.scp ]; then
cmd=${cmd/--no-wav/} # remove --no-wav if supplied
image/validate_data_dir.sh $cmd
exit $?
fi
for f in spk2utt utt2spk; do
if [ ! -f $data/$f ]; then
echo "$0: no such file $f"
exit 1;
fi
if [ ! -s $data/$f ]; then
echo "$0: empty file $f"
exit 1;
fi
done
! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
echo "$0: $data/utt2spk has wrong format." && exit;
ns=$(wc -l < $data/spk2utt)
if [ "$ns" == 1 ]; then
echo "$0: WARNING: you have only one speaker. This probably a bad idea."
echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
echo " for more information."
fi
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
export LC_ALL=C
# Exit the whole script unless file $1 ends every line with a newline and
# its first column is C-locale sorted with no duplicate keys.
function check_sorted_and_uniq {
  ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
  ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \
    echo "$0: file $1 is not in sorted order or has duplicates" && exit 1;
}
# partial_diff <file1> <file2>
# Prints an abridged unified diff of the two files (first ~6 and last ~6
# lines of the diff) followed by a line-count summary; used as context for
# the caller's error messages.
# Idiom fixes: $(...) instead of backticks, 'wc -l < file' instead of
# 'cat | wc -l', quoted arguments, and 'local' counters so n1/n2 no longer
# leak into the global namespace.
function partial_diff {
  diff -U1 "$1" "$2" | (head -n 6; echo "..."; tail -n 6)
  local n1 n2
  n1=$(wc -l < "$1")
  n2=$(wc -l < "$2")
  echo "[Lengths are $1=$n1 versus $2=$n2]"
}
# utt2spk: sorted and unique on utterance-id.
check_sorted_and_uniq $data/utt2spk
# Unless --no-spk-sort, also insist that sorting utt2spk by the speaker
# column leaves it unchanged, i.e. speaker-ids group contiguously (normally
# achieved by making the speaker-id a prefix of the utterance-id).
if ! $no_spk_sort; then
! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \
echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \
echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
fi
check_sorted_and_uniq $data/spk2utt
# spk2utt must be the exact inverse mapping of utt2spk.
! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \
<(utils/spk2utt_to_utt2spk.pl $data/spk2utt) && \
echo "$0: spk2utt and utt2spk do not seem to match" && exit 1;
# Canonical utterance-id list; all later sections compare against this file.
cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts
if [ ! -f $data/text ] && ! $no_text; then
echo "$0: no such file $data/text (if this is by design, specify --no-text)"
exit 1;
fi
num_utts=`cat $tmpdir/utts | wc -l`
if [ -f $data/text ]; then
# validate_text.pl checks encoding/whitespace issues in the transcripts.
utils/validate_text.pl $data/text || exit 1;
check_sorted_and_uniq $data/text
text_len=`cat $data/text | wc -l`
# Symbols reserved by the lang directory must never occur in transcripts.
illegal_sym_list="<s> </s> #0"
for x in $illegal_sym_list; do
if grep -w "$x" $data/text > /dev/null; then
echo "$0: Error: in $data, text contains illegal symbol $x"
exit 1;
fi
done
# text must cover exactly the same utterance-ids as utt2spk.
awk '{print $1}' < $data/text > $tmpdir/utts.txt
if ! cmp -s $tmpdir/utts{,.txt}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.txt}
exit 1;
fi
fi
# A segments file without wav.scp is inconsistent: segments refers to
# recording-ids that only wav.scp can define.
if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then
echo "$0: in directory $data, segments file exists but no wav.scp"
exit 1;
fi
if [ ! -f $data/wav.scp ] && ! $no_wav; then
echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
exit 1;
fi
if [ -f $data/wav.scp ]; then
check_sorted_and_uniq $data/wav.scp
if grep -E -q '^\S+\s+~' $data/wav.scp; then
# note: it's not a good idea to have any kind of tilde in wav.scp, even if
# part of a command, as it would cause compatibility problems if run by
# other users, but this used to be not checked for so we let it slide unless
# it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which
# would definitely cause problems as the fopen system call does not do
# tilde expansion.
echo "$0: Please do not use tilde (~) in your wav.scp."
exit 1;
fi
if [ -f $data/segments ]; then
check_sorted_and_uniq $data/segments
# We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
# Each segments line is "<utt-id> <reco-id> <start> <end>" with end > start.
! cat $data/segments | \
awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \
echo "$0: badly formatted segments file" && exit 1;
segments_len=`cat $data/segments | wc -l`
if [ -f $data/text ]; then
! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \
echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \
echo "$0: Lengths are $segments_len vs $num_utts" && \
exit 1
fi
# Recording-id list derived from segments; must equal wav.scp's key list.
cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings
awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav
if ! cmp -s $tmpdir/recordings{,.wav}; then
echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.wav}
exit 1;
fi
if [ -f $data/reco2file_and_channel ]; then
# this file is needed only for ctm scoring; it's indexed by recording-id.
# Lines must be "<reco-id> <filename> <channel>", channel A or B; a channel
# of "1" is tolerated with a warning only.
check_sorted_and_uniq $data/reco2file_and_channel
! cat $data/reco2file_and_channel | \
awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
if ( NF == 3 && $3 == "1" ) {
warning_issued = 1;
} else {
print "Bad line ", $0; exit 1;
}
}
}
END {
if (warning_issued == 1) {
print "The channel should be marked as A or B, not 1! You should change it ASAP! "
}
}' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc
if ! cmp -s $tmpdir/recordings{,.r2fc}; then
echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.r2fc}
exit 1;
fi
fi
else
# No segments file -> assume wav.scp indexed by utterance.
cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav
if ! cmp -s $tmpdir/utts{,.wav}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.wav}
exit 1;
fi
if [ -f $data/reco2file_and_channel ]; then
# this file is needed only for ctm scoring; it's indexed by recording-id.
# (Same awk check as the segments branch; here the keys are utterance-ids.)
check_sorted_and_uniq $data/reco2file_and_channel
! cat $data/reco2file_and_channel | \
awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
if ( NF == 3 && $3 == "1" ) {
warning_issued = 1;
} else {
print "Bad line ", $0; exit 1;
}
}
}
END {
if (warning_issued == 1) {
print "The channel should be marked as A or B, not 1! You should change it ASAP! "
}
}' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc
if ! cmp -s $tmpdir/utts{,.r2fc}; then
echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.r2fc}
exit 1;
fi
fi
fi
fi
# feats.scp is required unless --no-feats was given; when present it must be
# keyed by exactly the utterance-ids of utt2spk.
if ! $no_feats && [ ! -f $data/feats.scp ]; then
  echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
  exit 1;
fi
if [ -f $data/feats.scp ]; then
  check_sorted_and_uniq $data/feats.scp
  awk '{print $1}' < $data/feats.scp > $tmpdir/utts.feats
  cmp -s $tmpdir/utts $tmpdir/utts.feats || {
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.feats
    exit 1;
  }
fi
# cmvn.scp (optional) is indexed by speaker-id; its key list must equal the
# speaker list of spk2utt.
if [ -f $data/cmvn.scp ]; then
  check_sorted_and_uniq $data/cmvn.scp
  awk '{print $1}' < $data/cmvn.scp > $tmpdir/speakers.cmvn
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  cmp -s $tmpdir/speakers $tmpdir/speakers.cmvn || {
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.cmvn
    exit 1;
  }
fi
# spk2gender (optional): "<spk-id> m|f", with speakers matching spk2utt
# exactly.
if [ -f $data/spk2gender ]; then
  check_sorted_and_uniq $data/spk2gender
  if ! awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' < $data/spk2gender; then
    echo "$0: Mal-formed spk2gender file"
    exit 1;
  fi
  awk '{print $1}' < $data/spk2gender > $tmpdir/speakers.spk2gender
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  cmp -s $tmpdir/speakers $tmpdir/speakers.spk2gender || {
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.spk2gender
    exit 1;
  }
fi
# spk2warp (optional): per-speaker VTLN warp factor, sanity-checked to the
# open interval (0.5, 1.5); speakers must match spk2utt exactly.
if [ -f $data/spk2warp ]; then
  check_sorted_and_uniq $data/spk2warp
  if ! awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' < $data/spk2warp; then
    echo "$0: Mal-formed spk2warp file"
    exit 1;
  fi
  awk '{print $1}' < $data/spk2warp > $tmpdir/speakers.spk2warp
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  cmp -s $tmpdir/speakers $tmpdir/speakers.spk2warp || {
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.spk2warp
    exit 1;
  }
fi
# utt2warp (optional): per-utterance VTLN warp factor in (0.5, 1.5);
# utterance-ids must match utt2spk exactly.
if [ -f $data/utt2warp ]; then
  check_sorted_and_uniq $data/utt2warp
  if ! awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' < $data/utt2warp; then
    echo "$0: Mal-formed utt2warp file"
    exit 1;
  fi
  awk '{print $1}' < $data/utt2warp > $tmpdir/utts.utt2warp
  # Re-derive the reference utterance list (as the original code did).
  awk '{print $1}' < $data/utt2spk > $tmpdir/utts
  cmp -s $tmpdir/utts $tmpdir/utts.utt2warp || {
    echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2warp
    exit 1;
  }
fi
# check some optionally-required things: vad.scp, utt2lang and utt2uniq
# must, when present, share utt2spk's utterance-id list exactly.
for f in vad.scp utt2lang utt2uniq; do
  [ -f $data/$f ] || continue
  check_sorted_and_uniq $data/$f
  if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
              <( awk '{print $1}' $data/$f ); then
    echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
    exit 1;
  fi
done
# utt2dur (optional): "<utt-id> <duration>", durations strictly positive,
# utterance-ids matching utt2spk.
if [ -f $data/utt2dur ]; then
  check_sorted_and_uniq $data/utt2dur
  awk '{print $1}' < $data/utt2dur > $tmpdir/utts.utt2dur
  cmp -s $tmpdir/utts $tmpdir/utts.utt2dur || {
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2dur
    exit 1;
  }
  awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' \
    < $data/utt2dur || exit 1
fi
# utt2num_frames (optional): "<utt-id> <frame-count>", counts must be
# positive integers; utterance-ids matching utt2spk.
if [ -f $data/utt2num_frames ]; then
  check_sorted_and_uniq $data/utt2num_frames
  awk '{print $1}' < $data/utt2num_frames > $tmpdir/utts.utt2num_frames
  cmp -s $tmpdir/utts $tmpdir/utts.utt2num_frames || {
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2num_frames
    exit 1
  }
  awk '{
    if (NF != 2 || !($2 > 0) || $2 != int($2)) {
      print "Bad line utt2num_frames:" NR ":" $0
      exit 1 } }' < $data/utt2num_frames || exit 1
fi
# reco2dur (optional): recording durations. If a segments file produced
# $tmpdir/recordings earlier, reco2dur's keys must match those
# recording-ids; otherwise wav.scp is utterance-indexed, so the keys must
# match the utterance list.
if [ -f $data/reco2dur ]; then
  check_sorted_and_uniq $data/reco2dur
  awk '{print $1}' < $data/reco2dur > $tmpdir/recordings.reco2dur
  if [ -f $tmpdir/recordings ]; then
    cmp -s $tmpdir/recordings $tmpdir/recordings.reco2dur || {
      echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/recordings $tmpdir/recordings.reco2dur
      exit 1;
    }
  else
    cmp -s $tmpdir/utts $tmpdir/recordings.reco2dur || {
      echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/utts $tmpdir/recordings.reco2dur
      exit 1;
    }
  fi
  awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' \
    < $data/reco2dur || exit 1
fi
echo "$0: Successfully validated data-directory $data"
# --- runtime environment setup (path.sh) ---
# WENET_DIR: repository root, three directory levels above this example.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
# Put the runtime binaries, kaldi tools and openfst tools on PATH.
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Make the wenet python package importable from this example directory.
export PYTHONPATH=../../../:$PYTHONPATH
# NOTE(review): a second script (a WeNet TIMIT run.sh) appears to begin here;
# this file looks like a concatenation of several scripts.
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many gpu you use, It's 1-gpu training if you specify
# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch
export CUDA_VISIBLE_DEVICES="0"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from 0 if you need to start from data preparation
stop_stage=4
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want run multi-machine training
num_nodes=1
# The rank of each node or machine, range from 0 to num_nodes -1
# The first node/machine sets node_rank 0, the second one sets node_rank 1
# the third one set node_rank 2, and so on. Default 0
node_rank=0
# data
# Site-specific absolute path to the raw TIMIT corpus; override via
# --timit_data (see parse_options.sh below).
timit_data=/home/Liangcd/data/timit
# path to save preproecssed data
# export data=data
nj=16
# data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
# `shard` is used for large dataset which is over 1k hours, and `shard` is
# faster on reading data and training.
data_type=raw
num_utts_per_shard=1000
train_set=train
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding
# 6. conf/train_u2++_conformer.yaml: U2++ conformer
# 7. conf/train_u2++_transformer.yaml: U2++ transformer
train_config=conf/train_transformer.yaml
cmvn=true
dir=exp/transformer_phn_5k_acc4_bs16
checkpoint=
# use average_checkpoint will get better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=20
decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
# choose in [phn]
trans_type=phn
dict=data/dict/${trans_type}_units.txt
# parse_options.sh lets any variable defined above be overridden on the
# command line as "--name value".
. tools/parse_options.sh || exit 1;
# Stage 0: turn the raw TIMIT distribution into Kaldi-style data directories.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  echo "stage 0: Data preparation"
  echo "preparing data for TIMIT for ${trans_type} level transcripts"
  if ! local/timit_data_prep.sh ${timit_data} ${trans_type}; then
    exit 1;
  fi
  local/timit_format_data.sh
  echo "Finish stage 0"
fi
# Stage 1: estimate global CMVN statistics over the training waveforms;
# consumed later when cmvn=true.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "stage 1: compute global cmvn"
  # Feature extraction settings come from $train_config so the statistics
  # match what training will compute.
  tools/compute_cmvn_stats.py \
    --num_workers 16 \
    --train_config $train_config \
    --in_scp data/${train_set}/wav.scp \
    --out_cmvn data/${train_set}/global_cmvn
  echo "Finish stage 1"
fi
# Stage 2: build the token dictionary from the training transcripts.
# Layout: id 0 = <blank> (CTC), id 1 = <unk>, then one phone per line,
# final id = <sos/eos>.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: make train dict"
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# Tokenize transcripts at phone level, flatten to one token per line,
# deduplicate, drop empty lines, and number tokens starting at 2
# (NR+1 because ids 0 and 1 are already taken).
tools/text2token.py -s 1 -n 1 --space sil --trans_type ${trans_type} data/${train_set}/text \
| cut -f 2- -d" " | tr " " "\n" | sort | uniq | grep -v -e '^\s*$' | \
awk '{print $0 " " NR+1}' >> ${dict}
wc -l ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
echo "Finish stage 2"
fi
# Stage 3: convert each split into WeNet's list format ('shard' tar archives
# for large corpora, plain 'raw' lists otherwise).
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "stage 3: Prepare data, prepare required format"
  for x in dev test ${train_set}; do
    case $data_type in
      shard)
        tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
          --num_threads 16 data/$x/wav.scp data/$x/text \
          $(realpath data/$x/shards) data/$x/data.list
        ;;
      *)
        tools/make_raw_list.py data/$x/wav.scp data/$x/text \
          data/$x/data.list
        ;;
    esac
  done
  echo "Finish stage 3"
fi
# Stage 4: DDP training, one backgrounded process per visible GPU.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  mkdir -p $dir
  # You have to rm `INIT_FILE` manually when you resume or restart a
  # multi-machine training.
  INIT_FILE=$dir/ddp_init
  init_method=file://$(readlink -f $INIT_FILE)
  echo "$0: init method is $init_method"
  # Number of GPUs = number of comma-separated entries in CUDA_VISIBLE_DEVICES.
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  # Use "nccl" if it works, otherwise use "gloo"
  dist_backend="gloo"
  # Total process count across all nodes. (Builtin $(( )) arithmetic instead
  # of spawning 'expr' in backticks.)
  world_size=$((num_gpus * num_nodes))
  echo "total gpus is: $world_size"
  cmvn_opts=
  $cmvn && cp data/${train_set}/global_cmvn $dir
  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
  # train.py rewrites $train_config to $dir/train.yaml with model input and
  # output dimensions; $dir/train.yaml is then used for inference and export.
  for ((i = 0; i < num_gpus; ++i)); do
    {
      # i-th entry of CUDA_VISIBLE_DEVICES; $(( )) replaces the deprecated
      # $[ ] arithmetic syntax.
      gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
      # Global rank of this gpu/process, used to tell the master from the
      # workers.
      rank=$((node_rank * num_gpus + i))
      python wenet/bin/train.py --gpu $gpu_id \
        --config $train_config \
        --data_type $data_type \
        --symbol_table $dict \
        --train_data data/$train_set/data.list \
        --cv_data data/dev/data.list \
        ${checkpoint:+--checkpoint $checkpoint} \
        --model_dir $dir \
        --ddp.init_method $init_method \
        --ddp.world_size $world_size \
        --ddp.rank $rank \
        --ddp.dist_backend $dist_backend \
        --num_workers 1 \
        $cmvn_opts \
        --pin_memory
    } &
  done
  # Barrier: wait for every per-GPU trainer to finish.
  wait
fi
# Stage 5: decode the test set with each decoding mode in parallel and
# score it. NOTE(review): the $test_dir/wer written here is overwritten by
# stage 6's re-scoring on the normalized hypotheses.
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
# Averaging the best checkpoints (by validation loss, --val_best) usually
# beats decoding from final.pt alone.
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Please specify decoding_chunk_size for unified streaming and
# non-streaming model. The default value is -1, which is full chunk
# for non-streaming inference.
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
# One background decoding job per mode; all write to disjoint test_${mode}
# directories.
for mode in ${decode_modes}; do
{
test_dir=$dir/test_${mode}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} \
--connect_symbol
# NOTE(review): --connect_symbol is passed without a value here — verify
# recognize.py accepts it bare (it normally takes the joiner string).
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
wait
fi
# Stage 6: re-score WER after mapping the "▁" separator in the hypothesis
# text back to spaces.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  # compute wer
  for test_set in test; do
    for mode in ${decode_modes}; do
      test_dir=$dir/test_${mode}
      sed 's:▁: :g' $test_dir/text > $test_dir/text.norm
      python tools/compute-wer.py --char=1 --v=1 \
        data/$test_set/text $test_dir/text.norm > $test_dir/wer
    done
  done
fi
# NOTE(review): removed trailing web-UI residue from the code-hosting page
# ("Markdown is supported", "Please register or to comment", a bare
# "../../../tools/" path, etc.). It was not shell code and, if executed,
# would only have produced command-not-found errors at the end of the script.