Commit 764b3a75 authored by Sugon_ldc

add new model

# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 31
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 2000
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
# batch_size: 32
batch_size: 16
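# note (illustrative): 'static' batching puts exactly batch_size utterances in each
# batch; 'dynamic' batching instead caps the total number of frames per batch
# (typically configured with a max_frames_in_batch option)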
grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
A ey
B b iy
C s iy
D d iy
E iy
F eh f
G jh iy
H ey ch
I ay
J jh ey
K k ey
L eh l
M eh m
N eh n
O ow
P p iy
Q k y uw
R aa r
S eh s
T t iy
U y uw
V v iy
W d ah b ax l y uw
X eh k s
Y w ay
Z z iy
1d0
< file: $SWB/data/dictionary/sw-ms98-dict.text
8645a8646
> uh-hum ah m hh ah m
9006c9007
< April ey p r ih l
---
> April ey p r ax l
9144d9144
< B ay zh aa n iy z
9261c9261
< Battle b ae t el
---
> Battle b ae t ax l
10014a10015
> Chevy sh eh v iy
10211a10213
> Colorado k ao l ax r aa d ow
10212a10215
> Colorado' k ao l ax r aa d ow z
10370c10373
< Creek k r ih k
---
> Creek k r iy k
10889a10893
> Eleven ax l eh v ih n
10951c10955
< Erie ih r iy
---
> Erie iy r iy
11183c11187
< Forever f ax r eh v er
---
> Forever f er eh v er
11231a11236
> Friday f r ay d iy
11744a11750
> History hh ih s t r iy
12004a12011,12012
> Israel ih z r ih l
> Israel's ih z r ih l z
12573a12582
> Lincoln l ih ng k ih n
12574a12584
> Lincolns l ih ng k ih n z
13268c13278
< NAACP eh ey ey s iy p iy
---
> NAACP eh n ey ey s iy p iy
13286c13296
< NIT eh ay t iy
---
> NIT eh n ay t iy
13292c13302
< NTSC eh t iy eh s s iy
---
> NTSC eh n t iy eh s s iy
14058a14069
> Quarter k ow r t er
14059a14071
> Quarterback k ow r t er b ae k
14060a14073
> Quarters k ow r t er z
14569a14583
> Science s ay n s
15087a15102
> Sunday s ah n d iy
15088a15104
> Sunday's s ah n d iy z
15089a15106
> Sundays s ah n d iy z
15290,15291c15307,15308
< Texan t eh k sh ih n
< Texan's t eh k sh ih n s
---
> Texan t eh k s ih n
> Texan's t eh k s ih n s
15335a15353
> Thousands th aw z ih n z
15739c15757
< Waco w ae k ow
---
> Waco w ey k ow
15841a15860
> Weekends w iy k eh n z
16782a16802
> acceptable eh k s eh p ax b ax l
16833a16854
> accounting ax k aw n ih ng
16948a16970
> address ax d r eh s
17281a17304
> already aa r d iy
17315a17339
> am m
17709a17734
> asked ae s t
17847a17873
> attorney ih t er n iy
17919a17946
> autopilot ao t ow p ay l ih t
17960a17988
> awfully ao f l iy
18221a18250
> basketball b ae s k ax b ao l
18222a18252
> basketball's b ae s k ax b ao l z
18302a18333
> become b ah k ah m
18303a18335
> becomes b iy k ah m z
18344a18377
> began b ax g en n
18817c18850
< bottle b aa t el
---
> bottle b aa t ax l
19332,19333c19365,19367
< camera's k ae m ax r ax z
< cameras k ae m ax r ax z
---
> camera k ae m r ax
> camera's k ae m r ax z
> cameras k ae m r ax z
19411a19446
> capital k ae p ax l
19505a19541
> carrying k ae r ih ng
20316a20353,20354
> combination k aa m ih n ey sh ih n
> combinations k aa m ih n ey sh ih n z
20831a20870
> contracts k aa n t r ae k s
21010a21050
> costs k ao s
21062a21103
> county k aw n iy
21371a21413
> cultural k ao l ch ax r ax l
21372a21415
> culturally k ao l ch ax r ax l iy
21373a21417
> culture k ao l ch er
21375a21420
> cultures k ao l ch er z
21543a21589
> data d ey t ax
22097a22144
> differently d ih f ax r ih n t l iy
22972a23020
> effects ax f eh k t s
23016a23065
> election ax l eh k sh ih n
23018a23068
> elections ax l eh k sh ih n z
23052a23103
> eleven ax l eh v ih n
23242a23294
> enjoyable ae n jh oy ax b ax l
23248a23301
> enjoys ae n jh oy z
23293a23347
> entire ih n t ay r
23295a23350,23351
> entirely ih n t ay r l iy
> entirety ih n t ay r t iy
23745a23802
> extra eh k s t er
23818a23876
> facts f ae k s
24508c24566
< forever f ax r eh v er
---
> forever f er eh v er
24514c24572
< forget f ow r g eh t
---
> forget f er r g eh t
24521a24580
> forgot f er r g aa t
24522a24582
> forgotten f er r g aa t ax n
24563a24624
> forward f ow er d
24680a24742
> frightening f r ay t n ih ng
24742a24805
> full-time f ax l t ay m
24862a24926
> garage g r aa jh
25218a25283
> grandmother g r ae m ah dh er
25790a25856
> heavily hh eh v ax l iy
25949a26016
> history hh ih s t r iy
26038a26106
> honestly aa n ax s t l iy
26039a26108
> honesty aa n ax s t iy
26099a26169
> horror hh ow r
26155a26226
> houses hh aw z ih z
26184c26255
< huh-uh hh ah hh ah
---
> huh-uh ah hh ah
26189c26260
< hum-um hh m hh m
---
> hum-um ah m hh ah m
26236a26308
> hunting hh ah n ih ng
26307a26380,26381
> ideal ay d iy l
> idealist ay d iy l ih s t
26369a26444
> imagine m ae jh ih n
26628a26704
> individuals ih n d ih v ih jh ax l z
26968a27045
> interest ih n t r ih s t
27184a27262
> it'd ih d
27702a27781
> lead l iy d
28378a28458
> mandatory m ae n d ih t ow r iy
28885a28966
> minute m ih n ih t
29167a29249
> mountains m aw t n z
29317a29400
> mysteries m ih s t r iy z
29318a29402
> mystery m ih s t r iy
29470a29555
> nervous n er v ih s
29578,29580c29663,29665
< nobody n ow b aa d iy
< nobody'll n ow b aa d iy l
< nobody's n ow b aa d iy z
---
> nobody n ow b ah d iy
> nobody'll n ow b ah d iy l
> nobody's n ow b ah d iy z
29712a29798
> nuclear n uw k l iy r
29938a30025
> onto aa n t ax
30051a30139
> originally ax r ih jh ax l iy
30507a30596
> particularly p er t ih k y ax l iy
30755a30845
> perfectly p er f ih k l iy
30820a30911
> personally p er s n ax l iy
30915a31007
> physically f ih z ih k l iy
30986a31079
> pilot p ay l ih t
30987a31081
> pilot's p ay l ih t s
31227a31322
> police p l iy s
31513a31609
> prefer p er f er
31553a31650
> prepare p r ax p ey r
31578a31676
> prescription p er s k r ih p sh ih n
31579a31678
> prescriptions p er s k r ih p sh ih n z
31770a31870
> products p r aa d ax k s
31821a31922
> projects p r aa jh eh k s
31908a32010
> protect p er t eh k t
31909a32012
> protected p er t eh k t ih d
31911a32015
> protection p er t eh k sh ih n
31914a32019
> protection p er t eh k t ih v
32149a32255
> quarter k ow r t er
32414a32521
> read r iy d
32785a32893
> rehabilitation r iy ax b ih l ih t ey sh ih n
33150a33259
> resource r ih s ow r s
33151a33261
> resources r iy s ow r s ih z
33539c33649
< roots r uh t s
---
> roots r uw t s
33929a34040
> science s ay n s
34315a34427
> seventy s eh v ih n iy
34319,34320c34431,34432
< severe s ax v iy r
< severely s ax v iy r l iy
---
> severe s ih v iy r
> severely s ih v iy r l iy
35060a35173
> software s ao f w ey r
35083a35197
> solid s ao l ih d
35084a35199
> solidly s ao l ih d l iy
35750a35866
> stood s t ih d
35854a35971
> strictly s t r ih k l iy
35889c36006
< stronger s t r ao ng er
---
> stronger s t r ao ng g er
36192a36310,36311
> supposed s p ow z
> supposed s p ow s
36510a36630
> tastes t ey s
36856a36977
> thoroughly th er r l iy
36866a36988
> thousands th aw z ih n z
37081c37203
< toots t uh t s
---
> toots t uw t s
37157a37280
> toward t w ow r d
37158a37282
> towards t w ow r d z
37564a37689
> twenties t w eh n iy z
37565a37691
> twentieth t w eh n iy ih th
37637a37764
> unacceptable ah n ae k s eh p ax b ax l
37728a37856
> understand ah n d er s t ae n
37860a37989
> unless ih n l eh s
38040a38170
> use y uw z
38049a38180
> uses y uw z ih z
38125a38257
> various v ah r iy ih s
38202a38335
> versus v er s ih z
38381c38514
< wacko w ae k ow
---
> wacko w ey k ow
38455c38588
< wanna w aa n ax
---
> wanna w ah n ax
38675c38808
< whatnot w ah t n aa t
---
> whatnot w aa t n aa t
38676a38810
> whatsoever w aa t s ow eh v er
38890c39024
< wok w aa k
---
> wok w ao k
38910a39045
> wondering w ah n d r ih ng
#!/usr/bin/env bash
# Hub-5 Eval 2000 data preparation
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
# The input is two directory names (possibly the same) containing the
# 2000 Hub5 english evaluation test set and transcripts, which are
# respectively: LDC2002S09 LDC2002T43
# e.g. see
# http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09
# http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43
#
# Example usage:
# local/eval2000_data_prep_edin.sh /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000/transcr
# The first directory ($sdir) contains the speech data, and the directory
# $sdir/english/ must exist.
# The second directory ($tdir) contains the transcripts, and the directory
# $tdir/reference must exist; in particular we need the file
# $tdir/reference/hub5e00.english.000405.stm
if [ $# -ne 2 ]; then
echo "Usage: "`basename $0`" <speech-dir> <transcription-dir>"
echo "See comments in the script for more details"
exit 1
fi
sdir=$1
tdir=$2
[ ! -d $sdir/english ] \
&& echo Expecting directory $sdir/english to be present && exit 1;
[ -d $tdir/2000_hub5_eng_eval_tr ] \
&& tdir=$tdir/2000_hub5_eng_eval_tr
[ ! -d $tdir/reference ] \
&& echo Expecting directory $tdir/reference to be present && exit 1;
. ./path.sh
dir=data/local/eval2000
mkdir -p $dir
find -L $sdir/english -iname '*.sph' | sort > $dir/sph.flist
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
> $dir/sph.scp
# Get segments file...
# segments file format is: utt-id side-id start-time end-time, e.g.:
# sw02001-A_000098-001156 sw02001-A 0.98 11.56
pem=$sdir/english/hub5e_00.pem
[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1;
# pem file has lines like:
# en_4156 A unknown_speaker 301.85 302.48
# we ignore the warnings below for now, although they seem to indicate some problems
# with the data.
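# For illustration, the pem line shown above becomes (up to rounding in the %06d
# conversion) the segments entry
#   en_4156-A_030185-030248 en_4156-A 301.85 302.48
# i.e. utt-id = <conv>-<side>_<start*100>-<end*100>; extend_segments.pl then pads
# each segment by 0.1 s at both ends while avoiding overlaps.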
grep -v ';;' $pem \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
print utt,spk,$4,$5;}' \
| sort -u | local/extend_segments.pl 0.1 > $dir/segments
# stm file has lines like:
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
# TODO(arnab): We should really be lowercasing this since the Edinburgh
# recipe uses lowercase. This is not used in the actual scoring.
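# For illustration, the stm line shown above becomes (up to rounding in the %06d
# conversion) the text.all entry
#   en_4156-A_035764-035964 HE IS A POLICE OFFICER
# i.e. fields 1-2 form the side id, fields 4-5 give the times, and fields 7 onward
# are kept as the transcript.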
grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
| sort > $dir/text.all
# We'll use the stm file for sclite scoring. There seem to be various errors
# in the stm file that upset hubscr.pl, and we fix them here.
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
$tdir/reference/hub5e00.english.000405.stm > $dir/stm
cp $tdir/reference/en20000405_hub5.glm $dir/glm
# the next line uses process substitution
# Just checking that the segments are the same in pem vs. stm.
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
echo "Segments from pem file and stm file do not match." && exit 1;
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
# side A - channel 1, side B - channel 2
bash tools/sph2wav.sh --nj 16 $dir/sph.scp $dir/segments $dir/wav.scp
# create an utt2spk file that assumes each conversation side is
# a separate speaker.
awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
# cp $dir/segments $dir/segments.tmp
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
# $dir/segments.tmp > $dir/segments
awk '{print $1}' $dir/wav_ori.scp \
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1;
dest=data/eval2000
mkdir -p $dest
for x in wav.scp text utt2spk spk2utt; do
cp $dir/$x $dest/$x
done
echo Data preparation and formatting completed for Eval 2000
echo "(but not MFCC extraction)"
tools/fix_data_dir.sh $dest
if [ $(wc -l < $dest/wav.scp) -ne 80 ]; then
echo "$0: error: expected 80 lines in wav.scp, got $(wc -l < $dest/wav.scp)"
exit 1;
fi
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
if (@ARGV != 1 || !($ARGV[0] =~ m/^-?\d+\.?\d*$/ && $ARGV[0] >= 0)) {
print STDERR "Usage: extend_segments.pl time-in-seconds <segments >segments.extended \n" .
"e.g. extend_segments.pl 0.25 <segments.1 >segments.2\n" .
"This command modifies a segments file, with lines like\n" .
" <utterance-id> <recording-id> <start-time> <end-time>\n" .
"by extending the beginning and end of each segment by a certain\n" .
"length of time. This script makes sure the output segments do not\n" .
"overlap as a result of this time-extension, and that there are no\n" .
"negative times in the output.\n";
exit 1;
}
$extend = $ARGV[0];
@all_lines = ();
while (<STDIN>) {
chop;
@A = split(" ", $_);
if (@A != 4) {
die "invalid line in segments file: $_";
}
$line = @all_lines; # current number of lines.
($utt_id, $reco_id, $start_time, $end_time) = @A;
push @all_lines, [ $utt_id, $reco_id, $start_time, $end_time ]; # anonymous array.
if (! defined $lines_for_reco{$reco_id}) {
$lines_for_reco{$reco_id} = [ ]; # push new anonymous array.
}
push @{$lines_for_reco{$reco_id}}, $line;
}
foreach $reco_id (keys %lines_for_reco) {
$ref = $lines_for_reco{$reco_id};
@line_numbers = sort { ${$all_lines[$a]}[2] <=> ${$all_lines[$b]}[2] } @$ref;
{
# handle start of earliest segment as a special case.
$l0 = $line_numbers[0];
$tstart = ${$all_lines[$l0]}[2] - $extend;
if ($tstart < 0.0) { $tstart = 0.0; }
${$all_lines[$l0]}[2] = $tstart;
}
{
# handle end of latest segment as a special case.
$lN = $line_numbers[$#line_numbers];
$tend = ${$all_lines[$lN]}[3] + $extend;
${$all_lines[$lN]}[3] = $tend;
}
for ($i = 0; $i < $#line_numbers; $i++) {
$ln = $line_numbers[$i];
$ln1 = $line_numbers[$i+1];
$tend = ${$all_lines[$ln]}[3]; # end of earlier segment.
$tstart = ${$all_lines[$ln1]}[2]; # start of later segment.
if ($tend > $tstart) {
$utt1 = ${$all_lines[$ln]}[0];
$utt2 = ${$all_lines[$ln1]}[0];
print STDERR "Warning: for utterances $utt1 and $utt2, segments " .
"already overlap; leaving these times unchanged.\n";
} else {
$my_extend = $extend;
$max_extend = 0.5 * ($tstart - $tend);
if ($my_extend > $max_extend) { $my_extend = $max_extend; }
$tend += $my_extend;
$tstart -= $my_extend;
${$all_lines[$ln]}[3] = $tend;
${$all_lines[$ln1]}[2] = $tstart;
}
}
}
# print the lines back out in their original input order.
for ($l = 0; $l < @all_lines; $l++) {
$ref = $all_lines[$l];
($utt_id, $reco_id, $start_time, $end_time) = @$ref;
printf("%s %s %.2f %.2f\n", $utt_id, $reco_id, $start_time, $end_time);
}
__END__
# testing below.
# ( echo a1 A 0 1; echo a2 A 3 4; echo b1 B 0 1; echo b2 B 2 3 ) | local/extend_segments.pl 1.0
a1 A 0.00 2.00
a2 A 2.00 5.00
b1 B 0.00 1.50
b2 B 1.50 4.00
# ( echo a1 A 0 2; echo a2 A 1 3 ) | local/extend_segments.pl 1.0
Warning: for utterances a1 and a2, segments already overlap; leaving these times unchanged.
a1 A 0.00 2.00
a2 A 1.00 4.00
# ( echo a1 A 0 2; echo a2 A 5 6; echo a3 A 3 4 ) | local/extend_segments.pl 1.0
a1 A 0.00 2.50
a2 A 4.50 7.00
a3 A 2.50 4.50
#!/usr/bin/env python3
# Copyright 2015 Minhua Wu
# Apache 2.0
# convert acronyms in swbd dict to fisher convention
# IBM to i._b._m.
# BBC to b._b._c.
# BBCs to b._b._c.s
# BBC's to b._b._c.'s
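# For illustration (pronunciations built from MSU_single_letter.txt), a lexicon entry
#   BBC b iy b iy s iy      is rewritten as   b._b._c. b iy b iy s iy
# and
#   BBCs b iy b iy s iy z   is rewritten as   b._b._c.s b iy b iy s iy z
# while the acronyms map gets the line "BBC<tab>b._b._c.<tab>b b c" for later use on
# the transcripts.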
import argparse
import re
__author__ = "Minhua Wu"
parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
parser.add_argument("-i", "--input", help="Input lexicon", required=True)
parser.add_argument("-o", "--output", help="Output lexicon", required=True)
parser.add_argument(
"-L", "--Letter", help="Input single letter pronunciation", required=True
)
parser.add_argument("-M", "--Map", help="Output acronyms mapping", required=True)
args = parser.parse_args()
fin_lex = open(args.input, "r")
fin_Letter = open(args.Letter, "r")
fout_lex = open(args.output, "w")
fout_map = open(args.Map, "w")
# Initialise single letter dictionary
dict_letter = {}
for single_letter_lex in fin_Letter:
items = single_letter_lex.split()
dict_letter[items[0]] = single_letter_lex[len(items[0]) + 1 :].strip()
fin_Letter.close()
# print dict_letter
for lex in fin_lex:
items = lex.split()
word = items[0]
lexicon = lex[len(items[0]) + 1 :].strip()
# find acronyms from words with only letters and '
pre_match = re.match(r"^[A-Za-z]+$|^[A-Za-z]+\'s$|^[A-Za-z]+s$", word)
if pre_match:
# check whether a word of the form xxx's is an acronym
if word[-2:] == "'s" and (lexicon[-1] == "s" or lexicon[-1] == "z"):
actual_word = word[:-2]
actual_lexicon = lexicon[:-2]
acronym_lexicon = ""
for w in actual_word:
acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " "
if acronym_lexicon.strip() == actual_lexicon:
acronym_mapped = ""
acronym_mapped_back = ""
for w in actual_word[:-1]:
acronym_mapped = acronym_mapped + w.lower() + "._"
acronym_mapped_back = acronym_mapped_back + w.lower() + " "
acronym_mapped = acronym_mapped + actual_word[-1].lower() + ".'s"
acronym_mapped_back = (
acronym_mapped_back + actual_word[-1].lower() + "'s"
)
fout_map.write(
word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n"
)
fout_lex.write(acronym_mapped + " " + lexicon + "\n")
else:
fout_lex.write(lex)
# check whether a word of the form xxxs is an acronym
elif word[-1] == "s" and (lexicon[-1] == "s" or lexicon[-1] == "z"):
actual_word = word[:-1]
actual_lexicon = lexicon[:-2]
acronym_lexicon = ""
for w in actual_word:
acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " "
if acronym_lexicon.strip() == actual_lexicon:
acronym_mapped = ""
acronym_mapped_back = ""
for w in actual_word[:-1]:
acronym_mapped = acronym_mapped + w.lower() + "._"
acronym_mapped_back = acronym_mapped_back + w.lower() + " "
acronym_mapped = acronym_mapped + actual_word[-1].lower() + ".s"
acronym_mapped_back = (
acronym_mapped_back + actual_word[-1].lower() + "'s"
)
fout_map.write(
word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n"
)
fout_lex.write(acronym_mapped + " " + lexicon + "\n")
else:
fout_lex.write(lex)
# check whether a word of the form xxx (not ending in 's or s) is an acronym
elif word.find("'") == -1 and word[-1] != "s":
acronym_lexicon = ""
for w in word:
acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " "
if acronym_lexicon.strip() == lexicon:
acronym_mapped = ""
acronym_mapped_back = ""
for w in word[:-1]:
acronym_mapped = acronym_mapped + w.lower() + "._"
acronym_mapped_back = acronym_mapped_back + w.lower() + " "
acronym_mapped = acronym_mapped + word[-1].lower() + "."
acronym_mapped_back = acronym_mapped_back + word[-1].lower()
fout_map.write(
word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n"
)
fout_lex.write(acronym_mapped + " " + lexicon + "\n")
else:
fout_lex.write(lex)
else:
fout_lex.write(lex)
else:
fout_lex.write(lex)
#!/usr/bin/env python3
# Copyright 2015 Minhua Wu
# Apache 2.0
# convert acronyms in swbd transcript to fisher convention
# according to first two columns in the input acronyms mapping
import argparse
import re
__author__ = "Minhua Wu"
parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
parser.add_argument("-i", "--input", help="Input transcripts", required=True)
parser.add_argument("-o", "--output", help="Output transcripts", required=True)
parser.add_argument("-M", "--Map", help="Input acronyms mapping", required=True)
args = parser.parse_args()
fin_map = open(args.Map, "r")
dict_acronym = {}
dict_acronym_noi = {} # Mapping of acronyms without I, i
for pair in fin_map:
items = pair.split("\t")
dict_acronym[items[0]] = items[1]
dict_acronym_noi[items[0]] = items[1]
fin_map.close()
del dict_acronym_noi["I"]
del dict_acronym_noi["i"]
fin_trans = open(args.input, "r")
fout_trans = open(args.output, "w")
for line in fin_trans:
items = line.split()
L = len(items)
# First pass: map I when it appears as part of an acronym
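# Scan left (x) and right (y) from this "I" over runs of single capital letters;
# e.g. in "I B M" all three tokens form one run, so the whole run is mapped through
# dict_acronym. A standalone "I" (x + y == 0) is left untouched here and is also
# skipped by the second pass, which uses dict_acronym_noi.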
for i in range(L):
if items[i] == "I":
x = 0
while i - 1 - x >= 0 and re.match(r"^[A-Z]$", items[i - 1 - x]):
x += 1
y = 0
while i + 1 + y < L and re.match(r"^[A-Z]$", items[i + 1 + y]):
y += 1
if x + y > 0:
for bias in range(-x, y + 1):
items[i + bias] = dict_acronym[items[i + bias]]
# Second pass mapping (not mapping 'i' and 'I')
for i in range(len(items)):
if items[i] in dict_acronym_noi.keys():
items[i] = dict_acronym_noi[items[i]]
sentence = " ".join(items[1:])
fout_trans.write(items[0] + " " + sentence.lower() + "\n")
fin_trans.close()
fout_trans.close()
#!/usr/bin/env bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
. ./path.sh
#check existing directories
if [ $# != 1 ]; then
echo "Usage: swbd1_data_download.sh /path/to/SWBD"
exit 1;
fi
SWBD_DIR=$1
dir=data/local/train
mkdir -p $dir
# Audio data directory check
if [ ! -d $SWBD_DIR ]; then
echo "Error: SWBD directory $SWBD_DIR does not exist"
exit 1;
fi
# Trans directory check
if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then
(
cd $dir;
if [ ! -d swb_ms98_transcriptions ]; then
echo " *** Downloading transcriptions and dictionary ***"
wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz ||
wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz
tar -xf switchboard_word_alignments.tar.gz
fi
)
else
echo "Directory with transcriptions exists, skipping downloading"
[ -f $dir/swb_ms98_transcriptions ] \
|| ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/
fi
#!/usr/bin/env bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
## The second input is optional, which should point to a directory containing
## Switchboard transcriptions/documentations (specifically, the conv.tab file).
## If specified, the script will try to use the actual speaker PINs provided
## with the corpus instead of the conversation side ID (Kaldi default). We
## will be using "find" to locate this file so we don't make any assumptions
## on the directory structure. (Peng Qi, Aug 2014)
. ./path.sh
#check existing directories
if [ $# != 1 -a $# != 2 ]; then
echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]"
exit 1;
fi
SWBD_DIR=$1
dir=data/local/train
mkdir -p $dir
# Audio data directory check
if [ ! -d $SWBD_DIR ]; then
echo "Error: SWBD directory $SWBD_DIR does not exist"
exit 1;
fi
# Option A: SWBD dictionary file check
[ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \
echo "SWBD dictionary file does not exist" && exit 1;
# find sph audio files
find -L $SWBD_DIR -iname '*.sph' | sort > $dir/sph.flist
n=`cat $dir/sph.flist | wc -l`
[ $n -ne 2435 ] && [ $n -ne 2438 ] && \
echo Warning: expected 2435 or 2438 data files, found $n
# (1a) Transcriptions preparation
# make basic transcription file (add segments info)
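# For illustration, a *-trans.text line like (hypothetical words)
#   sw2001A-ms98-a-0001 0.98 11.56 HI UM YEAH
# becomes
#   sw02001-A_000098-001156 HI UM YEAH
# matching the utterance-id convention used in the segments file below.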
awk '{
name=substr($1,1,6); gsub("^sw","sw0",name); side=substr($1,7,1);
stime=$2; etime=$3;
printf("%s-%s_%06.0f-%06.0f",
name, side, int(100*stime+0.5), int(100*etime+0.5));
for(i=4;i<=NF;i++) printf(" %s", $i); printf "\n"
}' $dir/swb_ms98_transcriptions/*/*/*-trans.text > $dir/transcripts1.txt
# test if trans. file is sorted
export LC_ALL=C;
sort -c $dir/transcripts1.txt || exit 1; # check it's sorted.
# Remove SILENCE, <B_ASIDE> and <E_ASIDE>.
# Note: we have [NOISE], [VOCALIZED-NOISE], [LAUGHTER], [SILENCE].
# removing [SILENCE], and the <B_ASIDE> and <E_ASIDE> markers that mark
# speech to someone; we will give phones to the other three (NSN, SPN, LAU).
# There will also be a silence phone, SIL.
# **NOTE: modified the pattern matches to make them case insensitive
cat $dir/transcripts1.txt \
| perl -ane 's:\s\[SILENCE\](\s|$):$1:gi;
s/<B_ASIDE>//gi;
s/<E_ASIDE>//gi;
print;' \
| awk '{if(NF > 1) { print; } } ' > $dir/transcripts2.txt
# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
# case insensitive
local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text
# format acronyms in text
python3 local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \
-M data/local/dict_nosp/acronyms.map
mv $dir/text_map $dir/text
# (1c) Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
awk '{
segment=$1;
split(segment,S,"[_-]");
side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100
}' < $dir/text > $dir/segments
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
> $dir/sph.scp
# side A - channel 1, side B - channel 2
bash tools/sph2wav.sh --nj 16 $dir/sph.scp $dir/segments $dir/wav.scp
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
# to the file name sw02001 and the A, e.g.
# sw02001-A sw02001 A
# In this case it's trivial, but in other corpora the information might
# be less obvious. Later it will be needed for ctm scoring.
awk '{print $1}' $dir/wav_ori.scp \
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1;
awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments > $dir/utt2spk \
|| exit 1;
sort -k 2 $dir/utt2spk | tools/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
# We assume each conversation side is a separate speaker. This is a very
# reasonable assumption for Switchboard. The actual speaker info file is at:
# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary
# Copy stuff into its final locations [this has been moved from the format_data
# script]
mkdir -p data/train
for f in spk2utt utt2spk wav.scp text; do
cp data/local/train/$f data/train/$f || exit 1;
done
if [ $# == 2 ]; then # fix speaker IDs
find $2 -name conv.tab > $dir/conv.tab
local/swbd1_fix_speakerid.pl `cat $dir/conv.tab` data/train
tools/utt2spk_to_spk2utt.pl data/train/utt2spk.new > data/train/spk2utt.new
# patch files
for f in spk2utt utt2spk text segments; do
cp data/train/$f data/train/$f.old || exit 1;
cp data/train/$f.new data/train/$f || exit 1;
done
rm $dir/conv.tab
fi
echo Switchboard-1 data preparation succeeded.
tools/fix_data_dir.sh data/train
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Author: Peng Qi (pengqi@cs.stanford.edu)
# This script maps Switchboard conversation-side speaker IDs to the true physical
# speakers and fixes the utterance IDs accordingly. It is expected to be run from
# one directory level above this script.
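# The script assumes conv.tab lines are comma-separated, with field 0 holding the
# conversation number and fields 2 and 3 holding the A-side and B-side speaker PINs;
# e.g. (illustrative) conversation 2001 with PINs 1000 and 1001 maps sw02001-A to
# speaker sw1000 and sw02001-B to speaker sw1001.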
sub trim {
(my $s = $_[0]) =~ s/^\s+|\s+$//g;
return $s;
}
if ($#ARGV != 1) {
print "Usage: swbd1_fix_speakerid.pl <swbd-conv-tab-file> <data-dir>\n";
print "E.g.: swbd1_fix_speakerid.pl /datasets/SWBD1Transcripts/tables/conv.tab data/train\n";
exit 1;
}
$tab_file = $ARGV[0];
$dir = $ARGV[1];
%conv_to_spk = ();
open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n";
while (my $line = <$conv_tab>) {
chomp $line;
my @fields = split "," , $line;
#$fields[0] = trim($fields[0]);
$fields[2] = trim($fields[2]);
$fields[3] = trim($fields[3]);
$conv_to_spk{'sw0' . $fields[0] . '-A'} = $fields[2];
$conv_to_spk{'sw0' . $fields[0] . '-B'} = $fields[3];
}
close($conv_tab);
# fix utt2spk
%missingconv = ();
open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n";
open(my $utt2spk_new, '>', $dir . '/utt2spk.new');
while (my $line = <$utt2spk>) {
chomp $line;
my @fields = split " " , $line;
my $convid = substr $fields[0], 0, 9;
if (exists $conv_to_spk{ $convid }) {
my $spkid = $conv_to_spk{ $convid };
$spkid = "sw" . $spkid;
my $newuttid = $spkid . '-' . (substr $fields[0], 2);
print $utt2spk_new "$newuttid $spkid\n";
} else {
my $convid = substr $convid, 3, 4;
$missingconv{$convid} = 1;
print $utt2spk_new $fields[0]." ".$fields[1]."\n";
}
}
close($utt2spk);
close($utt2spk_new);
foreach my $conv (keys %missingconv) {
print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n"
}
# fix segments and text
foreach my $file ('segments','text') {
open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n";
open(my $newfile, '>', "$dir/$file.new");
while (my $line = <$oldfile>) {
chomp $line;
my $convid = substr $line, 0, 9;
if (exists $conv_to_spk{$convid}) {
my $spkid = $conv_to_spk{$convid};
print $newfile "sw$spkid-" . (substr $line, 2) . "\n";
} else {
print $newfile "$line\n";
}
}
}
#!/usr/bin/env perl
# Modified from swbd_map_words.pl in Kaldi s5 recipe to make pattern
# matches case-insensitive --Arnab (Jan 2013)
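# Usage examples (as invoked elsewhere in this recipe):
#   local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt  # map fields 2 to end (1-based),
#                                                         # leaving the utterance id alone
#   local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt       # map only field 1 (the word)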
if ($ARGV[0] eq "-f") {
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;
}
if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
if ($1 ne "") {
$field_begin = $1 - 1; # Change to zero-based indexing.
}
if ($2 ne "") {
$field_end = $2 - 1; # Change to zero-based indexing.
}
}
if (!defined $field_begin && !defined $field_end) {
die "Bad argument to -f option: $field_spec";
}
}
while (<>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if ( (!defined $field_begin || $n >= $field_begin)
&& (!defined $field_end || $n <= $field_end)) {
# e.g. [LAUGHTER-STORY] -> STORY;
$a =~ s:(|\-)^\[LAUGHTER-(.+)\](|\-)$:$1$2$3:i;
# $1 and $3 relate to preserving trailing "-"
$a =~ s:^\[(.+)/.+\](|\-)$:$1$2:; # e.g. [IT'N/ISN'T] -> IT'N ... note,
# 1st part may include partial-word stuff, which we process further below,
# e.g. [LEM[GUINI]-/LINGUINI]
# the (|\_) at the end is to accept and preserve trailing -'s.
$a =~ s:^(|\-)\[[^][]+\](.+)$:-$2:; # e.g. -[AN]Y , note \047 is quote;
# let the leading - be optional on input, as sometimes omitted.
$a =~ s:^(.+)\[[^][]+\](|\-)$:$1-:; # e.g. AB[SOLUTE]- -> AB-;
# let the trailing - be optional on input, as sometimes omitted.
$a =~ s:([^][]+)\[.+\]$:$1:; # e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX-
# which is a mistake in the input.
$a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM
$a =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT-
$a =~ s:_\d$::; # e.g. THEM_1 -> THEM
}
$A[$n] = $a;
}
print join(" ", @A) . "\n";
}
#!/usr/bin/env bash
# Formatting the Mississippi State dictionary for use in Edinburgh. Differs
# from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)
# To be run from one directory above this script.
. ./path.sh
#check existing directories
[ $# != 0 ] && echo "Usage: local/swbd1_prepare_dict.sh" && exit 1;
srcdir=data/local/train # This is where swbd1_data_download.sh put the transcriptions.
dir=data/local/dict_nosp
mkdir -p $dir
srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
# assume swbd1_data_download.sh was run already.
[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1;
cp $srcdict $dir/lexicon0.txt || exit 1;
patch <local/dict.patch $dir/lexicon0.txt || exit 1;
#(2a) Dictionary preparation:
# Pre-processing (remove comments)
grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort > $dir/lexicon1.txt || exit 1;
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
grep -v sil > $dir/nonsilence_phones.txt || exit 1;
( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt
echo sil > $dir/optional_silence.txt
# No "extra questions" in the input to this setup, as we don't
# have stress or tone.
echo -n >$dir/extra_questions.txt
cp local/MSU_single_letter.txt $dir/
# Add to the lexicon the silences, noises etc.
# Add single letter lexicon
# The original swbd lexicon does not have precise single-letter entries,
# e.g. it has no entry for W
( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \
echo '[laughter] lau'; echo '<unk> spn' ) \
| cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1;
# Map the words in the lexicon. That is-- for each word in the lexicon, we map it
# to a new written form. The transformations we do are:
# remove laughter markings, e.g.
# [LAUGHTER-STORY] -> STORY
# Remove partial-words, e.g.
# -[40]1K W AH N K EY
# becomes -1K
# and
# -[AN]Y IY
# becomes
# -Y
# -[A]B[OUT]- B
# becomes
# -B-
# Also, curly braces, which appear to be used for "nonstandard"
# words or non-words, are removed, e.g.
# {WOLMANIZED} W OW L M AX N AY Z D
# -> WOLMANIZED
# Also, mispronounced words, e.g.
# [YEAM/YEAH] Y AE M
# are changed to just e.g. YEAM, i.e. the orthography
# of the mispronounced version.
# Note-- this is only really to be used in training. The main practical
# reason is to avoid having tons of disambiguation symbols, which
# we otherwise would get because there are many partial words with
# the same phone sequences (most problematic: S).
# Also, map
# THEM_1 EH M -> THEM
# so that multiple pronunciations just have alternate entries
# in the lexicon.
local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \
> $dir/lexicon3.txt || exit 1;
python3 local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \
-L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map
cat $dir/acronyms_raw.map | sort -u > $dir/acronyms.map
( echo 'i ay' )| cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u > $dir/lexicon5.txt
pushd $dir >&/dev/null
ln -sf lexicon5.txt lexicon.txt # This is the final lexicon.
popd >&/dev/null
rm $dir/lexiconp.txt 2>/dev/null
echo Prepared input dictionary and phone-sets for Switchboard phase 1.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The number of nodes or machines used for multi-machine training.
# Default is 1 for single-machine/single-node training.
# NFS will be needed if you want to run multi-machine training.
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second sets node_rank 1,
# the third sets node_rank 2, and so on. Default is 0.
node_rank=0
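# Example (illustrative): for two machines with 2 GPUs each, set num_nodes=2 and
# CUDA_VISIBLE_DEVICES="0,1" on both machines, with node_rank=0 on the first machine
# and node_rank=1 on the second; the world_size computed below is then 4.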
nj=16
feat_dir=raw_wav
data_type=shard # raw or shard
num_utts_per_shard=1000
prefetch=100
# bpemode (unigram or bpe)
nbpe=2000
bpemode=bpe
# data directory
swbd1_dir=/home/backup_nfs2/hlyu/swbd/LDC97S62
eval2000_dir="/home/backup_nfs2/hlyu/swbd/LDC2002S09/hub5e_00 /home/backup_nfs2/hlyu/swbd/LDC2002T43"
train_set=train_nodup
train_config=conf/train_conformer.yaml
cmvn=true
dir=exp/conformer
checkpoint=
# using average_checkpoint will usually give a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=10
decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
. tools/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/swbd1_data_download.sh ${swbd1_dir}
local/swbd1_prepare_dict.sh
local/swbd1_data_prep.sh ${swbd1_dir}
local/eval2000_data_prep.sh ${eval2000_dir}
# process the train set by
# 1) converting lowercase text to uppercase
# 2) removing the '._' and '.' acronym markers and mapping THEM_1 to THEM
# 3) splitting off a 4000-utterance dev set and keeping the rest for training
# 4) removing duplicated utterances
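# For illustration of steps 1) and 2), a train text line like (hypothetical)
#   sw02001-A_000098-001156 i._b._m. is great
# becomes
#   sw02001-A_000098-001156 I B M IS GREAT
# and any leftover THEM_1 tokens are mapped back to THEM.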
cp data/train/text data/train/text.org
paste -d" " <(cut -f 1 -d" " data/train/text.org) \
<(cut -f 2- -d" " data/train/text.org | tr "[:lower:]" "[:upper:]") > data/train/text
sed -i 's/\._/ /g; s/\.//g; s/THEM_1/THEM/g' data/train/text
tools/subset_data_dir.sh --first data/train 4000 data/train_dev # 5hr 6min
n=$(($(wc -l < data/train/text) - 4000))
tools/subset_data_dir.sh --last data/train ${n} data/train_nodev
tools/data/remove_dup_utts.sh 300 data/train_nodev data/train_nodup
# process the eval2000 set by
# 1) removing hesitation tags such as (%AH) (%HESITATION) (%UH)
# 2) removing <B_ASIDE> <E_ASIDE>
# 3) removing "(" and ")"
# 4) dropping utterances whose text is empty
cp data/eval2000/text data/eval2000/text.org
paste -d "" \
<(cut -f 1 -d" " data/eval2000/text.org) \
<(awk '{$1=""; print toupper($0)}' data/eval2000/text.org \
| perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' \
| sed -e "s/(//g" -e "s/)//g") \
| sed -e 's/\s\+/ /g' > data/eval2000/text.org2
awk -F ' ' '{if(length($2) != 0) print $0}' data/eval2000/text.org2 > data/eval2000/text
tools/fix_data_dir.sh data/eval2000
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# For the wav feature, just copy the data; fbank extraction is done on the fly during training
mkdir -p ${feat_dir}
for x in ${train_set} train_dev eval2000; do
cp -r data/${x} ${feat_dir}
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config ${train_config} \
--in_scp data/${train_set}/wav.scp \
--out_cmvn ${feat_dir}/${train_set}/global_cmvn
fi
dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_char/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
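# After stage 2 the unit dictionary looks like (illustrative entries):
#   <blank> 0
#   <unk> 1
#   ▁THE 2
#   ...            (one line per BPE piece, numbered from 2)
#   <sos/eos> N    (N = number of lines above it)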
# the BPE-related code and scripts below are borrowed from ESPnet.
cut -f 2- -d" " data/${train_set}/text > data/lang_char/input.txt
tools/spm_train --input=data/lang_char/input.txt \
--vocab_size=${nbpe} \
--character_coverage=1.0 \
--model_type=${bpemode} \
--model_prefix=${bpemodel} \
--input_sentence_size=100000000 \
--user_defined_symbols="[LAUGHTER],[NOISE],[VOCALIZED-NOISE]"
tools/spm_encode --model=${bpemodel}.model \
--output_format=piece < data/lang_char/input.txt | \
tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat ${dict} | wc -l)
echo "<sos/eos> ${num_token}" >> ${dict} # <eos>
wc -l ${dict}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare data, prepare required format"
for x in ${train_set} train_dev eval2000; do
if [ ${data_type} == "shard" ]; then
tools/make_shard_list.py --num_utts_per_shard ${num_utts_per_shard} \
--num_threads ${nj} ${feat_dir}/${x}/wav.scp ${feat_dir}/${x}/text \
$(realpath ${feat_dir}/${x}/shards) ${feat_dir}/${x}/data.list
else
tools/make_raw_list.py ${feat_dir}/${x}/wav.scp ${feat_dir}/${x}/text \
${feat_dir}/${x}/data.list
fi
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p ${dir}
INIT_FILE=${dir}/ddp_init
# You should remove it manually before starting run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f ${INIT_FILE})
echo "$0: init method is $init_method"
# The number of GPUs running on each node/machine
num_gpus=$(echo ${CUDA_VISIBLE_DEVICES} | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr ${num_gpus} \* ${num_nodes}`
echo "total gpus is: ${world_size}"
cmvn_opts=
${cmvn} && cp ${feat_dir}/${train_set}/global_cmvn ${dir}
${cmvn} && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < ${num_gpus}; ++i)); do
{
gpu_id=$(echo ${CUDA_VISIBLE_DEVICES} | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used to determine whether it is
# the master or a worker.
rank=`expr ${node_rank} \* ${num_gpus} + ${i}`
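# e.g. (illustrative) with 2 GPUs per node, GPU index 1 on node_rank 1 gets rank 1*2+1 = 3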
python wenet/bin/train.py --gpu ${gpu_id} \
--config ${train_config} \
--data_type ${data_type} \
--symbol_table ${dict} \
--prefetch ${prefetch} \
--bpe_model ${bpemodel}.model \
--train_data ${feat_dir}/${train_set}/data.list \
--cv_data ${feat_dir}/train_dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir ${dir} \
--ddp.init_method ${init_method} \
--ddp.world_size ${world_size} \
--ddp.rank ${rank} \
--ddp.dist_backend ${dist_backend} \
--num_workers 4 \
${cmvn_opts} \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test the model; specify the model you want to test with --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=${dir}/avg_${average_num}.pt
echo "do model average and final checkpoint is ${decode_checkpoint}"
python wenet/bin/average_model.py \
--dst_model ${decode_checkpoint} \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
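# Note: ctc_weight balances the CTC and attention scores during rescoring (0.5 weights
# them equally); reverse_weight only matters for a bidirectional decoder, so it is left
# at 0.0 here because the decoder in the config is a standard left-to-right transformer.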
for mode in ${decode_modes}; do
{
test_dir=${dir}/test_${mode}
mkdir -p ${test_dir}
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data $feat_dir/eval2000/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--bpe_model $bpemodel.model \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
sed -i.bak -r 's/<blank> //g' ${test_dir}/text
mv ${test_dir}/text ${test_dir}/text.bak2
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< ${test_dir}/text.bak2 | sed -e "s/▁/ /g" > ${test_dir}/text
python tools/compute-wer.py --char=1 --v=1 \
$feat_dir/eval2000/text $test_dir/text > $test_dir/wer
}
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
../../../tools/
../../../wenet/
# Performance Record
## Conformer Result
* Feature info: using fbank feature, dither, cmvn, without speed perturb (speed perturb with segments is not supported yet)
* Training info: lr 0.001, batch size 20, 8 gpu, acc_grad 1, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 10
| decoding mode | Dev WER | Test WER |
|---------------------|---------|----------|
| attention rescoring | 9.54% | 8.66% |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 31
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 2000
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 20
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
#!/usr/bin/env bash
# Copyright 2014 Nickolay V. Shmyrev
# 2014 Brno University of Technology (Author: Karel Vesely)
# 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
mkdir -p db
cd db ### Note: the rest of this script is executed from the directory 'db'.
# TED-LIUM database:
if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then
if [ ! -e TEDLIUM_release-3 ]; then
ln -sf /export/corpora5/TEDLIUM_release-3
fi
echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3"
else
if [ ! -e TEDLIUM_release-3 ]; then
echo "$0: downloading TEDLIUM_release-3 data (it won't re-download if it was already downloaded.)"
# the following command won't re-get it if it's already there
# because of the --continue switch.
wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1
tar xf "TEDLIUM_release-3.tgz"
else
echo "$0: not downloading or un-tarring TEDLIUM_release-3 because it already exists."
fi
fi
num_sph=$(find -L TEDLIUM_release-3/legacy -name '*.sph' | wc -l)
# We mainly use the TED-LIUM 3 "legacy" distribution, whose dev and test sets are the same as in TED-LIUM 2 (and TED-LIUM 1).
# It contains 2351 sph files for training and 19 sph files for dev/test (total 2370).
# Because the "legacy" contains symbolic links to "data", we use `find -L`.
if [ "$num_sph" != 2370 ]; then
echo "$0: expected to find 2370 .sph files in the directory db/TEDLIUM_release-3/legacy, found $num_sph"
exit 1
fi
exit 0
#!/usr/bin/env python3
#
# Copyright 2014 Nickolay V. Shmyrev
# 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
import sys
# This script joins together pairs of split-up words like "you 're" -> "you're".
# The TEDLIUM transcripts are normalized in a way that's not traditional for
# speech recognition.
prev_line = ""
for line in sys.stdin:
if line == prev_line:
continue
items = line.split()
new_items = []
i = 0
while i < len(items):
if i < len(items) - 1 and items[i + 1][0] == "'":
new_items.append(items[i] + items[i + 1])
i = i + 1
else:
new_items.append(items[i])
i = i + 1
print(" ".join(new_items))
prev_line = line