Commit 764b3a75 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
#!/usr/bin/env bash
# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
# feats.scp
# wav.scp
# vad.scp
# spk2utt
# utt2spk
# text
#
# It copies to another directory, possibly adding a specified prefix or a suffix
# to the utterance and/or speaker names. Note, the recording-ids stay the same.
#
# begin configuration section
spk_prefix=
utt_prefix=
spk_suffix=
utt_suffix=
validate_opts= # should rarely be needed.
# end configuration section
. local/parse_options.sh
if [ $# != 2 ]; then
echo "Usage: "
echo " $0 [options] <srcdir> <destdir>"
echo "e.g.:"
echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
echo "Options"
echo " --spk-prefix=<prefix> # Prefix for speaker ids, default empty"
echo " --utt-prefix=<prefix> # Prefix for utterance ids, default empty"
echo " --spk-suffix=<suffix> # Suffix for speaker ids, default empty"
echo " --utt-suffix=<suffix> # Suffix for utterance ids, default empty"
exit 1;
fi
# Byte-wise, locale-independent sorting/comparison throughout.
export LC_ALL=C
srcdir=$1
destdir=$2
if [ ! -f $srcdir/utt2spk ]; then
echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
exit 1;
fi
if [ "$destdir" == "$srcdir" ]; then
echo "$0: this script requires <srcdir> and <destdir> to be different."
exit 1
fi
set -e;
mkdir -p $destdir
# Build old-id -> new-id maps, one "<old> <prefix><old><suffix>" pair per line,
# for utterances (utt_map) and speakers (spk_map).
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map
# utt2uniq maps the (renamed) utterance-ids back to the original ids; create it
# whenever ids are changed, or re-map an existing one.
if [ ! -f $srcdir/utt2uniq ]; then
if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
fi
else
cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
fi
cat $srcdir/utt2spk | local/apply_map.pl -f 1 $destdir/utt_map | \
local/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
local/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
if [ -f $srcdir/feats.scp ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
fi
if [ -f $srcdir/vad.scp ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
fi
if [ -f $srcdir/segments ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
# recording-ids are unchanged, so wav.scp can be copied verbatim.
cp $srcdir/wav.scp $destdir
else # no segments->wav indexed by utt.
if [ -f $srcdir/wav.scp ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp
fi
fi
if [ -f $srcdir/reco2file_and_channel ]; then
cp $srcdir/reco2file_and_channel $destdir/
fi
if [ -f $srcdir/text ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
fi
if [ -f $srcdir/text.tc ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.tc >$destdir/text.tc
fi
if [ -f $srcdir/text.lc ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.lc >$destdir/text.lc
fi
if [ -f $srcdir/text.lc.rm ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.lc.rm >$destdir/text.lc.rm
fi
if [ -f $srcdir/utt2dur ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur
fi
if [ -f $srcdir/reco2dur ]; then
if [ -f $srcdir/segments ]; then
# with segments, reco2dur is indexed by (unchanged) recording-id.
cp $srcdir/reco2dur $destdir/reco2dur
else
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur
fi
fi
if [ -f $srcdir/spk2gender ]; then
local/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
fi
if [ -f $srcdir/cmvn.scp ]; then
local/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp
fi
for f in stm glm ctm; do
if [ -f $srcdir/$f ]; then
cp $srcdir/$f $destdir
fi
done
rm $destdir/spk_map $destdir/utt_map
echo "$0: copied data from $srcdir to $destdir"
# Move any stale files present in destdir but absent from srcdir out of the
# way, so the validation below only sees the freshly copied data.
for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do
if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then
echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to"
echo " ... $destdir/.backup/$f"
mkdir -p $destdir/.backup
mv $destdir/$f $destdir/.backup/
fi
done
[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats"
[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text"
local/validate_data_dir.sh $validate_opts $destdir
#!/bin/bash
# Download one part of the AISHELL-4 corpus from an OpenSLR mirror and
# extract it under <data-base>. A stamp file <data-base>/<part>/.complete
# marks a finished extraction, so re-running is a no-op.
if [ $# -ne 3 ]; then
  echo "Usage: $0 <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /home/data/aishell4 https://www.openslr.org/resources/111 train_L"
  echo "<corpus-part> can be one of: train_L, train_M, train_S, test."
  exit 1;  # bugfix: was missing, so the script ran on with bad arguments
fi
data=$1
url=$2
part=$3
if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1;
fi
part_ok=false
list="train_L train_M train_S test"
for x in $list; do
  if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi
if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi
if [ -f $data/$part/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi
# Any pre-existing archive may be a partial download; remove it and re-fetch.
if [ -f $data/$part.tar.gz ]; then
  echo "$0: removing existing file $data/$part.tar.gz"
  rm $data/$part.tar.gz
fi
if ! command -v wget >/dev/null; then
  echo "$0: wget is not installed."
  exit 1;
fi
full_url=$url/$part.tar.gz
echo "$0: downloading data from $full_url. This may take some time, please be patient."
# Download into $data with -P instead of cd-ing there; the old version
# cd'ed twice, which broke all later paths when <data-base> was relative.
if ! wget --no-check-certificate -P $data $full_url; then
  echo "$0: error executing wget $full_url"
  exit 1;
fi
# Extract inside $data via -C, again without changing directory.
# (Messages now consistently say .tar.gz; they used to say .tgz.)
if ! tar -xvzf $data/$part.tar.gz -C $data; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1;
fi
touch $data/$part/.complete
echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
# Option parsing: loop until a pass consumes nothing, so --exclude and
# -f <n> may appear in either order.
$exclude = 0;
$field = 1;
$shifted = 0;
do {
$shifted=0;
if ($ARGV[0] eq "--exclude") {
$exclude = 1;
shift @ARGV;
$shifted=1;
}
if ($ARGV[0] eq "-f") {
$field = $ARGV[1];
shift @ARGV; shift @ARGV;
$shifted=1
}
} while ($shifted);
if(@ARGV < 1 || @ARGV > 2) {
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
"only the lines that were *not* in id_list.\n" .
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
"-f option, add 1 to the argument.\n" .
"See also: utils/filter_scp.pl .\n";
}
# Read the id-list into %seen; only the first field of each line matters.
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
while(<>) {
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
# $1 is what we filter on.
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
print $_;
}
}
} else {
# General case: filter on the $field'th whitespace-separated field.
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
print $_;
}
}
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
#!/bin/bash
# Prepare Kaldi-style data directories for AISHELL-4: builds wav.scp, text,
# utt2spk, spk2utt and segments for the combined training set
# (train_L + train_M + train_S) and for the test set, then copies them to
# data/aishell4_{train,test} with an "Aishell4-" id prefix.
. ./path.sh || exit 1;

if [ $# != 1 ]; then
  echo "Usage: $0 <audio-path>"
  echo " $0 /home/data/aishell4"
  exit 1;
fi

aishell4_source_dir=$1
train_dir=data/local/aishell4_train
test_dir=data/local/aishell4_test

mkdir -p $train_dir
mkdir -p $test_dir

# data directory check
# bugfix: the old check tested the undefined variables $aishell_audio_dir and
# $aishell_text, so it could never fire; test the actual argument instead.
if [ ! -d $aishell4_source_dir ]; then
  echo "Error: $0 requires a directory argument"
  exit 1;
fi

# For every corpus part, (re)build flat lists of wav files and of TextGrid
# annotation files.
for room_name in "train_L" "train_M" "train_S" "test"; do
  if [ -f ${aishell4_source_dir}/$room_name/wav_list.txt ];then
    rm ${aishell4_source_dir}/$room_name/wav_list.txt
  fi
  FILES="$PWD/${aishell4_source_dir}/$room_name/wav/*"
  for f in $FILES; do
    echo "$f" >> ${aishell4_source_dir}/$room_name/wav_list.txt
  done
  if [ -f ${aishell4_source_dir}/$room_name/TextGrid_list.txt ];then
    rm ${aishell4_source_dir}/$room_name/TextGrid_list.txt
  fi
  FILES="$PWD/${aishell4_source_dir}/$room_name/TextGrid/*.TextGrid"
  for f in $FILES; do
    echo "$f" >> ${aishell4_source_dir}/$room_name/TextGrid_list.txt
  done
done

# Merge the three training parts into a single pair of lists.
# (robustness fix: truncate first, so re-running does not duplicate entries)
mkdir -p ${aishell4_source_dir}/full_train
rm -f ${aishell4_source_dir}/full_train/textgrid.flist ${aishell4_source_dir}/full_train/wav.flist
for r in train_L train_M train_S ; do
  cat ${aishell4_source_dir}/$r/TextGrid_list.txt >> ${aishell4_source_dir}/full_train/textgrid.flist
  cat ${aishell4_source_dir}/$r/wav_list.txt >> ${aishell4_source_dir}/full_train/wav.flist
done
wav_list_aishell4=${aishell4_source_dir}/full_train/wav.flist
text_grid_aishell4=${aishell4_source_dir}/full_train/textgrid.flist

# bugfix: the processing below reads wav.flist/textgrid.flist from $train_dir
# and $test_dir, but those files were never placed there.
cp $wav_list_aishell4 $train_dir/wav.flist
cp $text_grid_aishell4 $train_dir/textgrid.flist
cp ${aishell4_source_dir}/test/wav_list.txt $test_dir/wav.flist
cp ${aishell4_source_dir}/test/TextGrid_list.txt $test_dir/textgrid.flist

# process train set: utt-ids are the wav basenames without the .wav suffix.
sed -e 's/\.wav//' $train_dir/wav.flist | awk -F '/' '{print $NF}' > $train_dir/utt.list
paste -d' ' $train_dir/utt.list $train_dir/wav.flist | sort -u > $train_dir/wav.scp
python local/aishell4_process_textgrid.py --path $train_dir
cat $train_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $train_dir/text
# keep only utterances that survived text normalization
local/filter_scp.pl -f 1 $train_dir/text $train_dir/utt2spk_all | sort -u > $train_dir/utt2spk
local/utt2spk_to_spk2utt.pl $train_dir/utt2spk > $train_dir/spk2utt
local/filter_scp.pl -f 1 $train_dir/text $train_dir/segments_all | sort -u > $train_dir/segments

# process test set
sed -e 's/\.wav//' $test_dir/wav.flist | awk -F '/' '{print $NF}' > $test_dir/utt.list
paste -d' ' $test_dir/utt.list $test_dir/wav.flist |sort -u > $test_dir/wav.scp
python local/aishell4_process_textgrid.py --path $test_dir
cat $test_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $test_dir/text
local/filter_scp.pl -f 1 $test_dir/text $test_dir/utt2spk_all | sort -u > $test_dir/utt2spk
local/utt2spk_to_spk2utt.pl $test_dir/utt2spk > $test_dir/spk2utt
local/filter_scp.pl -f 1 $test_dir/text $test_dir/segments_all | sort -u > $test_dir/segments

# Copy to the final data dirs, prefixing utterance and speaker ids.
local/copy_data_dir.sh --utt-prefix Aishell4- --spk-prefix Aishell4- \
  $train_dir data/aishell4_train
local/copy_data_dir.sh --utt-prefix Aishell4- --spk-prefix Aishell4- \
  $test_dir data/aishell4_test

echo "$0: AISHELL4 data preparation succeeded"
exit 0;
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Converts a spk2utt file ("<spk> <utt1> <utt2> ...") read from stdin or a
# file argument into utt2spk format ("<utt> <spk>", one line per utterance)
# on stdout.
while (my $line = <>) {
  my @fields = split(" ", $line);
  @fields > 1 || die "Invalid line in spk2utt file: $line";
  my $spk = shift @fields;
  print "$_ $spk\n" foreach @fields;
}
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright Chao Weng
# Reads transcript lines from stdin and drops any line that consists of
# exactly one whitespace-separated field (an utterance-id with no text);
# every other line — including empty lines — is passed through unchanged.
while (my $line = <STDIN>) {
  my @fields = split(" ", $line);
  next if @fields == 1;
  print $line;
}
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright Chao Weng
# Transcript normalization: reads "<utt-id> <tok> <tok> ..." lines on stdin;
# for each token after the utt-id, strips annotation tags and punctuation and
# upper-cases tokens containing Latin letters, then prints the cleaned line.
# (Substitution order is preserved from the original script; the per-
# substitution "if" guards were dropped since s///g on a non-match is a no-op,
# and the lower-case a/b/c/k/t branches were dead code because uc() has
# already upper-cased any token that contains a letter.)
while (<STDIN>) {
  @A = split(" ", $_);
  print "$A[0] ";
  for ($n = 1; $n < @A; $n++) {
    $tmp = $A[$n];
    # strip annotation tags
    $tmp =~ s:<sil>::g;
    $tmp =~ s:<%>::g;
    $tmp =~ s:<->::g;
    $tmp =~ s:<\$>::g;
    $tmp =~ s:<#>::g;
    $tmp =~ s:<_>::g;
    $tmp =~ s:<space>::g;
    # strip ASCII punctuation that may be glued onto a token
    $tmp =~ s:`::g;
    $tmp =~ s:&::g;
    $tmp =~ s:,::g;
    # upper-case tokens containing Latin letters (guard kept deliberately:
    # uc() is only applied to tokens that actually contain ASCII letters)
    if ($tmp =~ /[a-zA-Z]/) {$tmp=uc($tmp);}
    # strip CJK punctuation and remaining symbols
    $tmp =~ s:丶::g;
    $tmp =~ s:。::g;
    $tmp =~ s:、::g;
    $tmp =~ s:?::g;
    $tmp =~ s:·::g;
    $tmp =~ s:\*::g;
    $tmp =~ s:!::g;
    $tmp =~ s:\$::g;
    $tmp =~ s:\+::g;
    $tmp =~ s:-::g;
    $tmp =~ s:\\::g;
    $tmp =~ s:\?::g;
    $tmp =~ s:¥::g;
    $tmp =~ s:%::g;
    $tmp =~ s:\.::g;
    $tmp =~ s:<::g;
    print "$tmp ";
  }
  print "\n";
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Converts an utt2spk file ("<utt> <spk>" per line, from stdin or a file
# argument) into spk2utt format ("<spk> <utt1> <utt2> ..."). Speakers are
# emitted in first-seen order; utterances keep their input order.
if ( @ARGV > 1 ) {
  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}
my (%utts_of, @spk_order);
while (my $line = <>) {
  my @fields = split(" ", $line);
  @fields == 2 || die "Invalid line in utt2spk file: $line";
  my ($utt, $spk) = @fields;
  # record first-seen order before autovivifying the hash entry
  push @spk_order, $spk unless exists $utts_of{$spk};
  push @{$utts_of{$spk}}, $utt;
}
for my $spk (@spk_order) {
  print "$spk " . join(' ', @{$utts_of{$spk}}) . "\n";
}
#!/usr/bin/env bash
# Save the untouched argument list; it is re-used verbatim when delegating
# to image/validate_data_dir.sh below.
cmd="$@"
no_feats=false
no_wav=false
no_text=false
no_spk_sort=false
# At most four --no-* flags can be given, in any order; scan four times.
for x in `seq 4`; do
if [ "$1" == "--no-feats" ]; then
no_feats=true
shift;
fi
if [ "$1" == "--no-text" ]; then
no_text=true
shift;
fi
if [ "$1" == "--no-wav" ]; then
no_wav=true
shift;
fi
if [ "$1" == "--no-spk-sort" ]; then
no_spk_sort=true
shift;
fi
done
if [ $# -ne 1 ]; then
echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] <data-dir>"
echo "The --no-xxx options mean that the script does not require "
echo "xxx.scp to be present, but it will check it if it is present."
echo "--no-spk-sort means that the script does not require the utt2spk to be "
echo "sorted by the speaker-id in addition to being sorted by utterance-id."
echo "By default, utt2spk is expected to be sorted by both, which can be "
echo "achieved by making the speaker-id prefixes of the utterance-ids"
echo "e.g.: $0 data/train"
exit 1;
fi
data=$1
if [ ! -d $data ]; then
echo "$0: no such directory $data"
exit 1;
fi
# Image (OCR-style) data directories are validated by a dedicated script.
if [ -f $data/images.scp ]; then
cmd=${cmd/--no-wav/} # remove --no-wav if supplied
image/validate_data_dir.sh $cmd
exit $?
fi
# Both mapping files must exist and be non-empty.
for f in spk2utt utt2spk; do
if [ ! -f $data/$f ]; then
echo "$0: no such file $f"
exit 1;
fi
if [ ! -s $data/$f ]; then
echo "$0: empty file $f"
exit 1;
fi
done
! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
echo "$0: $data/utt2spk has wrong format." && exit;
ns=$(wc -l < $data/spk2utt)
if [ "$ns" == 1 ]; then
echo "$0: WARNING: you have only one speaker. This probably a bad idea."
echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
echo " for more information."
fi
# Temporary work area; removed on any exit via the trap.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
# Byte-wise, locale-independent sorting/comparison throughout.
export LC_ALL=C
function check_sorted_and_uniq {
! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \
echo "$0: file $1 is not in sorted order or has duplicates" && exit 1;
}
# Print an abbreviated unified diff of $1 vs $2 (first six and last six
# lines, separated by "..."), followed by a line-count summary of both files.
function partial_diff {
local file_a=$1 file_b=$2
diff -U1 "$file_a" "$file_b" | { head -n 6; echo "..."; tail -n 6; }
local len_a=$(wc -l < "$file_a")
local len_b=$(wc -l < "$file_b")
echo "[Lengths are $file_a=$len_a versus $file_b=$len_b]"
}
# ---- Cross-checks between utt2spk, spk2utt, text, wav.scp, segments, ----
# ---- feats and the various optional per-utterance/per-speaker files. ----
check_sorted_and_uniq $data/utt2spk
if ! $no_spk_sort; then
! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \
echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \
echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
fi
check_sorted_and_uniq $data/spk2utt
# spk2utt must be exactly the inverse mapping of utt2spk.
! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \
<(local/spk2utt_to_utt2spk.pl $data/spk2utt) && \
echo "$0: spk2utt and utt2spk do not seem to match" && exit 1;
# The utterance-id list from utt2spk is the reference all other files are
# compared against.
cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts
if [ ! -f $data/text ] && ! $no_text; then
echo "$0: no such file $data/text (if this is by design, specify --no-text)"
exit 1;
fi
num_utts=`cat $tmpdir/utts | wc -l`
if [ -f $data/text ]; then
local/validate_text.pl $data/text || exit 1;
check_sorted_and_uniq $data/text
text_len=`cat $data/text | wc -l`
# symbols reserved by the lang-directory machinery; must not appear in text.
illegal_sym_list="<s> </s> #0"
for x in $illegal_sym_list; do
if grep -w "$x" $data/text > /dev/null; then
echo "$0: Error: in $data, text contains illegal symbol $x"
exit 1;
fi
done
awk '{print $1}' < $data/text > $tmpdir/utts.txt
if ! cmp -s $tmpdir/utts{,.txt}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.txt}
exit 1;
fi
fi
if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then
echo "$0: in directory $data, segments file exists but no wav.scp"
exit 1;
fi
if [ ! -f $data/wav.scp ] && ! $no_wav; then
echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
exit 1;
fi
if [ -f $data/wav.scp ]; then
check_sorted_and_uniq $data/wav.scp
if grep -E -q '^\S+\s+~' $data/wav.scp; then
# note: it's not a good idea to have any kind of tilde in wav.scp, even if
# part of a command, as it would cause compatibility problems if run by
# other users, but this used to be not checked for so we let it slide unless
# it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which
# would definitely cause problems as the fopen system call does not do
# tilde expansion.
echo "$0: Please do not use tilde (~) in your wav.scp."
exit 1;
fi
if [ -f $data/segments ]; then
check_sorted_and_uniq $data/segments
# We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
# Each segments line must be "<utt> <reco> <start> <end>" with end > start.
! cat $data/segments | \
awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \
echo "$0: badly formatted segments file" && exit 1;
segments_len=`cat $data/segments | wc -l`
if [ -f $data/text ]; then
! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \
echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \
echo "$0: Lengths are $segments_len vs $num_utts" && \
exit 1
fi
# recording-ids referenced by segments must match wav.scp exactly.
cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings
awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav
if ! cmp -s $tmpdir/recordings{,.wav}; then
echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.wav}
exit 1;
fi
if [ -f $data/reco2file_and_channel ]; then
# this file is needed only for ctm scoring; it's indexed by recording-id.
check_sorted_and_uniq $data/reco2file_and_channel
! cat $data/reco2file_and_channel | \
awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
if ( NF == 3 && $3 == "1" ) {
warning_issued = 1;
} else {
print "Bad line ", $0; exit 1;
}
}
}
END {
if (warning_issued == 1) {
print "The channel should be marked as A or B, not 1! You should change it ASAP! "
}
}' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc
if ! cmp -s $tmpdir/recordings{,.r2fc}; then
echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.r2fc}
exit 1;
fi
fi
else
# No segments file -> assume wav.scp indexed by utterance.
cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav
if ! cmp -s $tmpdir/utts{,.wav}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.wav}
exit 1;
fi
if [ -f $data/reco2file_and_channel ]; then
# this file is needed only for ctm scoring; it's indexed by recording-id.
check_sorted_and_uniq $data/reco2file_and_channel
! cat $data/reco2file_and_channel | \
awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
if ( NF == 3 && $3 == "1" ) {
warning_issued = 1;
} else {
print "Bad line ", $0; exit 1;
}
}
}
END {
if (warning_issued == 1) {
print "The channel should be marked as A or B, not 1! You should change it ASAP! "
}
}' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc
if ! cmp -s $tmpdir/utts{,.r2fc}; then
echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.r2fc}
exit 1;
fi
fi
fi
fi
# feats.scp: per-utterance feature archive pointers.
if [ ! -f $data/feats.scp ] && ! $no_feats; then
echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
exit 1;
fi
if [ -f $data/feats.scp ]; then
check_sorted_and_uniq $data/feats.scp
cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats
if ! cmp -s $tmpdir/utts{,.feats}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.feats}
exit 1;
fi
fi
# cmvn.scp is indexed by speaker-id, not utterance-id.
if [ -f $data/cmvn.scp ]; then
check_sorted_and_uniq $data/cmvn.scp
cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.cmvn}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.cmvn}
exit 1;
fi
fi
if [ -f $data/spk2gender ]; then
check_sorted_and_uniq $data/spk2gender
! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \
echo "$0: Mal-formed spk2gender file" && exit 1;
cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.spk2gender}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.spk2gender}
exit 1;
fi
fi
# VTLN warp factors, per speaker; sanity range (0.5, 1.5).
if [ -f $data/spk2warp ]; then
check_sorted_and_uniq $data/spk2warp
! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
echo "$0: Mal-formed spk2warp file" && exit 1;
cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.spk2warp}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.spk2warp}
exit 1;
fi
fi
# VTLN warp factors, per utterance.
if [ -f $data/utt2warp ]; then
check_sorted_and_uniq $data/utt2warp
! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
echo "$0: Mal-formed utt2warp file" && exit 1;
cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp
cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
if ! cmp -s $tmpdir/utts{,.utt2warp}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2warp}
exit 1;
fi
fi
# check some optionally-required things
for f in vad.scp utt2lang utt2uniq; do
if [ -f $data/$f ]; then
check_sorted_and_uniq $data/$f
if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
<( awk '{print $1}' $data/$f ); then
echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
exit 1;
fi
fi
done
# utt2dur: per-utterance duration in seconds; must be positive.
if [ -f $data/utt2dur ]; then
check_sorted_and_uniq $data/utt2dur
cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur
if ! cmp -s $tmpdir/utts{,.utt2dur}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2dur}
exit 1;
fi
cat $data/utt2dur | \
awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1
fi
# utt2num_frames: per-utterance frame count; must be a positive integer.
if [ -f $data/utt2num_frames ]; then
check_sorted_and_uniq $data/utt2num_frames
cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames
if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2num_frames}
exit 1
fi
awk <$data/utt2num_frames '{
if (NF != 2 || !($2 > 0) || $2 != int($2)) {
print "Bad line utt2num_frames:" NR ":" $0
exit 1 } }' || exit 1
fi
# reco2dur: indexed by recording-id when segments exist, else by utterance-id.
if [ -f $data/reco2dur ]; then
check_sorted_and_uniq $data/reco2dur
cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur
if [ -f $tmpdir/recordings ]; then
if ! cmp -s $tmpdir/recordings{,.reco2dur}; then
echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.reco2dur}
exit 1;
fi
else
if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then
echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/{utts,recordings.reco2dur}
exit 1;
fi
fi
cat $data/reco2dur | \
awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1
fi
echo "$0: Successfully validated data-directory $data"
#!/usr/bin/env perl
#
#===============================================================================
# Copyright 2017 Johns Hopkins University (author: Yenda Trmal <jtrmal@gmail.com>)
# Johns Hopkins University (author: Daniel Povey)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
# validation script for data/<dataset>/text
# to be called (preferably) from utils/validate_data_dir.sh
use strict;
use warnings;
use utf8;
use Fcntl qw< SEEK_SET >;
# this function reads the opened file (supplied as a first
# parameter) into an array of lines. For each
# line, it tests whether it's a valid utf-8 compatible
# line. If all lines are valid utf-8, it returns the lines
# decoded as utf-8, otherwise it assumes the file's encoding
# is one of those 1-byte encodings, such as ISO-8859-x
# or Windows CP-X.
# Please recall we do not really care about
# the actually encoding, we just need to
# make sure the length of the (decoded) string
# is correct (to make the output formatting looking right).
sub get_utf8_or_bytestream {
use Encode qw(decode encode);
my $is_utf_compatible = 1;
my @unicode_lines;
my @raw_lines;
my $raw_text;
my $lineno = 0;
my $file = shift;
while (<$file>) {
$raw_text = $_;
last unless $raw_text;
if ($is_utf_compatible) {
# FB_CROAK makes decode() die on invalid UTF-8; the eval turns that
# into an undef, which flips $is_utf_compatible permanently.
my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ;
$is_utf_compatible = $is_utf_compatible && defined($decoded_text);
push @unicode_lines, $decoded_text;
} else {
#print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n";
;
}
push @raw_lines, $raw_text;
$lineno += 1;
}
# Returns (1, decoded lines) for UTF-8 input, (0, raw byte lines) otherwise.
if (!$is_utf_compatible) {
return (0, @raw_lines);
} else {
return (1, @unicode_lines);
}
}
# check if the given unicode string contain unicode whitespaces
# other than the usual four: TAB, LF, CR and SPACE
sub validate_utf8_whitespaces {
my $unicode_lines = shift;
use feature 'unicode_strings';
for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) {
my $current_line = $unicode_lines->[$i];
if ((substr $current_line, -1) ne "\n"){
print STDERR "$0: The current line (nr. $i) has invalid newline\n";
return 1;
}
my @A = split(" ", $current_line);
my $utt_id = $A[0];
# we replace TAB, LF, CR, and SPACE
# this is to simplify the test
if ($current_line =~ /\x{000d}/) {
print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
return 1;
}
# after masking TAB/LF/SPACE with '.', any remaining \s match is a
# disallowed Unicode whitespace.
$current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
if ($current_line =~/\s/) {
print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
return 1;
}
}
return 0;
}
# checks if the text in the file (supplied as the argument) is utf-8 compatible
# if yes, checks if it contains only allowed whitespaces. If no, then does not
# do anything. The function seeks to the original position in the file after
# reading the text.
sub check_allowed_whitespace {
my $file = shift;
my $filename = shift;
my $pos = tell($file);
(my $is_utf, my @lines) = get_utf8_or_bytestream($file);
seek($file, $pos, SEEK_SET);
if ($is_utf) {
my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
if ($has_invalid_whitespaces) {
print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n";
return 0;
}
}
return 1;
}
# Main: validate the single <text-file> argument.
if(@ARGV != 1) {
die "Usage: validate_text.pl <text-file>\n" .
"e.g.: validate_text.pl data/train/text\n";
}
my $text = shift @ARGV;
# -z is true for an empty file and also for a non-existent one.
if (-z "$text") {
print STDERR "$0: ERROR: file '$text' is empty or does not exist\n";
exit 1;
}
if(!open(FILE, "<$text")) {
print STDERR "$0: ERROR: failed to open $text\n";
exit 1;
}
check_allowed_whitespace(\*FILE, $text) or exit 1;
close(FILE);
# Environment setup: puts the WeNet runtime binaries, Kaldi tools and OpenFst
# binaries on PATH, and the WeNet repo root on PYTHONPATH.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.

# AISHELL-4 conformer recipe: data download/preparation, dictionary
# building, shard packing, DDP training, decoding and TorchScript export.
. ./path.sh || exit 1;

# Use this to control how many gpus you use. It's 1-gpu training if you
# specify just one gpu; otherwise it is multi-gpu training based on DDP in
# pytorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO

stage=0 # start from 0 if you need to start from data preparation
stop_stage=6 # last stage to run (inclusive)

# The num of nodes or machines used for multi-machine training.
# Default 1 for single machine/node.
# NFS will be needed if you want to run multi-machine training.
num_nodes=1
# The rank of each node or machine, range from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0.
node_rank=0

num_utts_per_shard=1000 # utterances packed per training shard (stage 3)
data_url=https://www.openslr.org/resources/111
data_source=/home/work_nfs5_ssd/yhliang/data/aishell4
# modify this to your AISHELL-4 data path
nj=16
dict=data/dict/lang_char.txt # character dictionary produced in stage 2

train_set=aishell4_train
dev_set=aishell4_test
test_sets=aishell4_test

train_config=conf/train_conformer.yaml
cmvn=true # apply the global CMVN stats computed in stage 1
dir=exp/conformer # experiment/output directory
checkpoint= # set to a .pt path to resume training from it

# use average_checkpoint will get better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30 # number of checkpoints to average for decoding
decode_modes="attention_rescoring"

. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  echo "stage -1: Data Download"
  # Fetch and unpack each AISHELL-4 archive in turn.
  for part in train_L train_M train_S test; do
    local/download_and_untar.sh ${data_source} ${data_url} ${part}
  done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # Data preparation: build wav.scp/text/segments etc. from the raw
  # AISHELL-4 corpus directory.
  local/prepare_data.sh ${data_source} || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  # remove the space between the text labels for Mandarin dataset
  for x in ${train_set} ${test_sets}; do
    cp data/${x}/text data/${x}/text.org
    # Keep column 1 (the utt-id); in the transcript (fields 2 onwards):
    # upcase English, join consecutive single letters with "▁", then delete
    # the spaces between Mandarin characters.
    # Fix: use "-f 2-" so the whole transcript is kept.  The previous
    # "-f 2" kept only the second field, silently truncating multi-word
    # transcripts and making the tr/sed/tr pipeline below a no-op.
    paste -d " " <(cut -d " " -f 1 data/${x}/text.org) <(cut -d " " -f 2- data/${x}/text.org \
      | tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") > data/${x}/text
    rm data/${x}/text.org
  done
  # Compute global CMVN statistics over the training waveforms; used for
  # feature normalization during training (when $cmvn is true).
  tools/compute_cmvn_stats.py --num_workers 32 --train_config $train_config \
    --in_scp data/${train_set}/wav.scp \
    --out_cmvn data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  # Make train dict
  echo "Make a dictionary"
  mkdir -p $(dirname $dict)
  echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
  echo "<unk> 1" >> ${dict} # <unk> must be 1
  # Tokenize the training text to characters, deduplicate, drop blank
  # lines, and number the tokens starting at 2 (ids 0 and 1 are reserved
  # for <blank> and <unk> above).
  tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
    | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
  # Read the file directly instead of the useless `cat | wc -l`.
  num_token=$(wc -l < $dict)
  echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  # Prepare wenet required data
  echo "Prepare data, prepare required format"
  for x in $train_set ${test_sets}; do
    # Pack audio + text into shard tar files for efficient training IO;
    # data.list records the resulting shard paths.
    tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
      --num_threads 32 --segments data/$x/segments \
      data/$x/wav.scp data/$x/text $(realpath data/$x/shards) data/$x/data.list
  done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  # Training with DDP: one train.py process per visible GPU.
  mkdir -p $dir
  INIT_FILE=$dir/ddp_init
  # You had better rm it manually before you start run.sh on first node.
  # rm -f $INIT_FILE # delete old one before starting
  init_method=file://$(readlink -f $INIT_FILE)
  echo "$0: init method is $init_method"
  # The number of gpus running on each node/machine.
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  # Use "nccl" if it works, otherwise use "gloo".
  dist_backend="gloo"
  # The total number of processes/gpus, so that the master knows
  # how many workers to wait for.
  # More details about ddp can be found in
  # https://pytorch.org/tutorials/intermediate/dist_tuto.html
  world_size=$((num_gpus * num_nodes))
  echo "total gpus is: $world_size"
  cmvn_opts=
  $cmvn && cp data/${train_set}/global_cmvn $dir
  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
  # train.py will write $train_config to $dir/train.yaml with model input
  # and output dimension, train.yaml will be used for inference or model
  # export later
  for ((i = 0; i < $num_gpus; ++i)); do
  {
    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
    # Rank of each gpu/process used for knowing whether it is
    # the master or a worker.
    rank=$((node_rank * num_gpus + i))
    python wenet/bin/train.py --gpu $gpu_id \
      --config $train_config \
      --data_type shard \
      --symbol_table $dict \
      --train_data data/$train_set/data.list \
      --cv_data data/${dev_set}/data.list \
      ${checkpoint:+--checkpoint $checkpoint} \
      --model_dir $dir \
      --ddp.init_method $init_method \
      --ddp.world_size $world_size \
      --ddp.rank $rank \
      --ddp.dist_backend $dist_backend \
      --num_workers 1 \
      $cmvn_opts
  } & # Fix: run each rank in the background.  All DDP ranks must be alive
      # concurrently or init_process_group blocks forever waiting for
      # world_size peers; the `wait` below joins all of them.
  done
  wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  # Test model, please specify the model you want to test by --checkpoint
  if [ ${average_checkpoint} == true ]; then
    # Average the $average_num saved checkpoints into a single model;
    # this usually decodes better than any individual checkpoint.
    decode_checkpoint=$dir/avg_${average_num}.pt
    echo "do model average and final checkpoint is $decode_checkpoint"
    python wenet/bin/average_model.py \
      --dst_model $decode_checkpoint \
      --src_path $dir \
      --num ${average_num}
  fi
  # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
  # -1 for full chunk
  decoding_chunk_size=
  ctc_weight=0.5
  # Decode every (mode, test set) pair; each per-set job runs in the
  # background and the final `wait` joins them all.
  for mode in ${decode_modes}; do
  {
    for test_set in ${test_sets}; do
    {
      test_dir=$dir/test_${mode}
      mkdir -p $test_dir
      python wenet/bin/recognize.py --gpu $(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f1) \
        --mode $mode \
        --config $dir/train.yaml \
        --data_type shard \
        --test_data data/${test_set}/data.list \
        --checkpoint $decode_checkpoint \
        --beam_size 10 \
        --batch_size 1 \
        --penalty 0.0 \
        --dict $dict \
        --ctc_weight $ctc_weight \
        --result_file $test_dir/text \
        ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
      # Score the hypotheses against the references (character error rate).
      python tools/compute-wer.py --char=1 --v=1 \
        data/${test_set}/text $test_dir/text > $test_dir/wer
    } &
    done
  }
  done
  wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  # Export the best model you want
  # Exports the averaged checkpoint to TorchScript (plus a quantized copy)
  # for use by the C++ runtime.
  python wenet/bin/export_jit.py \
    --config $dir/train.yaml \
    --checkpoint $dir/avg_${average_num}.pt \
    --output_file $dir/final.zip \
    --output_quant_file $dir/final_quant.zip
fi
../../../tools
\ No newline at end of file
../../../wenet
\ No newline at end of file
# Performance Record
## Conformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.0005, batch size 8, 1 gpu, acc_grad 4, 80 epochs
* Decoding info: average_num 10
| decoding mode | dt05_real_1ch | dt05_simu_1ch | et05_real_1ch | et05_simu_1ch |
|:----------------------:|:-------------:|:-------------:|:-------------:|:-------------:|
| ctc_prefix_beam_search | 19.06% | 21.17% | 28.39% | 29.16% |
| attention_rescoring | 17.92% | 20.22% | 27.40% | 28.25% |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
split_with_space: true
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 40
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 8
grad_clip: 10
accum_grad: 4
max_epoch: 80
log_interval: 200
optim: adam
optim_conf:
lr: 0.0005
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 20000
#!/usr/bin/env bash
# wujian@2020
#
# Merge the per-condition chime4 lists into combined train/dev data dirs.
set -eu

echo "$0: Formating chime4 data dir..."

track=isolated_1ch_track
data_dir=data/chime4

mkdir -p $data_dir/{train,dev}

# For each list type, concatenate the source conditions and sort by utt-id.
for f in wav.scp text; do
  cat $data_dir/tr05_simu_noisy/$f $data_dir/tr05_real_noisy/$f \
    $data_dir/tr05_orig_clean/$f $data_dir/train_si200_wsj1_clean/$f \
    | sort -k1 > $data_dir/train/$f
  cat $data_dir/dt05_real_${track}/$f $data_dir/dt05_simu_${track}/$f \
    | sort -k1 > $data_dir/dev/$f
done

echo "$0: Format $data_dir done"
#!/usr/bin/env bash
# wujian@2020
#
# Rewrites $data_dir/wav.scp so that sph2pipe pipe entries are replaced by
# the plain .wav files previously dumped into $dump_dir.
set -eu

# Fix: a usage error must exit non-zero so callers (and `set -e` chains)
# actually notice the failure; this previously exited 0.
[ $# -ne 2 ] && echo "Script format error: $0 <data-dir> <dump-dir>" && exit 1

data_dir=$1
dump_dir=$2

mkdir -p $dump_dir

num_utts=$(wc -l < $data_dir/wav.scp)
echo "Original utterances (.wav + .wv1): $num_utts"

# cat $data_dir/wav.scp | grep "sph2pipe" | \
#   awk -v dir=$dump_dir '{printf("%s -f wav %s %s/%s.wav\n", $2, $5, dir, $1)}' | bash

# Keep the entries that are already plain wav files (no sph2pipe pipe).
# NOTE(review): if every entry used sph2pipe, grep -v would exit 1 and
# `set -e` would abort here — presumably never the case; verify.
grep -v "sph2pipe" $data_dir/wav.scp > $data_dir/raw_wav.scp
# Index the dumped wav files as "<utt-id> <path>", where utt-id is the
# basename without the .wav suffix (sed strips the first ".wav", which is
# in the leading key field).
find $dump_dir -name "*.wav" | awk -F '/' '{printf("%s %s\n", $NF, $0)}' | \
  sed 's:\.wav::' > $data_dir/sph_wav.scp
cat $data_dir/{raw_wav,sph_wav}.scp | sort -k1 > $data_dir/wav.scp

num_utts=$(wc -l < $data_dir/wav.scp)
echo "Wave utterances (.wav): $num_utts"
echo "$0: Generate wav => $dump_dir done"
#!/usr/bin/env bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# Modified from Kaldi's chime4 recipe
#
# Prepares the clean WSJ0 part of the CHiME-4 setup: builds file lists,
# transcripts, wav.scp entries and spk2gender under data/$dataset.
set -eu

dataset=chime4
. ./tools/parse_options.sh || exit 1;

if [ $# -ne 1 ]; then
  printf "\nUSAGE: %s <original WSJ0 corpus-directory>\n\n" `basename $0`
  echo "The argument should be a the top-level WSJ corpus directory."
  echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
  echo "within the top-level corpus directory."
  exit 1;
fi

wsj0=$1
srcdir=$PWD/data/chime4/local
dstdir=$PWD/data/$dataset
local=$PWD/local
utils=$PWD/utils
sph2pipe=sph2pipe
# Build sph2pipe from source if it is not already on PATH; it is needed to
# convert the .wv1 (sphere) audio to wav on the fly via wav.scp pipes.
if [ ! `which sph2pipe` ]; then
  echo "Could not find sph2pipe, install it first..."
  mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
  tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz
  # Absolute path: we are inside exp/ here, and later cd's would break a
  # relative path.
  sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe
  cd ..
fi
# All list/transcript generation below happens inside $srcdir.
mkdir -p $srcdir && cd $srcdir

# This version for SI-84
cat $wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx \
  | $local/cstr_ndx2flist.pl $wsj0 | sort -u > tr05.flist

# Now for the test sets.
# $wsj0/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.

# Nov'92 (330 utts, 5k vocab)
cat $wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
  $local/cstr_ndx2flist.pl $wsj0 | sort > et05.flist

# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find $wsj0/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dt05.flist

# Finding the transcript files:
find -L $wsj0 -iname '*.dot' > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
# adding suffix to utt_id
# 0 for clean condition
for x in tr05 et05 dt05; do
  # flist -> "utt-id sph-path" table, then look up each utt-id's transcript
  # among the .dot files.
  $local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp
  cat ${x}_sph_tmp.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
  cat ${x}_sph_tmp.scp | awk '{printf("%s %s\n", $1, $2);}' > ${x}_sph.scp
  cat ${x}_tmp.trans1 | awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1
done

# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in tr05 et05 dt05; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
    | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in tr05 et05 dt05; do
  # Each wav.scp entry is a command pipe: "utt-id sph2pipe -f wav path |".
  awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp
done
# Fetch the speaker-gender table (expected to have 134 lines) if it is
# missing or truncated; try the LDC site first, then a backup mirror.
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
  rm -f wsj0-train-spkrinfo.txt
  wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
    || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
         wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
  echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
  echo "This is possibly omitted from the training disks; couldn't find it."
  echo "Everything else may have worked; we just may be missing gender info"
  echo "which is only needed for VTLN-related diagnostics anyway."
  exit 1
fi

# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.

# Merge corpus and downloaded speaker info into "speaker gender" pairs:
# lowercase everything, drop ';' comment lines and '--' separator rows.
cat $wsj0/wsj0/doc/spkrinfo.txt \
  ./wsj0-train-spkrinfo.txt | \
  perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
  awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
# return back
cd -

# Publish text + wav.scp for each partition as the "clean" condition dirs.
for x in et05 dt05 tr05; do
  mkdir -p $dstdir/${x}_orig_clean
  cp $srcdir/$x.txt $dstdir/${x}_orig_clean/text || exit 1
  cp $srcdir/${x}_wav.scp $dstdir/${x}_orig_clean/wav.scp || exit 1
done

echo "Data preparation succeeded"
#!/usr/bin/env bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# Prepares the clean WSJ1 SI-200 training data and the Nov'93 dev set under
# data/chime4/{train_si200,test_dev93}_wsj1_clean.
set -eu

if [ $# -ne 1 ]; then
  echo "Arguments should be WSJ1 directory"
  exit 1;
fi

wsj1=$1
dir=$PWD/data/chime4/local
odir=$PWD/data/chime4
mkdir -p $dir
local=$PWD/local
sph2pipe=sph2pipe
# Build sph2pipe from source if it is not on PATH; needed to read the
# .wv1 (sphere) audio files via wav.scp pipes.
if [ ! `which sph2pipe` ]; then
  echo "Could not find sph2pipe, install it first..."
  mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
  tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz
  # Absolute path so the later cd's don't invalidate it.
  sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe
  cd ..
fi
# List/transcript generation happens inside $dir.
cd $dir

# This version for SI-200
cat $wsj1/13-34.1/wsj1/doc/indices/si_tr_s.ndx | \
  $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > train_si200.flist
nl=`cat train_si200.flist | wc -l`
[ "$nl" -eq 30278 ] || echo "Warning: expected 30278 lines in train_si200.flist, got $nl"

# Dev-set for Nov'93 (503 utts)
cat $wsj1/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
  $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > test_dev93.flist

# Finding the transcript files:
for x in $wsj1/??-{?,??}.?; do find -L $x -iname '*.dot'; done > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
for x in train_si200 test_dev93; do
  $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
  cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si200 test_dev93; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si200 test_dev93; do
  # Each wav.scp entry is a command pipe: "utt-id sph2pipe -f wav path |".
  awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp
done

# return back
cd -

# Publish text + wav.scp for each partition as the clean-condition dirs.
for x in train_si200 test_dev93; do
  mkdir -p $odir/${x}_wsj1_clean
  cp $dir/$x.txt $odir/${x}_wsj1_clean/text || exit 1
  cp $dir/${x}_wav.scp $odir/${x}_wsj1_clean/wav.scp || exit 1
done

echo "Data preparation WSJ1 succeeded"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment