Commit 764b3a75 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
# and as command-line argument it takes the names of the WSJ disk locations, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.
use strict;
use warnings;

# Single argument: the root directory under which the indexed files live.
defined $ARGV[0] or die "usage: $0 <wsj-dir> < ndx-file\n";
my $wsj_dir = $ARGV[0];

while (<STDIN>) {
    next if m/^;/;    # ";;"-prefixed lines are comments in .ndx files.
    m/^([0-9_]+):\s*(\S+)$/ or die "Could not parse line $_";
    my $filename = $2;    # path relative to the distributed disk root
    # Some index files omit the .wv1 extension; normalize it.
    $filename .= ".wv1" if $filename !~ m/\.wv1$/;
    $filename = "$wsj_dir/$filename";
    if (-e $filename) {
        print "$filename\n";
    } else {
        # Keep going on missing files; just warn on stderr.
        print STDERR "File $filename found in the index but not on disk\n";
    }
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes on its standard input a list of utterance
# id's, one for each line. (e.g. 4k0c030a is an utterance id).
# It looks up each utterance's transcript in the per-speaker "dot"
# files listed (one path per line) in the file given as the single
# command-line argument, and prints "<uttid> <transcript>" lines.
use strict;
use warnings;

@ARGV == 1 or die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
my $dot_flist = shift @ARGV;

# Map speaker id (first 6 chars of the dot-file basename) -> dot-file path.
my %spk2dot;
open(my $flist_fh, '<', $dot_flist) or die "Opening file list of dot files: $dot_flist\n";
while (<$flist_fh>) {
    chomp;
    m:\S+/(\w{6})00\.dot: or die "Bad line in dot file list: $_";
    $spk2dot{$1} = $_;
}
close($flist_fh);

my $curspk  = '';    # speaker whose dot file is currently loaded
my $dotfile = '';
my %utt2trans;       # transcripts for the current speaker only
while (<STDIN>) {
    chomp;
    my $uttid_orig = $_;
    # Only the first 8 chars form the canonical utterance id; anything
    # after that (e.g. a channel suffix) is kept for output but not lookup.
    my $uttid = substr $uttid_orig, 0, 8;
    $uttid =~ m:(\w{6})\w\w: or die "Bad utterance id $_";
    my $spk = $1;
    if ($spk ne $curspk) {
        # New speaker: reload the cache instead of keeping all
        # transcripts in memory at once.
        %utt2trans = ();
        $curspk    = $spk;
        $dotfile   = $spk2dot{$spk};
        defined $dotfile or die "No dot file for speaker $spk\n";
        open(my $dot_fh, '<', $dotfile) or die "Error opening dot file $dotfile\n";
        while (<$dot_fh>) {
            # Dot-file lines look like: "SOME TRANSCRIPT (4k0c030a)".
            m:(.+)\((\w{8})\)\s*$: or die "Bad line $_ in dot file $dotfile (line $.)\n";
            $utt2trans{$2} = $1;
        }
        close($dot_fh);
    }
    if (!defined $utt2trans{$uttid}) {
        print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
    } else {
        print "$uttid_orig $utt2trans{$uttid}\n";
    }
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes on its standard input a list of utterance
# id's, one for each line. (e.g. 4k0c030a is an utterance id).
# It looks up each utterance's transcript in the per-speaker "dot"
# files listed (one path per line) in the file given as the single
# command-line argument, and prints "<uttid> <transcript>" lines.
use strict;
use warnings;

@ARGV == 1 or die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
my $dot_flist = shift @ARGV;

# Map speaker id (first 6 chars of the dot-file basename) -> dot-file path.
my %spk2dot;
open(my $flist_fh, '<', $dot_flist) or die "Opening file list of dot files: $dot_flist\n";
while (<$flist_fh>) {
    chomp;
    m:\S+/(\w{6})00\.dot: or die "Bad line in dot file list: $_";
    $spk2dot{$1} = $_;
}
close($flist_fh);

my $curspk  = '';    # speaker whose dot file is currently loaded
my $dotfile = '';
my %utt2trans;       # transcripts for the current speaker only
while (<STDIN>) {
    chomp;
    my $uttid = $_;
    $uttid =~ m:(\w{6})\w\w: or die "Bad utterance id $_";
    my $spk = $1;
    if ($spk ne $curspk) {
        # New speaker: reload the cache instead of keeping all
        # transcripts in memory at once.
        %utt2trans = ();
        $curspk    = $spk;
        $dotfile   = $spk2dot{$spk};
        defined $dotfile or die "No dot file for speaker $spk\n";
        open(my $dot_fh, '<', $dotfile) or die "Error opening dot file $dotfile\n";
        while (<$dot_fh>) {
            # Dot-file lines look like: "SOME TRANSCRIPT (4k0c030a)".
            m:(.+)\((\w{8})\)\s*$: or die "Bad line $_ in dot file $dotfile (line $.)\n";
            $utt2trans{$2} = $1;
        }
        close($dot_fh);
    }
    if (!defined $utt2trans{$uttid}) {
        print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
    } else {
        print "$uttid $utt2trans{$uttid}\n";
    }
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# takes in a file list with lines like
# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# and outputs an scp in kaldi format with lines like
# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# (the first thing is the utterance-id, which is the same as the basename of the file.
use strict;
use warnings;

while (<>) {
    m:^\S+/(\w+)\.[wW][vV]1$: or die "Bad line $_";
    # Lowercase the id: disk 13-16.1 was distributed with uppercase filenames.
    my $id = lc $1;
    print "$id $_";    # $_ still carries its trailing newline
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#and as command-line arguments it takes the names of the WSJ disk locations, e.g.:
#/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc.
# It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with
# /mnt/matylda2/data/WSJ0/11-1.1.
# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with
# uppercase rather than lower case filenames.
use strict;
use warnings;

# Map disk id (e.g. 11_1_1) -> mount path, derived from each argument's
# last path component (e.g. .../11-1.1 -> 11_1_1).
my %disk2fn;
foreach my $fn (@ARGV) {
    $fn =~ m:.+/([0-9\.\-]+)/?$: or die "Bad command-line argument $fn\n";
    my $disk_id = $1;
    $disk_id =~ tr/-./__/;    # replace '-' and '.' with '_': 11-10.1 -> 11_10_1
    $fn =~ s:/$::;            # remove final slash, just in case it is present
    $disk2fn{$disk_id} = $fn;
}

# Check the hostname once, not once per index line (backticks spawn a process).
my $on_but_system = (`hostname` =~ m/fit\.vutbr\.cz/);

while (<STDIN>) {
    next if m/^;/;    # ";;"-prefixed lines are comments in .ndx files.
    m/^([0-9_]+):\s*(\S+)$/ or die "Could not parse line $_";
    my ($disk, $filename) = ($1, $2);    # $filename is relative to the disk root
    defined $disk2fn{$disk} or die "Disk id $disk not found";
    if ($disk eq "13_16_1" && $on_but_system) {
        # The disk 13-16.1 has been uppercased for some reason, on the
        # BUT system. This is a fix specifically for that case.
        $filename =~ tr/a-z/A-Z/;
    }
    print "$disk2fn{$disk}/$filename\n";
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This takes data from the standard input that's unnormalized transcripts in the format
# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise]
# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam]
# and outputs normalized transcripts.
# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc
# Usage: normalize_transcript.pl <noise-word> < transcript > transcript2
# All noise annotations that are kept are replaced by <noise-word>.
@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2";
$noise_word = shift @ARGV;
while(<STDIN>) {
# Each line is "<utterance-id> <raw words...>"; the id passes through unchanged.
$_ =~ m:^(\S+) (.+): || die "bad line $_";
$utt = $1;
$trans = $2;
print "$utt";
# NOTE: substitutions below are order-sensitive; backslash removal must
# precede the %PERCENT / .POINT rewrites.
foreach $w (split (" ",$trans)) {
$w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary. .
$w =~ s:\\::g; # Remove backslashes. We don't need the quoting.
$w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts.
$w =~ s:^\.POINT$:POINT:; # Normalization for Nov'93 test transcripts.
if($w =~ m:^\[\<\w+\]$: || # E.g. [<door_slam], this means a door slammed in the preceding word. Delete.
$w =~ m:^\[\w+\>\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete.
$w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon.
$w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon.
$w eq "~" || # This is used to indicate truncation of an utterance. Not a word.
$w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much
# point including this in the transcript.
next; # we won't print this word.
} elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath]: map to the noise word.
print " $noise_word";
} elsif($w =~ m:^\<([\w\']+)\>$:) {
# e.g. replace <and> with and. (the <> means verbal deletion of a word).. but it's pronounced.
print " $1";
} elsif($w eq "--DASH") {
print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH.
# } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word
# print " $1 -DASH";
} else {
print " $w";
}
}
print "\n";
}
#!/usr/bin/env bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015
#
# Prepares Kaldi-style wav.scp/text for the CHiME4 *real enhanced* data.
# Utterance ids get a "_REAL" suffix derived from the wav basename.
# Config:
eval_flag=true # make it true when the evaluation data are released
. tools/parse_options.sh || exit 1;
if [ $# -ne 2 ]; then
printf "\nUSAGE: %s <enhancement-name> <enhanced-speech-directory>\n\n" `basename $0`
echo "The argument should be a the directory that only contains enhanced speech data."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
enhan=$1
audio_dir=$2
dir=$PWD/data/chime4/local
mkdir -p $dir
local=$PWD/local
utils=$PWD/utils
odir=$PWD/data/chime4
if $eval_flag; then
list_set="tr05_real_$enhan dt05_real_$enhan et05_real_$enhan"
else
list_set="tr05_real_$enhan dt05_real_$enhan"
fi
cd $dir
# Build per-set file lists of the enhanced wav files (bus/caf/ped/str environments).
find $audio_dir/ -name '*.wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_$enhan.flist
find $audio_dir/ -name '*.wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_$enhan.flist
if $eval_flag; then
find $audio_dir/ -name '*.wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_$enhan.flist
fi
# make a scp file from file list
# (id = wav basename with ".wav" replaced by "_REAL")
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp
done
#make a transcription from dot
# NOTE(review): assumes {tr,dt,et}05_real.dot already exist in $dir
# (produced by the noisy-data prep script) — run that first.
cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> tr05_real_$enhan.ids
cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_$enhan.txt
paste -d" " tr05_real_$enhan.ids tr05_real_$enhan.txt | sort -k 1 > tr05_real_$enhan.trans1
cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> dt05_real_$enhan.ids
cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_$enhan.txt
paste -d" " dt05_real_$enhan.ids dt05_real_$enhan.txt | sort -k 1 > dt05_real_$enhan.trans1
if $eval_flag; then
cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> et05_real_$enhan.ids
cat et05_real.dot | sed -e 's/(.*)//' > et05_real_$enhan.txt
paste -d" " et05_real_$enhan.ids et05_real_$enhan.txt | sort -k 1 > et05_real_$enhan.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# copying data to data/...
for x in $list_set; do
mkdir -p $odir/$x
cp ${x}_wav.scp $odir/$x/wav.scp || exit 1;
cp ${x}.txt $odir/$x/text || exit 1;
done
echo "Data preparation succeeded"
#!/usr/bin/env bash
set -eu
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015
# Modified to use data of six channels
# Szu-Jui Chen 09/29/2017
#
# Prepares Kaldi-style wav.scp/text for the CHiME4 *real noisy* data,
# keeping all six microphone channels (CH1-CH6) as separate utterances.
# Config:
eval_flag=true # make it true when the evaluation data are released
. tools/parse_options.sh || exit 1
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level Chime4 directory."
echo "It is assumed that there will be a 'data' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
audio_dir=$1/data/audio/16kHz/isolated/
trans_dir=$1/data/transcriptions
echo "extract all channels (CH[1-6].wav) for noisy data"
dir=$PWD/data/chime4/local
mkdir -p $dir
local=$PWD/local
if $eval_flag; then
list_set="tr05_real_noisy dt05_real_noisy et05_real_noisy"
else
list_set="tr05_real_noisy dt05_real_noisy"
fi
cd $dir
# Build per-set file lists of all six-channel wav files.
find $audio_dir -name '*CH[1-6].wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_noisy.flist
find $audio_dir -name '*CH[1-6].wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_noisy.flist
if $eval_flag; then
find $audio_dir -name '*CH[1-6].wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_noisy.flist
fi
# make a dot format from json annotation files
cp $trans_dir/tr05_real.dot_all tr05_real.dot
cp $trans_dir/dt05_real.dot_all dt05_real.dot
if $eval_flag; then
cp $trans_dir/et05_real.dot_all et05_real.dot
fi
# make a scp temporary file from file list
# The id is rearranged so the channel number follows the speaker prefix:
# e.g. F01_050C0101_BUS.CH1_REAL -> F01_CH1_050C0101_BUS_REAL-style sort keys.
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.id.temp
cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch
cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1
cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2
paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp
done
#make a transcription from dot
# Each dot line is duplicated six times, once per channel (CH1..CH6).
cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> tr05_real_noisy.ids
cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_noisy.txt
paste -d" " tr05_real_noisy.ids tr05_real_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > tr05_real_noisy.trans1
cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> dt05_real_noisy.ids
cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_noisy.txt
paste -d" " dt05_real_noisy.ids dt05_real_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > dt05_real_noisy.trans1
if $eval_flag; then
cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> et05_real_noisy.ids
cat et05_real.dot | sed -e 's/(.*)//' > et05_real_noisy.txt
paste -d" " et05_real_noisy.ids et05_real_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > et05_real_noisy.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
# Re-key the transcripts with the scp's rearranged utterance ids
# (same sort order on both sides keeps them aligned).
cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1
cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2
paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# copying data to data/...
for x in $list_set; do
sort ${x}_wav.scp.temp > ${x}_wav.scp
mkdir -p ../../chime4/$x
cp ${x}_wav.scp ../../chime4/$x/wav.scp || exit 1;
cp ${x}.txt ../../chime4/$x/text || exit 1;
done
# clean up temp files
rm *.temp
rm *.part{1,2}
echo "Data preparation succeeded"
#!/usr/bin/env bash
set -eu
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015
#
# Prepares Kaldi-style wav.scp/text for the CHiME4 *simulated enhanced* data.
# Utterance ids get a "_SIMU" suffix derived from the wav basename.
# Config:
eval_flag=true # make it true when the evaluation data are released
. tools/parse_options.sh || exit 1;
if [ $# -ne 2 ]; then
printf "\nUSAGE: %s <enhancement-name> <enhanced-speech-directory>\n\n" `basename $0`
echo "The argument should be a the directory that only contains enhanced speech data."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
enhan=$1
audio_dir=$2
dir=$PWD/data/chime4/local
mkdir -p $dir
local=$PWD/local
utils=$PWD/utils
odir=$PWD/data/chime4
if $eval_flag; then
list_set="tr05_simu_$enhan dt05_simu_$enhan et05_simu_$enhan"
else
list_set="tr05_simu_$enhan dt05_simu_$enhan"
fi
cd $dir
# Build per-set file lists of the enhanced wav files (bus/caf/ped/str environments).
find $audio_dir/ -name '*.wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_$enhan.flist
find $audio_dir/ -name '*.wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_$enhan.flist
if $eval_flag; then
find $audio_dir/ -name '*.wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_$enhan.flist
fi
# make a scp file from file list
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp
done
# make a transcription from dot
# simulation training data extract dot file from original WSJ0 data
# since it is generated from these data
if [ ! -e dot_files.flist ]; then
echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh";
exit 1;
fi
# tr05: look up WSJ0 transcripts by the lowercased utterance part of the id.
cat tr05_simu_${enhan}_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \
| $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_$enhan.txt
cat tr05_simu_${enhan}_wav.scp | cut -f 1 -d" " > tr05_simu_$enhan.ids
paste -d" " tr05_simu_$enhan.ids tr05_simu_$enhan.txt | sort -k 1 > tr05_simu_$enhan.trans1
# dt05 and et05 simulation data are generated from the CHiME4 booth recording
# and we use CHiME4 dot files
cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> dt05_simu_$enhan.ids
cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_$enhan.txt
paste -d" " dt05_simu_$enhan.ids dt05_simu_$enhan.txt | sort -k 1 > dt05_simu_$enhan.trans1
if $eval_flag; then
cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> et05_simu_$enhan.ids
cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_$enhan.txt
paste -d" " et05_simu_$enhan.ids et05_simu_$enhan.txt | sort -k 1 > et05_simu_$enhan.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# copying data to data/...
for x in $list_set; do
mkdir -p $odir/$x
cp ${x}_wav.scp $odir/$x/wav.scp || exit 1;
cp ${x}.txt $odir/$x/text || exit 1;
done
echo "Data preparation succeeded"
#!/usr/bin/env bash
set -eu
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015
# Modified to use data of six channels
# Szu-Jui Chen 09/29/2017
#
# Prepares Kaldi-style wav.scp/text for the CHiME4 *simulated noisy* data,
# keeping all six microphone channels (CH1-CH6) as separate utterances.
# Config:
eval_flag=true # make it true when the evaluation data are released
. tools/parse_options.sh || exit 1;
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level Chime4 directory."
echo "It is assumed that there will be a 'data' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
audio_dir=$1/data/audio/16kHz/isolated/
trans_dir=$1/data/transcriptions
echo "extract all channels (CH[1-6].wav) for noisy data"
dir=$PWD/data/chime4/local
mkdir -p $dir
local=$PWD/local
utils=$PWD/utils
if $eval_flag; then
list_set="tr05_simu_noisy dt05_simu_noisy et05_simu_noisy"
else
list_set="tr05_simu_noisy dt05_simu_noisy"
fi
cd $dir
# Build per-set file lists of all six-channel wav files.
find $audio_dir -name '*CH[1-6].wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_noisy.flist
find $audio_dir -name '*CH[1-6].wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_noisy.flist
if $eval_flag; then
find $audio_dir -name '*CH[1-6].wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_noisy.flist
fi
# make a dot format from json annotation files
cp $trans_dir/dt05_simu.dot_all dt05_simu.dot
if $eval_flag; then
cp $trans_dir/et05_simu.dot_all et05_simu.dot
fi
# make a scp file from file list
# The id is rearranged so the channel number follows the speaker prefix.
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.id.temp
cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch
cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1
cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2
paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp
done
# make a transcription from dot
# simulation training data extract dot file from original WSJ0 data
# since it is generated from these data
if [ ! -e dot_files.flist ]; then
echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh";
exit 1;
fi
# tr05: look up WSJ0 transcripts by the lowercased utterance part of the id.
cat tr05_simu_noisy_wav.scp.temp | awk -F'[_]' '{print $3}' | tr '[A-Z]' '[a-z]' \
| $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_noisy.txt
cat tr05_simu_noisy_wav.scp.temp | cut -f 1 -d" " > tr05_simu_noisy.ids
paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -t_ -k1,1 -k3 > tr05_simu_noisy.trans1
# dt05 and et05 simulation data are generated from the CHiME4 booth recording
# and we use CHiME4 dot files
# (each dot line is duplicated six times, once per channel CH1..CH6)
cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_SIMU"}'> dt05_simu_noisy.ids
cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_noisy.txt
paste -d" " dt05_simu_noisy.ids dt05_simu_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > dt05_simu_noisy.trans1
if $eval_flag; then
cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_SIMU"}'> et05_simu_noisy.ids
cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_noisy.txt
paste -d" " et05_simu_noisy.ids et05_simu_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > et05_simu_noisy.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
# Re-key the transcripts with the scp's rearranged utterance ids
# (same sort order on both sides keeps them aligned).
cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1
cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2
paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# copying data to data/...
for x in $list_set; do
sort ${x}_wav.scp.temp > ${x}_wav.scp
mkdir -p ../../chime4/$x
cp ${x}_wav.scp ../../chime4/$x/wav.scp || exit 1;
cp ${x}.txt ../../chime4/$x/text || exit 1;
done
# clean up temp files
rm *.temp
rm *.part{1,2}
echo "Data preparation succeeded"
# Runtime environment for this example: put the WeNet libtorch build outputs
# (decoder binaries, kaldi tools, openfst) on PATH, and the repo root on
# PYTHONPATH. Sourced by run.sh via ". ./path.sh".
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/usr/bin/env bash
# Copyright 2020 Jian Wu
# License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
#
# CHiME4 recipe driver. Stages:
#   1: data preparation   2: dictionary   3: cmvn + data lists
#   4: training           5: averaging + decoding + WER scoring
# Select stages with --stage "beg-end" (e.g. --stage "1-4").
set -eu
stage="1-4"
space="<space>"
track="isolated_1ch_track"
# NOTE(review): double slash in //scratch looks like a typo for /scratch —
# harmless to POSIX path resolution, but confirm against the actual mount.
wsj1_data_dir=//scratch/jwu/wsj1
chime4_data_dir=/scratch/jwu/CHiME4
dump_wav_dir=/scratch/jwu/chime4_wav
data_dir=data/chime4
dict=$data_dir/dict_char.txt
train_config=conf/train_conformer.yaml
exp_dir=exp/1a
decode_modes="ctc_prefix_beam_search attention_rescoring"
average_checkpoint=true
average_num=10
. ./path.sh
. ./tools/parse_options.sh || exit 1
# Parse "beg-end" stage range; a single number means just that stage.
beg=$(echo $stage | awk -F '-' '{print $1}')
end=$(echo $stage | awk -F '-' '{print $2}')
[ -z $end ] && end=$beg
if [ $end -ge 1 ] && [ $beg -le 1 ]; then
echo "Stage 1: preparing data ..."
./local/clean_wsj0_data_prep.sh $chime4_data_dir/CHiME3/data/WSJ0
./local/simu_noisy_chime4_data_prep.sh $chime4_data_dir
./local/real_noisy_chime4_data_prep.sh $chime4_data_dir
./local/simu_enhan_chime4_data_prep.sh $track $chime4_data_dir/data/audio/16kHz/$track
./local/real_enhan_chime4_data_prep.sh $track $chime4_data_dir/data/audio/16kHz/$track
./local/clean_wsj1_data_prep.sh $wsj1_data_dir
./local/chime4_format_dir.sh
fi
# Stage 2: build the character dictionary (<blank>=0, <unk>=1, chars, <sos/eos>).
if [ $end -ge 2 ] && [ $beg -le 2 ]; then
echo -e "<NOISE>\n<*IN*>\n<*MR.*>" > $data_dir/train/non_lang.txt
for name in dev train; do
python tools/text2token.py $data_dir/$name/text -n 1 -s 1 \
-l $data_dir/train/non_lang.txt > $data_dir/$name/char
done
mkdir -p $(dirname $dict) && echo -e "<blank> 0\n<unk> 1" > ${dict}
cat $data_dir/train/char | cut -f 2- -d" " | tr " " "\n" | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict
echo "Make dictionary done"
fi
# Stage 3: dump wavs, compute global CMVN stats, and write raw data lists.
if [ $end -ge 3 ] && [ $beg -le 3 ]; then
./local/chime4_gen_wav.sh $data_dir/train $dump_wav_dir
tools/compute_cmvn_stats.py --num_workers 16 \
--train_config $train_config \
--in_scp $data_dir/train/wav.scp \
--out_cmvn $data_dir/train/global_cmvn
echo "Prepare data, prepare required format"
for x in train dev; do
tools/make_raw_list.py $data_dir/$x/wav.scp $data_dir/$x/char \
$data_dir/$x/data.list
done
fi
# Stage 4: train the model (single GPU; log goes to $exp_dir/train.log).
if [ $end -ge 4 ] && [ $beg -le 4 ]; then
mkdir -p $exp_dir && cp $data_dir/train/global_cmvn $exp_dir
python wenet/bin/train.py \
--gpu 0 \
--config $train_config \
--train_data $data_dir/train/data.list \
--cv_data $data_dir/dev/data.list \
--model_dir $exp_dir \
--num_workers 4 \
--symbol_table $dict \
--cmvn $exp_dir/global_cmvn \
--pin_memory > $exp_dir/train.log 2>&1
fi
suffix="isolated_1ch_track"
# Stage 5: optionally average checkpoints, then decode all four test sets
# in each mode (decodes run in parallel per set) and score WER.
if [ $end -ge 5 ] && [ $beg -le 5 ]; then
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$exp_dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $exp_dir \
--num ${average_num} \
--val_best
fi
nj=4
ctc_weight=0.5
for x in dt05_{simu,real} et05_{simu,real}; do
subdir=${x}_${suffix}
tools/make_raw_list.py $data_dir/$subdir/wav.scp $data_dir/$subdir/text \
$data_dir/$subdir/data.list
done
for mode in ${decode_modes}; do
for x in dt05_{simu,real} et05_{simu,real}; do
subdir=${x}_${suffix}
dec_dir=$exp_dir/${subdir}_${mode} && mkdir -p $dec_dir
python wenet/bin/recognize.py \
--gpu 0 \
--mode $mode \
--config $exp_dir/train.yaml \
--test_data $data_dir/$subdir/data.list \
--checkpoint $exp_dir/avg_${average_num}.pt \
--beam_size 8 \
--batch_size 1 \
--dict $dict \
--ctc_weight $ctc_weight \
--result_file $dec_dir/text &
done
wait
done
for mode in ${decode_modes}; do
for x in dt05_{simu,real} et05_{simu,real}; do
subdir=${x}_${suffix}
dec_dir=$exp_dir/${subdir}_${mode}
sed 's:<space>: :g' $dec_dir/text > $dec_dir/text.norm
python tools/compute-wer.py --char=1 --v=1 \
$data_dir/$subdir/text $dec_dir/text.norm > $dec_dir/wer
done
done
fi
../../../tools
\ No newline at end of file
../../../wenet
\ No newline at end of file
# Performance Record
# Requires ffmpeg and pandas to be installed.
## Conformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.0005, warmup_steps 20000, batch size 8, 3 GPUs, 30 epochs
* Decoding info: average_num 20
| decoding mode | test (wer) |
| :--------------------: | :---------: |
| ctc_greedy_search | 16.12% |
| ctc_prefix_beam_search | 16.07% |
| attention | 13.56% |
| attention_rescoring | 14.01% |
\ No newline at end of file
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
# data pipeline: filtering, resampling, fbank features, spec augmentation,
# shuffling/sorting buffers, and dynamic batching
dataset_conf:
    split_with_space: true
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 40
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'dynamic' # static or dynamic
        batch_size: 8
# training hyper-parameters
grad_clip: 10
accum_grad: 4
max_epoch: 30
log_interval: 200
optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 20000
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import re
def process(src_str):
    """Remove punctuation (ASCII and common full-width marks) and upper-case.

    Args:
        src_str: raw transcript sentence from the Common Voice tsv.

    Returns:
        The sentence with every character from ``punc`` removed and the
        remainder upper-cased.
    """
    punc = '~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。，？》《{}'
    # BUG FIX: punc must be escaped before being embedded in a character
    # class.  Unescaped, the '-' in sequences like "+-=" forms a range
    # (0x2B-0x3D) that silently strips the digits 0-9 (and ':;<') from
    # transcripts.  re.escape() makes every character literal.
    return re.sub("[{0}]+".format(re.escape(punc)), "", src_str).upper()
if __name__ == '__main__':
    # Usage: create_scp_text.py <common_voice_dir> <subset> <output_dir>
    # Converts <common_voice_dir>/clips/*.mp3 to 16 kHz mono wavs under
    # <common_voice_dir>/wavs/, then reads <subset>.tsv and writes the
    # Kaldi-style wav.scp / text / utt2spk files into <output_dir>.
    import subprocess
    import pandas

    src_dir = sys.argv[1]
    tsv_file = src_dir + "/" + sys.argv[2] + ".tsv"
    output_dir = sys.argv[3]

    for file_path in os.listdir(src_dir + "/clips"):
        utt_id = file_path.split('.')[0]
        wav_path = src_dir + "/wavs/" + utt_id + ".wav"
        if os.path.exists(wav_path):
            continue  # already converted by a previous run
        clip_path = src_dir + "/clips/" + file_path
        # List-form subprocess call: no shell is involved, so paths holding
        # spaces or shell metacharacters cannot break or inject commands
        # (the previous os.system + str.format was unsafe here).
        subprocess.run(["ffmpeg", "-i", clip_path, "-ac", "1",
                        "-ar", "16000", "-f", "wav", wav_path], check=False)

    tsv_content = pandas.read_csv(tsv_file, sep="\t")
    path_list = tsv_content["path"]
    sentence = tsv_content["sentence"]
    client_list = tsv_content["client_id"]

    # 'with' guarantees the three output files are flushed and closed even
    # if a malformed row raises; encoding pinned so transcripts stay UTF-8.
    with open(output_dir + "/wav.scp", "w", encoding="utf-8") as scp_file, \
         open(output_dir + "/text", "w", encoding="utf-8") as text_file, \
         open(output_dir + "/utt2spk", "w", encoding="utf-8") as utt2spk:
        for i in range(len(path_list)):
            utt_id = path_list[i].split(".")[0]
            now_sentence = process(sentence[i])
            wav_file = src_dir + "/wavs/" + utt_id + ".wav"
            scp_file.write(utt_id + " " + wav_file + "\n")
            text_file.write(utt_id + " " + now_sentence + "\n")
            utt2spk.write(utt_id + " " + client_list[i] + "\n")
#!/usr/bin/env bash
# Download the Common Voice 8.0 French corpus and extract it.
#   $1: directory to place the downloaded archive in
#   $2: directory to extract the corpus into
if [ $# -le 1 ]; then
  echo "Args_Error:Two parameters are required."
  exit 1;
fi
download_path=$1
data_France=$2
url=https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-8.0-2022-01-19/cv-corpus-8.0-2022-01-19-fr.tar.gz
# The archive is a gzipped tar, so name it accordingly (it used to be saved
# as tmp.zip, which tar happened to handle but was misleading).  Check each
# step's exit status: a truncated download must not be silently extracted.
wget -O "${download_path}/tmp.tar.gz" "${url}" || { echo "Error: download failed" >&2; exit 1; }
tar -xvf "${download_path}/tmp.tar.gz" -C "${data_France}" || { echo "Error: extraction failed" >&2; exit 1; }
# rm -f, not -rf: the target is a regular file, not a directory tree.
rm -f "${download_path}/tmp.tar.gz"
\ No newline at end of file
#!/usr/bin/env bash
# Create fresh data/{train,dev,test} directories and generate the Kaldi
# wav.scp/text/utt2spk files from a Common Voice French source directory.
#   $1: Common Voice French source directory
if [ $# -le 0 ]; then
  echo "Argument should be France src directory, see ../run.sh for example."
  exit 1;
fi
dir=`pwd`/data
local=`pwd`/local
src_path=$1

# Recreate data/ from scratch on every run.
rm -rf "${dir}"
mkdir -p "${dir}"

for x in train dev test; do
  # BUG FIX: the original test was '[ ! ${dir}/${x} ]' (missing -d).  A
  # non-empty string is always true, so '! string' was always false and the
  # then-branch was dead code; rm+mkdir expresses the real intent directly.
  rm -rf "${dir}/${x}"
  mkdir -p "${dir}/${x}"
done

# Ensure the wav output directory exists for the mp3->wav conversion.
if [ ! -d "${src_path}/wavs" ]; then
  mkdir "${src_path}/wavs"
fi

for x in train dev test; do
  python3 "${local}/create_scp_text.py" "${src_path}" "${x}" "${dir}/${x}"
done
# Locations of the wenet libtorch runtime build, relative to this example dir.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
# Put the compiled wenet binaries, kaldi tools, and OpenFST tools on PATH.
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Make the wenet python package importable from the repo root.
export PYTHONPATH=../../../:$PYTHONPATH
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment