# Performance Record
This is a Chinese speech recognition recipe that trains on a combination of the following Chinese corpora:
| Dataset | Duration (Hours) |
|------------|------------------|
| Aidatatang | 140 |
| Aishell | 151 |
| MagicData | 712 |
| Primewords | 99 |
| ST-CMDS | 110 |
| THCHS-30 | 26 |
| TAL-ASR | 587 |
| AISHELL2 | 1000 |
## Unified Transformer Result
### Data info:
* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, and THCHS-30.
* Feature info: using fbank feature, with cmvn, no speed perturb.
* Training info: lr 0.004, batch size 18, 3 machines, 3*8 = 24 GPUs, acc_grad 1, 220 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 30
* Git hash: 013794572a55c7d0dbea23a66106ccf3e5d3b8d4
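
The averaged checkpoint used for decoding can be produced with WeNet's model-averaging tool. A minimal sketch, assuming the standard `wenet/bin/average_model.py` interface and a placeholder experiment directory (flag names may differ across WeNet versions):

```bash
exp=exp/unified_transformer   # placeholder experiment directory
python wenet/bin/average_model.py \
    --dst_model $exp/avg_30.pt \
    --src_path $exp \
    --num 30 \
    --val_best   # average the 30 checkpoints with the best CV loss
```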
### WER
| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring |
|------------|------------|-------------------|-------------------|------------------------|---------------------|
| Aidatatang | full | 4.23 | 5.82 | 5.82 | 4.71 |
| | 16 | 4.59 | 6.99 | 6.99 | 5.29 |
| Aishell | full | 4.69 | 5.80 | 5.80 | 4.64 |
| | 16 | 4.97 | 6.75 | 6.75 | 5.37 |
| MagicData | full | 2.86 | 4.01 | 4.00 | 3.07 |
| | 16 | 3.10 | 5.02 | 5.02 | 3.68 |
| THCHS-30 | full | 16.68 | 15.46 | 15.46 | 14.38 |
| | 16 | 17.47 | 16.81 | 16.82 | 15.63 |
## Unified Conformer Result
### Data info:
* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, and THCHS-30.
* Feature info: using fbank feature, with cmvn, speed perturb.
* Training info: lr 0.001, batch size 8, 1 machine, 1*8 = 8 GPUs, acc_grad 12, 60 epochs
* Decoding info: ctc_weight 0.5, average_num 10
* Git hash: 5bdf436e671ef4c696d1b039f29cc33109e072fa
### WER
| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring |
|------------|------------|-------------------|-------------------|------------------------|---------------------|
| Aidatatang | full | 4.12 | 4.97 | 4.97 | 4.22 |
| | 16 | 4.45 | 5.73 | 5.73 | 4.75 |
| Aishell | full | 4.49 | 5.07 | 5.05 | 4.43 |
| | 16 | 4.77 | 5.77 | 5.77 | 4.85 |
| MagicData | full | 2.55 | 3.07 | 3.05 | 2.59 |
| | 16 | 2.81 | 3.88 | 3.86 | 3.08 |
| THCHS-30 | full | 13.55 | 13.75 | 13.76 | 12.72 |
| | 16 | 13.78 | 15.10 | 15.08 | 13.90 |
## Unified Conformer Result (with TAL-ASR and AISHELL2 added)
### Data info:
* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, THCHS-30, TAL-ASR, and AISHELL2.
* Feature info: using fbank feature, dither=0, cmvn, speed perturb
* Training info: lr 0.001, batch size 22, 4 GPUs, acc_grad 4, 120 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 10
* Git hash: 66f30c197d00c59fdeda3bc8ada801f867b73f78
### WER
| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring |
|------------|------------|-------------------|-------------------|------------------------|---------------------|
| Aidatatang | full | 3.22 | 4.00 | 4.01 | 3.35 |
| | 16 | 3.50 | 4.63 | 4.63 | 3.79 |
| Aishell | full | 1.23 | 2.12 | 2.13 | 1.42 |
| | 16 | 1.33 | 2.72 | 2.72 | 1.72 |
| MagicData | full | 2.38 | 3.07 | 3.05 | 2.52 |
| | 16 | 2.66 | 3.80 | 3.78 | 2.94 |
| THCHS-30 | full | 9.93 | 11.07 | 11.06 | 10.16 |
| | 16 | 10.28 | 11.85 | 11.85 | 10.81 |
| AISHELL2 | full | 5.25 | 5.81 | 5.79 | 5.22 |
| | 16 | 5.48 | 6.48 | 6.50 | 5.61 |
| TAL-ASR | full | 9.54 | 10.35 | 10.28 | 9.66 |
| | 16 | 10.04 | 11.43 | 11.39 | 10.55 |
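
As a sanity check, the numbers above can be recomputed from a decoding run with WeNet's scorer. A sketch, assuming the standard `tools/compute-wer.py` script and placeholder paths:

```bash
# ref and hyp are Kaldi-style "utt-id transcript" files; --char=1 scores at
# character level, the usual choice for Chinese (so "WER" here is effectively CER)
python tools/compute-wer.py --char=1 --v=1 \
    data/aishell/test/text exp/unified_conformer/test/text > exp/unified_conformer/test/wer
```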
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: true
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 16

grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
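# ----------------------------------------------------------------------------
# A hypothetical launch sketch for the config above, assuming it is saved as
# conf/train_conformer.yaml and that data.list files were produced by the prep
# scripts below; flag names follow wenet/bin/train.py and may differ across
# WeNet versions:
#
#   python wenet/bin/train.py \
#     --config conf/train_conformer.yaml \
#     --train_data data/train/data.list \
#     --cv_data data/dev/data.list \
#     --model_dir exp/conformer
# ----------------------------------------------------------------------------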
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: true
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 16

grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
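# ----------------------------------------------------------------------------
# causal: true plus use_dynamic_chunk: true above trains a single model that
# decodes both full-context and streaming. A hypothetical decoding sketch
# (flag names follow wenet/bin/recognize.py and may differ across WeNet
# versions): --decoding_chunk_size -1 gives the "full" rows in the result
# tables, 16 gives the streaming "16" rows.
#
#   python wenet/bin/recognize.py --config conf/train_unified_conformer.yaml \
#     --checkpoint exp/unified_conformer/avg_10.pt \
#     --mode attention_rescoring --decoding_chunk_size 16 ...
# ----------------------------------------------------------------------------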
# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 16

grad_clip: 5
accum_grad: 1
max_epoch: 220
log_interval: 100

optim: adam
optim_conf:
    lr: 0.004
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
#!/bin/bash
# Copyright 2017 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/data_aidatatang_200zh data/aidatatang"
  exit 1;
fi

aidatatang_audio_dir=$1/corpus
aidatatang_text=$1/transcript/aidatatang_200_zh_transcript.txt
data=$2
train_dir=$data/local/train
dev_dir=$data/local/dev
test_dir=$data/local/test
tmp_dir=$data/local/tmp

mkdir -p $train_dir
mkdir -p $dev_dir
mkdir -p $test_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $aidatatang_audio_dir ] || [ ! -f $aidatatang_text ]; then
  echo "Error: $0 requires $aidatatang_audio_dir and $aidatatang_text to exist"
  exit 1;
fi

echo "**** Creating aidatatang data folder ****"

# find wav audio files for train, dev and test resp.
find $aidatatang_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 237265 ] && \
  echo "Warning: expected 237265 data files, found $n"

grep -i "corpus/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
grep -i "corpus/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
grep -i "corpus/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
rm -r $tmp_dir

# Transcription preparation
for dir in $train_dir $dev_dir $test_dir; do
  echo Preparing $dir transcriptions
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
  # keep transcripts whose utt-id has audio; map full-width Ａ to ASCII A
  tools/filter_scp.pl -f 1 $dir/utt.list $aidatatang_text | sed 's/Ａ/A/g' > $dir/transcripts.txt
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  tools/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u | awk '{print $1" T0055"$2}' > $dir/utt2spk
  tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
  sort -u $dir/transcripts.txt > $dir/text
  tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p $data/train $data/dev $data/test
for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f $data/train/$f || exit 1;
  cp $dev_dir/$f $data/dev/$f || exit 1;
  cp $test_dir/$f $data/test/$f || exit 1;
done

# utils/data/validate_data_dir.sh --no-feats $data/train || exit 1;
# utils/data/validate_data_dir.sh --no-feats $data/dev || exit 1;
# utils/data/validate_data_dir.sh --no-feats $data/test || exit 1;

echo "$0: aidatatang_200zh data preparation succeeded"
exit 0;
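# For illustration, the four files installed above follow the Kaldi data-dir
# convention, one space-separated entry per line (IDs below are hypothetical):
#
#   wav.scp : T0055G0002S0001 /path/to/corpus/train/G0002/T0055G0002S0001.wav
#   text    : T0055G0002S0001 <transcript>
#   utt2spk : T0055G0002S0001 T0055G0002
#   spk2utt : T0055G0002 T0055G0002S0001 T0055G0002S0002 ...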
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/62 aidatatang_200zh"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: aidatatang_200zh."
  exit 1;
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

part_ok=false
list="aidatatang_200zh"
for x in $list; do
  if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/$part/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# size of the archive file in bytes
sizes="18756983399"

if [ -f $data/$part.tgz ]; then
  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tgz
  else
    echo "$data/$part.tgz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tgz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tgz; then
  echo "$0: error un-tarring archive $data/$part.tgz"
  exit 1;
fi
touch $data/$part/.complete

dev_dir=$data/$part/corpus/dev
test_dir=$data/$part/corpus/test
train_dir=$data/$part/corpus/train
if [ $part == "aidatatang_200zh" ]; then
  for set in $dev_dir $test_dir $train_dir; do
    cd $set
    for wav in ./*.tar.gz; do
      echo "Extracting wav from $wav"
      tar -zxf $wav && rm $wav
    done
  done
fi

echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"

if $remove_archive; then
  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
  rm $data/$part.tgz
fi
exit 0;
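# Note: the archive size probe above parses `ls -l`. A sketch of a sturdier
# equivalent using stat (GNU coreutils; on BSD/macOS the flag is `stat -f %z`):
#
#   size=$(stat -c '%s' "$data/$part.tgz" 2>/dev/null)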
#!/usr/bin/env bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0
# This script is copied from aishell2/s5/local/prepare_data.sh
# but using difference word segmentation script.
# transform raw AISHELL-2 data to kaldi format
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-data-dir> <output-dir>"
  echo " $0 /export/AISHELL-2/iOS/train data/train"
  exit 1;
fi

corpus=$1
dir=$2
tmp=$dir/tmp

echo "prepare_data.sh: Preparing data in $corpus"
mkdir -p $dir
mkdir -p $tmp

# corpus check
if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then
  echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
  exit 1;
fi

# validate utt-key list
awk '{print "AISHELL2_"$1}' $corpus/wav.scp > $tmp/wav_utt.list
awk '{print "AISHELL2_"$1}' $corpus/trans.txt > $tmp/trans_utt.list
tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list

# wav.scp
awk -F'\t' -v path_prefix=$corpus '{printf("AISHELL2_%s %s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp
tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp

# trans.txt
awk -F'\t' '{printf("AISHELL2_%s %s\n",$1,$2)}' $corpus/trans.txt > $tmp/tmp_trans.txt
tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_trans.txt | sort -k 1 | uniq > $tmp/trans.txt

# text: upper-case, map full-width Ａ/Ｔ/Ｍ to half-width, drop/replace rare glyphs
dos2unix < $tmp/trans.txt | \
  tools/filter_scp.pl -f 1 $tmp/utt.list - | \
  sort -k 1 | uniq | tr '[a-z]' '[A-Z]' | \
  sed 's/Ａ/A/g' | sed 's/Ｔ/T/g' | sed 's/Ｍ/M/g' | sed 's/𫚉//g' | sed 's/𫖯/頫/g' \
  > $tmp/text

# utt2spk & spk2utt
awk -F' ' '{print $2}' $tmp/wav.scp > $tmp/wav.list
sed -e 's:\.wav::g' $tmp/wav.list | \
  awk -F'/' '{i=NF-1;printf("AISHELL2_%s AISHELL2_%s\n",$NF,$i)}' > $tmp/tmp_utt2spk
tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_utt2spk | sort -k 1 | uniq > $tmp/utt2spk
tools/utt2spk_to_spk2utt.pl $tmp/utt2spk | sort -k 1 | uniq > $tmp/spk2utt

# copy prepared resources from tmp dir to target dir
for f in wav.scp text spk2utt utt2spk; do
  cp $tmp/$f $dir/$f || exit 1;
done

tools/validate_data_dir.sh --no-feats $dir || exit 1;
echo "local/prepare_data.sh succeeded"
exit 0;
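# tools/filter_scp.pl, used throughout these prep scripts, keeps the lines of
# its second input whose field selected by -f appears in the id list given as
# the first input. A toy demo with throwaway files:
#
#   printf 'utt1 a.wav\nutt2 b.wav\n' > /tmp/all.scp
#   printf 'utt1\n' > /tmp/keep.list
#   tools/filter_scp.pl -f 1 /tmp/keep.list /tmp/all.scp   # -> "utt1 a.wav"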
#!/bin/bash
# Copyright 2017 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/data_aishell data/aishell"
  exit 1;
fi

aishell_audio_dir=$1/wav
aishell_text=$1/transcript/aishell_transcript_v0.8.txt
data=$2
train_dir=$data/local/train
dev_dir=$data/local/dev
test_dir=$data/local/test
tmp_dir=$data/local/tmp

mkdir -p $train_dir
mkdir -p $dev_dir
mkdir -p $test_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
  echo "Error: $0 requires $aishell_audio_dir and $aishell_text to exist"
  exit 1;
fi

echo "**** Creating aishell data folder ****"

# find wav audio files for train, dev and test resp.
find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 141925 ] && \
  echo "Warning: expected 141925 data files, found $n"

grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
rm -r $tmp_dir

# Transcription preparation
for dir in $train_dir $dev_dir $test_dir; do
  echo Preparing $dir transcriptions
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
  # keep transcripts with audio; map full-width ａ/ｂ/ｃ/ｋ/ｔ to half-width
  tools/filter_scp.pl -f 1 $dir/utt.list $aishell_text | \
    sed 's/ａ/a/g' | sed 's/ｂ/b/g' |\
    sed 's/ｃ/c/g' | sed 's/ｋ/k/g' |\
    sed 's/ｔ/t/g' > $dir/transcripts.txt
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  tools/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u | awk '{print $1" BAC009"$2}' > $dir/utt2spk
  tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
  sort -u $dir/transcripts.txt > $dir/text
  tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p $data/train $data/dev $data/test
for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f $data/train/$f || exit 1;
  cp $dev_dir/$f $data/dev/$f || exit 1;
  cp $test_dir/$f $data/test/$f || exit 1;
done

# utils/data/validate_data_dir.sh --no-feats $data/train || exit 1;
# utils/data/validate_data_dir.sh --no-feats $data/dev || exit 1;
# utils/data/validate_data_dir.sh --no-feats $data/test || exit 1;

echo "$0: AISHELL data preparation succeeded"
exit 0;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  exit 1;
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

part_ok=false
list="data_aishell resource_aishell"
for x in $list; do
  if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/$part/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# sizes of the archive files in bytes
sizes="15582913665 1246920"

if [ -f $data/$part.tgz ]; then
  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tgz
  else
    echo "$data/$part.tgz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tgz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tgz; then
  echo "$0: error un-tarring archive $data/$part.tgz"
  exit 1;
fi
touch $data/$part/.complete

if [ $part == "data_aishell" ]; then
  cd $data/$part/wav
  for wav in ./*.tar.gz; do
    echo "Extracting wav from $wav"
    tar -zxf $wav && rm $wav
  done
fi

echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"

if $remove_archive; then
  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
  rm $data/$part.tgz
fi
exit 0;
16_4013_20170819121429.wav
18_1565_20170712000170.wav
#!/bin/bash
# Copyright 2019 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/magicdata data/magicdata"
  exit 1;
fi

corpus=$1
data=$2

if [ ! -d $corpus/train ] || [ ! -d $corpus/dev ] || [ ! -d $corpus/test ]; then
  echo "Error: $0 requires complete corpus"
  exit 1;
fi

echo "**** Creating magicdata data folder ****"
mkdir -p $data/{train,dev,test,tmp}

# find wav audio files for train, dev and test resp.
tmp_dir=$data/tmp
find $corpus -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 609552 ] && \
  echo "Warning: expected 609552 data files, found $n"

for x in train dev test; do
  grep -i "/$x/" $tmp_dir/wav.flist > $data/$x/wav.flist || exit 1;
  echo "Filtering data using found wav list and provided transcript for $x"
  # drop utterances on the bad list (gensub requires GNU awk)
  awk -F '.wav' '{print $1}' local/magicdata_badlist | tools/filter_scp.pl --exclude -f 1 - \
    <(cat $data/$x/wav.flist | awk -F '/' '{print gensub(".wav", "", "g", $NF), $0}') \
    > $data/$x/wav.scp
  sed '1d' $corpus/$x/TRANS.txt | awk -F '\t' '{print gensub(".wav","","g",$1), $2}' > $data/$x/utt2spk
  # strip half- and full-width punctuation (including the full-width space U+3000)
  sed '1d' $corpus/$x/TRANS.txt | awk -F '\t' '{print gensub(".wav","","g",$1), $3}' |\
    sed 's/!//g' | sed 's/?//g' |\
    sed 's/,//g' | sed 's/-//g' |\
    sed 's/://g' | sed 's/;//g' |\
    sed 's/　//g' | sed 's/。//g' |\
    sed 's/`//g' | sed 's/，//g' |\
    sed 's/：//g' | sed 's/？//g' |\
    sed 's/\///g' | sed 's/·//g' |\
    sed 's/\"//g' | sed 's/“//g' |\
    sed 's/”//g' | sed 's/\\//g' |\
    sed 's/…//g' | sed "s/‘//g" |\
    sed 's/、//g' | sed "s/’//g" | sed 's/《//g' | sed 's/》//g' |\
    sed 's/\[//g' | sed 's/\]//g' | sed 's/FIL//g' | sed 's/SPK//' |\
    tr '[a-z]' '[A-Z]' |\
    awk '{if (NF > 1) print $0;}' > $data/$x/text
  for file in wav.scp utt2spk text; do
    sort $data/$x/$file -o $data/$x/$file
  done
  tools/utt2spk_to_spk2utt.pl $data/$x/utt2spk > $data/$x/spk2utt
done

# rm -r $tmp_dir
tools/fix_data_dir.sh $data/train || exit 1;
tools/fix_data_dir.sh $data/dev || exit 1;
tools/fix_data_dir.sh $data/test || exit 1;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2019 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/68 train_set"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: train_set, dev_set, test_set."
  exit 1;
fi

data=$1
url=$2
part=$3
part1=$(echo $part | sed s/_set//)

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

part_ok=false
list="train_set dev_set test_set"
for x in $list; do
  if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/$part1/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# sizes of the archive files in bytes
sizes="52627842921 1035537823 2201936013"

if [ -f $data/$part.tar.gz ]; then
  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tar.gz
  else
    echo "$data/$part.tar.gz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tar.gz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tar.gz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tar.gz; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1;
fi
touch $data/$part1/.complete

echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"

if $remove_archive; then
  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
  rm $data/$part.tar.gz
fi
exit 0;
#!/bin/bash
# Copyright 2019 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/primewords data/primewords"
  exit 1;
fi

corpus=$1/primewords_md_2018_set1
data=$2

if [ ! -d $corpus/audio_files ] || [ ! -f $corpus/set1_transcript.json ]; then
  echo "Error: $0 requires complete corpus"
  exit 1;
fi

echo "**** Creating primewords data folder ****"
mkdir -p $data/train

# find wav audio files for train
find $corpus -iname "*.wav" > $data/wav.flist
n=$(wc -l < $data/wav.flist)
[ $n -ne 50384 ] && \
  echo "Warning: expected 50384 data files, found $n"

echo "Filtering data using found wav list and provided transcript"
local/primewords_parse_transcript.py $data/wav.flist $corpus/set1_transcript.json $data/train
cat $data/train/transcripts.txt |\
  awk '{if (NF > 1) print $0;}' > $data/train/text

for file in wav.scp utt2spk text; do
  sort $data/train/$file -o $data/train/$file
done
tools/utt2spk_to_spk2utt.pl $data/train/utt2spk > $data/train/spk2utt

# rm $data/wav.flist
tools/validate_data_dir.sh --no-feats $data/train || exit 1;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 2 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/47"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  exit 1;
fi

data=$1
url=$2
part=primewords_md_2018_set1

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# size of the archive file in bytes
sizes="9057625192"

if [ -f $data/$part.tar.gz ]; then
  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tar.gz
  else
    echo "$data/$part.tar.gz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tar.gz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tar.gz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tar.gz; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1;
fi
touch $data/.complete

echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"

if $remove_archive; then
  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
  rm $data/$part.tar.gz
fi
exit 0;
#!/usr/bin/env python3
import os
import sys
import json
def main(argv):
    # argv: [wav_flist, transcript_json, output_dir]
    fp = open(argv[1], encoding="utf-8")
    js = json.load(fp)
    fp.close()

    # index the transcript entries by wav file name
    metas = {}
    for ele in js:
        fname = ele['file']
        metas[fname] = ele

    fWavScp = open(os.path.join(argv[2], 'wav.scp'), 'w')
    fText = open(os.path.join(
        argv[2], 'transcripts.txt'), 'w', encoding="utf-8")
    fUtt2Spk = open(os.path.join(argv[2], 'utt2spk'), 'w')

    for line in open(argv[0]):
        fpath = line.strip('\r\n')
        wname = os.path.basename(fpath)
        meta = metas[wname]
        spkid = 'P' + meta['user_id']
        uttid = spkid + '-' + meta['id']
        fWavScp.write(uttid + ' ' + fpath + '\n')
        fText.write(uttid + ' ' + meta['text'] + '\n')
        fUtt2Spk.write(uttid + ' ' + spkid + '\n')

    fWavScp.close()
    fText.close()
    fUtt2Spk.close()


if __name__ == "__main__":
    main(sys.argv[1:])
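# For illustration, each entry of set1_transcript.json is expected to carry the
# keys read above; a hypothetical entry and the lines it would produce:
#
#   {"file": "a1b2c3.wav", "user_id": "092", "id": "a1b2c3", "text": "..."}
#
#   wav.scp : P092-a1b2c3 /path/to/audio_files/.../a1b2c3.wav
#   utt2spk : P092-a1b2c3 P092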
#!/bin/bash
# Copyright 2019 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/stcmds data/stcmds"
  exit 1;
fi

corpus=$1/ST-CMDS-20170001_1-OS
data=$2

if [ ! -d $corpus ]; then
  echo "Error: $0 requires complete corpus"
  exit 1;
fi

echo "**** Creating ST-CMDS data folder ****"
mkdir -p $data/train

# find wav audio files for train
find $corpus -iname "*.wav" > $data/wav.list
n=$(wc -l < $data/wav.list)
[ $n -ne 102600 ] && \
  echo "Warning: expected 102600 data files, found $n"

cat $data/wav.list | awk -F'20170001' '{print $NF}' | awk -F'.' '{print $1}' > $data/utt.list
cat $data/utt.list | awk '{print substr($1,1,6)}' > $data/spk.list
# each wav has a sibling .txt file holding its transcript
while read line; do
  tn=$(dirname $line)/$(basename $line .wav).txt
  cat $tn; echo;
done < $data/wav.list > $data/text.list

paste -d' ' $data/utt.list $data/wav.list > $data/train/wav.scp
paste -d' ' $data/utt.list $data/spk.list > $data/train/utt2spk
# strip full-width commas and upper-case any Latin characters
paste -d' ' $data/utt.list $data/text.list |\
  sed 's/，//g' |\
  tr '[a-z]' '[A-Z]' |\
  awk '{if (NF > 1) print $0;}' > $data/train/text

for file in wav.scp utt2spk text; do
  sort $data/train/$file -o $data/train/$file
done
tools/utt2spk_to_spk2utt.pl $data/train/utt2spk > $data/train/spk2utt

# rm $data/{wav,utt,spk,text}.list
tools/validate_data_dir.sh --no-feats $data/train || exit 1;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 2 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/38"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  exit 1;
fi

data=$1
url=$2
part=ST-CMDS-20170001_1-OS

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# size of the archive file in bytes
sizes="8231662593"

if [ -f $data/$part.tar.gz ]; then
  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tar.gz
  else
    echo "$data/$part.tar.gz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tar.gz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tar.gz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tar.gz; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1;
fi
touch $data/.complete

echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"

if $remove_archive; then
  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
  rm $data/$part.tar.gz
fi
exit 0;
#!/bin/bash
# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/aisolution_data data/tal_asr"
  exit 1;
fi

tal_audio_dir=$1/wav/
tal_text=$1/transcript/transcript.txt
data=$2
train_dir=$data/local/train
dev_dir=$data/local/dev
test_dir=$data/local/test
tmp_dir=$data/local/tmp

mkdir -p $train_dir
mkdir -p $dev_dir
mkdir -p $test_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $tal_audio_dir ] || [ ! -f $tal_text ]; then
  echo "Error: $0 requires $tal_audio_dir and $tal_text to exist"
  exit 1;
fi

echo "**** Creating TAL-ASR data folder ****"

# find wav audio files for train, dev and test resp.
find $tal_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 31747 ] && \
  echo "Warning: expected 31747 data files, found $n"

grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
rm -r $tmp_dir

# Transcription preparation
for dir in $train_dir $dev_dir $test_dir; do
  echo Preparing $dir transcriptions
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF, "TALASR"$(NF-1)"-"$NF}' > $dir/utt_uttid
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print "TALASR"$(NF-1)"-"$NF, "TALASR"$(NF-1)}' > $dir/utt2spk
  paste -d ' ' <(awk '{print $2}' $dir/utt_uttid) $dir/wav.flist > $dir/wav.scp
  # map full-width Ａ to half-width; strip punctuation and trailing spaces
  tools/filter_scp.pl -f 1 $dir/utt.list $tal_text | \
    sed 's/Ａ/A/g' | sed 's/#//g' | sed 's/=//g' | sed 's/、//g' | \
    sed 's/，//g' | sed 's/？//g' | sed 's/。//g' | sed 's/[ ][ ]*$//g' \
    > $dir/transcripts.txt
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  paste -d " " <(sort -u -k 1 $dir/utt_uttid | awk '{print $2}') \
    <(sort -u -k 1 $dir/transcripts.txt | awk '{for(i=2;i<NF;i++) {printf($i" ")}printf($NF"\n") }') \
    > $dir/text
  tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p $data/train $data/dev $data/test
for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f $data/train/$f || exit 1;
  cp $dev_dir/$f $data/dev/$f || exit 1;
  cp $test_dir/$f $data/test/$f || exit 1;
done

tools/fix_data_dir.sh $data/train || exit 1;
tools/fix_data_dir.sh $data/dev || exit 1;
tools/fix_data_dir.sh $data/test || exit 1;

echo "$0: TAL-ASR data preparation succeeded"
exit 0;
#!/bin/bash
# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/TAL_CSASR data/tal_mix"
  exit 1;
fi

tal_mix_audio_dir=$1/cs_wav
tal_mix_text=$1/label
data=$2
train_dir=$data/local/train
tmp_dir=$data/local/tmp

mkdir -p $train_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $tal_mix_audio_dir ] || [ ! -f $tal_mix_text ]; then
  echo "Error: $0 requires $tal_mix_audio_dir and $tal_mix_text to exist"
  exit 1;
fi

echo "**** Creating TAL mix data folder ****"

# find wav audio files for train
find $tal_mix_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 370000 ] && \
  echo "Warning: expected 370000 data files, found $n"
# rm -r $tmp_dir

# Transcription preparation
echo Preparing transcriptions
sed -e 's/\.wav//' $tmp_dir/wav.flist | awk -F '/' '{print $NF}' > $train_dir/utt.list
sed -e 's/\.wav//' $tmp_dir/wav.flist | awk -F '/' '{printf("%s %s\n",$NF,$NF)}' > $train_dir/utt2spk
paste -d' ' $train_dir/utt.list $tmp_dir/wav.flist > $train_dir/wav.scp
cat $tal_mix_text | grep -Ev '^\s*$' | awk '{if(NF>1) print $0}' > $train_dir/transcript.txt
# cp $tal_mix_text $train_dir
wc -l $train_dir/transcript.txt

echo filtering
# map full-width characters to half-width and strip punctuation
tools/filter_scp.pl -f 1 $train_dir/utt.list $train_dir/transcript.txt | \
  sed 's/Ａ/A/g' | sed 's/Ｃ/C/g' | sed 's/Ｄ/D/g' | sed 's/Ｇ/G/g' | \
  sed 's/Ｈ/H/g' | sed 's/Ｕ/U/g' | sed 's/Ｙ/Y/g' | sed 's/ａ/a/g' | \
  sed 's/Ｉ/I/g' | sed 's/#//g' | sed 's/=//g' | sed 's/;//g' | \
  sed 's/，//g' | sed 's/？//g' | sed 's/。//g' | sed 's/\///g' | \
  sed 's/！//g' | sed 's/!//g' | sed 's/\.//g' | sed 's/\?//g' | \
  sed 's/：//g' | sed 's/,//g' | sed 's/\"//g' | sed 's/://g' | \
  sed 's/@//g' | sed 's/-/ /g' | sed 's/、/ /g' | sed 's/~/ /g' | \
  sed "s/‘/\'/g" | sed 's/Ｅ/E/g' | sed "s/’/\'/g" | sed 's/《//g' | sed 's/》//g' | \
  sed "s/[ ][ ]*$//g" | sed "s/\[//g" | sed 's/、//g' > $train_dir/text

tools/utt2spk_to_spk2utt.pl $train_dir/utt2spk > $train_dir/spk2utt

mkdir -p $data/train
for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f $data/train/$f || exit 1;
done

tools/fix_data_dir.sh $data/train || exit 1;
echo "$0: TAL mix data preparation succeeded"
exit 0;