Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
FunASR
Commits
431278fa
Commit
431278fa
authored
Nov 22, 2024
by
“change”
Browse files
Initial commit
parent
8c252776
Pipeline
#1949
failed with stages
in 0 seconds
Changes
788
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1318 additions
and
0 deletions
+1318
-0
examples/aishell/branchformer/infer.sh
examples/aishell/branchformer/infer.sh
+12
-0
examples/aishell/branchformer/local/aishell_data_prep.sh
examples/aishell/branchformer/local/aishell_data_prep.sh
+66
-0
examples/aishell/branchformer/local/download_and_untar.sh
examples/aishell/branchformer/local/download_and_untar.sh
+105
-0
examples/aishell/branchformer/run.sh
examples/aishell/branchformer/run.sh
+203
-0
examples/aishell/branchformer/utils
examples/aishell/branchformer/utils
+1
-0
examples/aishell/conformer/README.md
examples/aishell/conformer/README.md
+16
-0
examples/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml
...les/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml
+110
-0
examples/aishell/conformer/conf/conformer_rwkv.yaml
examples/aishell/conformer/conf/conformer_rwkv.yaml
+124
-0
examples/aishell/conformer/demo_infer.sh
examples/aishell/conformer/demo_infer.sh
+1
-0
examples/aishell/conformer/demo_train_or_finetune.sh
examples/aishell/conformer/demo_train_or_finetune.sh
+1
-0
examples/aishell/conformer/local/aishell_data_prep.sh
examples/aishell/conformer/local/aishell_data_prep.sh
+66
-0
examples/aishell/conformer/local/download_and_untar.sh
examples/aishell/conformer/local/download_and_untar.sh
+105
-0
examples/aishell/conformer/run.sh
examples/aishell/conformer/run.sh
+202
-0
examples/aishell/conformer/utils
examples/aishell/conformer/utils
+1
-0
examples/aishell/e_branchformer/README.md
examples/aishell/e_branchformer/README.md
+14
-0
examples/aishell/e_branchformer/conf/e_branchformer_12e_6d_2048_256.yaml
...l/e_branchformer/conf/e_branchformer_12e_6d_2048_256.yaml
+118
-0
examples/aishell/e_branchformer/demo_infer.sh
examples/aishell/e_branchformer/demo_infer.sh
+1
-0
examples/aishell/e_branchformer/demo_train_or_finetune.sh
examples/aishell/e_branchformer/demo_train_or_finetune.sh
+1
-0
examples/aishell/e_branchformer/local/aishell_data_prep.sh
examples/aishell/e_branchformer/local/aishell_data_prep.sh
+66
-0
examples/aishell/e_branchformer/local/download_and_untar.sh
examples/aishell/e_branchformer/local/download_and_untar.sh
+105
-0
No files found.
Too many changes to show.
To preserve performance only
788 of 788+
files are displayed.
Plain diff
Email patch
examples/aishell/branchformer/infer.sh
0 → 100644
View file @
431278fa
#!/usr/bin/env bash
# Decode one AISHELL-1 utterance with a trained checkpoint through
# funasr.bin.inference and write the result under ./outputs/debug.
#
# Every path below is an absolute path taken from the original author's
# environment; adjust them before running on another machine.
# NOTE(review): exp_dir points into examples/aishell/paraformer although this
# script sits in the branchformer example -- confirm this is intended.

# Experiment directory that holds config.yaml and the checkpoint to load.
exp_dir="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3"
# Prepared data directory that holds the token list and the CMVN file.
feats_data_dir="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data"
# Audio file to decode.
input_wav="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav"

# The final argument deliberately has no trailing backslash: a continuation
# with nothing after it silently swallows whatever line is appended later.
python -m funasr.bin.inference \
  --config-path="${exp_dir}" \
  --config-name="config.yaml" \
  ++init_param="${exp_dir}/model.pt.ep38" \
  ++tokenizer_conf.token_list="${feats_data_dir}/zh_token_list/char/tokens.txt" \
  ++frontend_conf.cmvn_file="${feats_data_dir}/train/am.mvn" \
  ++input="${input_wav}" \
  ++output_dir="./outputs/debug" \
  ++device="cuda:0"
examples/aishell/branchformer/local/aishell_data_prep.sh
0 → 100755
View file @
431278fa
#!/bin/bash
# Copyright 2017 Xingyu Na
# Apache 2.0
#
# Prepare AISHELL wav.scp and text files for the train, dev and test splits.
#
# Usage: aishell_data_prep.sh <audio-path> <text-path> <output-path>
#   <audio-path>   directory searched recursively for *.wav files; the split
#                  is chosen by matching wav/train, wav/dev, wav/test in the
#                  file path
#   <text-path>    directory containing aishell_transcript_v0.8.txt
#   <output-path>  results go to <output-path>/data/{train,dev,test}/{wav.scp,text};
#                  intermediates are kept in <output-path>/data/local/
#
# Must be run from a directory where utils/filter_scp.pl is reachable.
# Exits 1 on bad arguments, missing inputs, or any failed preparation step.

#. ./path.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 <audio-path> <text-path> <output-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
  exit 1
fi

aishell_audio_dir=$1
aishell_text=$2/aishell_transcript_v0.8.txt
output_dir=$3

train_dir=$output_dir/data/local/train
dev_dir=$output_dir/data/local/dev
test_dir=$output_dir/data/local/test
tmp_dir=$output_dir/data/local/tmp

# Validate the inputs before creating anything, so that a bad invocation does
# not leave an empty output tree behind.
if [ ! -d "$aishell_audio_dir" ] || [ ! -f "$aishell_text" ]; then
  echo "Error: $0 requires an existing <audio-path> directory and the file <text-path>/aishell_transcript_v0.8.txt"
  exit 1
fi

mkdir -p "$train_dir" "$dev_dir" "$test_dir" "$tmp_dir" || exit 1

# Find the wav files and assign them to train, dev and test by path.
find "$aishell_audio_dir" -iname "*.wav" > "$tmp_dir/wav.flist" || exit 1
n=$(wc -l < "$tmp_dir/wav.flist")
# A count mismatch is only reported; preparation continues regardless.
if [ "$n" -ne 141925 ]; then
  echo "Warning: expected 141925 data files, found $n"
fi

grep -i "wav/train" "$tmp_dir/wav.flist" > "$train_dir/wav.flist" || exit 1
grep -i "wav/dev" "$tmp_dir/wav.flist" > "$dev_dir/wav.flist" || exit 1
grep -i "wav/test" "$tmp_dir/wav.flist" > "$test_dir/wav.flist" || exit 1

rm -r "$tmp_dir"

# Transcriptions preparation
for dir in "$train_dir" "$dev_dir" "$test_dir"; do
  echo "Preparing $dir transcriptions"
  # Utterance id = file name with ".wav" removed.
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{print $NF}' \
    > "$dir/utt.list" || exit 1
  paste -d ' ' "$dir/utt.list" "$dir/wav.flist" > "$dir/wav.scp_all" || exit 1
  # Select the transcript lines for this split's utterances, then rebuild
  # utt.list from the selected lines so wav.scp is filtered by the same ids.
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$aishell_text" \
    > "$dir/transcripts.txt" || exit 1
  awk '{print $1}' "$dir/transcripts.txt" > "$dir/utt.list" || exit 1
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/wav.scp_all" \
    | sort -u > "$dir/wav.scp" || exit 1
  sort -u "$dir/transcripts.txt" > "$dir/text" || exit 1
done

mkdir -p "$output_dir/data/train" "$output_dir/data/dev" "$output_dir/data/test" || exit 1

for f in wav.scp text; do
  cp "$train_dir/$f" "$output_dir/data/train/$f" || exit 1
  cp "$dev_dir/$f" "$output_dir/data/dev/$f" || exit 1
  cp "$test_dir/$f" "$output_dir/data/test/$f" || exit 1
done

echo "$0: AISHELL data preparation succeeded"
exit 0
examples/aishell/branchformer/local/download_and_untar.sh
0 → 100755
View file @
431278fa
#!/usr/bin/env bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
#
# Download one AISHELL archive with wget and unpack it.
#
# Usage: download_and_untar.sh [--remove-archive] <data-base> <url-base> <corpus-part>
#   <data-base>    existing directory that receives <corpus-part>.tgz and its
#                  extracted contents
#   <url-base>     URL prefix; the archive is fetched from <url-base>/<corpus-part>.tgz
#   <corpus-part>  data_aishell or resource_aishell
#
# A marker file <data-base>/<corpus-part>/.complete is written after every
# extraction step has succeeded; when it already exists the script exits 0
# without doing anything. Exits 1 on any error.

remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  # Stop here: continuing with the wrong number of arguments would run the
  # remaining steps on empty or shifted values.
  exit 1
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

# Work with an absolute path from here on. The script changes directory
# several times below, after which a relative <data-base> would no longer
# name the same location.
data=$(cd "$data" && pwd) || exit 1
archive=$data/$part.tgz

part_ok=false
list="data_aishell resource_aishell"
for x in $list; do
  if [ "$part" == "$x" ]; then
    part_ok=true
  fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1
fi

if [ -f "$data/$part/.complete" ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

# sizes of the archive files in bytes.
sizes="15582913665 1246920"

# An existing archive is reused only when its size matches one of the known
# sizes; otherwise it is deleted and downloaded again below.
if [ -f "$archive" ]; then
  size=$(wc -c < "$archive")
  size=${size//[[:space:]]/} # some wc implementations pad the count
  size_ok=false
  for s in $sizes; do
    if [ "$s" == "$size" ]; then
      size_ok=true
    fi
  done
  if ! $size_ok; then
    echo "$0: removing existing file $archive because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm "$archive"
  else
    echo "$archive exists and appears to be complete."
  fi
fi

if [ ! -f "$archive" ]; then
  if ! command -v wget > /dev/null; then
    echo "$0: wget is not installed."
    exit 1
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  cd "$data" || exit 1
  # NOTE(review): --no-check-certificate disables TLS certificate validation
  # for the download; kept as in the original, confirm it is still needed.
  if ! wget --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url"
    exit 1
  fi
fi

cd "$data" || exit 1

if ! tar -xvzf "$part.tgz"; then
  echo "$0: error un-tarring archive $archive"
  exit 1
fi

# data_aishell contains further *.tar.gz archives under wav/; unpack each one
# and delete it once it has been extracted successfully.
if [ "$part" == "data_aishell" ]; then
  cd "$data/$part/wav" || exit 1
  for wav in ./*.tar.gz; do
    [ -e "$wav" ] || continue # the glob matched nothing
    echo "Extracting wav from $wav"
    if ! tar -zxf "$wav"; then
      echo "$0: error un-tarring archive $wav"
      exit 1
    fi
    rm "$wav"
  done
fi

# Written only now, so an interrupted or failed nested extraction is retried
# on the next run instead of being reported as already complete.
touch "$data/$part/.complete"

echo "$0: Successfully downloaded and un-tarred $archive"

if $remove_archive; then
  echo "$0: removing $archive file since --remove-archive option was supplied."
  rm "$archive"
fi

exit 0
examples/aishell/branchformer/run.sh
0 → 100755
View file @
431278fa
#!/usr/bin/env bash
# AISHELL recipe driver. Runs the stages numbered stage..stop_stage:
#   -1 data download         0 data preparation      1 CMVN generation
#    2 dictionary            3 LM training (no-op)   4 ASR training
#    5 inference and scoring
# The variables assigned before utils/parse_options.sh is sourced are the
# recipe's options; they are kept as plain assignments with their original
# names.
# Run from the example directory: local/, utils/, conf/ and ../../../funasr
# are referenced by relative path.

CUDA_VISIBLE_DEVICES="0,1,2,3"

# general configuration
feats_dir="../DATA" # feature output directory
exp_dir=$(pwd)
lang=zh
token_type=char
stage=0
stop_stage=5

# feature configuration
nj=32

inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=1

# data
raw_data=../raw_data
data_url=www.openslr.org/resources/33

# exp tag
tag="exp1"
workspace=$(pwd)

master_port=12345

. utils/parse_options.sh || exit 1

# Strict mode from here on: exit on error (-e), on use of an undefined
# variable (-u) and on a failure anywhere in a pipeline (pipefail).
set -e
set -u
set -o pipefail

train_set=train
valid_set=dev
test_sets="dev test"

config=branchformer_12e_6d_2048_256.yaml
model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"

if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
  echo "stage -1: Data Download"
  mkdir -p "${raw_data}"
  local/download_and_untar.sh "${raw_data}" "${data_url}" data_aishell
  local/download_and_untar.sh "${raw_data}" "${data_url}" resource_aishell
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  echo "stage 0: Data preparation"
  # Data preparation
  local/aishell_data_prep.sh "${raw_data}/data_aishell/wav" \
    "${raw_data}/data_aishell/transcript" "${feats_dir}"
  for x in train dev test; do
    x_dir="${feats_dir}/data/${x}"
    cp "${x_dir}/text" "${x_dir}/text.org"
    # Keep field 1 (the utterance id) and remove every space from the rest of
    # the line, then re-tokenize the result with text2token.py.
    paste -d " " \
      <(cut -f 1 -d " " "${x_dir}/text.org") \
      <(cut -f 2- -d " " "${x_dir}/text.org" | tr -d " ") \
      > "${x_dir}/text"
    utils/text2token.py -n 1 -s 1 "${x_dir}/text" > "${x_dir}/text.org"
    mv "${x_dir}/text.org" "${x_dir}/text"

    # convert wav.scp text to jsonl
    # The single quotes are part of the argument value and are passed through
    # literally, exactly as in the original command line.
    scp_file_list_arg="++scp_file_list='[\"${x_dir}/wav.scp\",\"${x_dir}/text\"]'"
    python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
      ++data_type_list='["source", "target"]' \
      ++jsonl_file_out="${x_dir}/audio_datasets.jsonl" \
      "${scp_file_list_arg}"
  done
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  echo "stage 1: Feature and CMVN Generation"
  # No backslash after the last argument: a trailing continuation would pull
  # the following 'fi' into this command.
  python ../../../funasr/bin/compute_audio_cmvn.py \
    --config-path "${workspace}/conf" \
    --config-name "${config}" \
    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json"
fi

token_list="${feats_dir}/data/${lang}_token_list/${token_type}/tokens.txt"
echo "dictionary: ${token_list}"
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  echo "stage 2: Dictionary Preparation"
  mkdir -p "${feats_dir}/data/${lang}_token_list/${token_type}/"

  echo "make a dictionary"
  # File layout: <blank>, <s>, </s>, the sorted unique tokens of the training
  # text, and finally <unk>.
  echo "<blank>" > "${token_list}"
  echo "<s>" >> "${token_list}"
  echo "</s>" >> "${token_list}"
  utils/text2token.py -s 1 -n 1 --space "" "${feats_dir}/data/${train_set}/text" \
    | cut -f 2- -d " " \
    | tr " " "\n" \
    | sort \
    | uniq \
    | grep -a -v -e '^\s*$' \
    | awk '{print $0}' >> "${token_list}"
  echo "<unk>" >> "${token_list}"
fi

# LM Training Stage
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  echo "stage 3: LM Training"
fi

# ASR Training Stage
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  echo "stage 4: ASR Training"

  mkdir -p "${exp_dir}/exp/${model_dir}"
  current_time=$(date "+%Y-%m-%d_%H-%M")
  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
  echo "log_file: ${log_file}"

  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
  # One training process per comma-separated entry in CUDA_VISIBLE_DEVICES.
  gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
  torchrun \
    --nnodes 1 \
    --nproc_per_node "${gpu_num}" \
    --master_port "${master_port}" \
    ../../../funasr/bin/train.py \
    --config-path "${workspace}/conf" \
    --config-name "${config}" \
    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
    ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
    ++tokenizer_conf.token_list="${token_list}" \
    ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
    ++output_dir="${exp_dir}/exp/${model_dir}" &> "${log_file}"
fi

# Testing Stage
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
  echo "stage 5: Inference"

  if [ "${inference_device}" == "cuda" ]; then
    # One job per listed GPU.
    nj=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
  else
    # Any other device value: keep nj jobs and give each one the id -1.
    inference_batch_size=1
    CUDA_VISIBLE_DEVICES=""
    for JOB in $(seq "${nj}"); do
      CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
    done
  fi

  for dset in ${test_sets}; do

    inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}"
    _logdir="${inference_dir}/logdir"
    echo "inference_dir: ${inference_dir}"

    mkdir -p "${_logdir}"
    data_dir="${feats_dir}/data/${dset}"
    key_file="${data_dir}/${inference_scp}"

    # One keys.<JOB>.scp per job; an array keeps each path a single argument.
    split_scps=()
    for JOB in $(seq "${nj}"); do
      split_scps+=("${_logdir}/keys.${JOB}.scp")
    done
    utils/split_scp.pl "${key_file}" "${split_scps[@]}"

    # Intentionally unquoted: word splitting turns "0,1,2" into (0 1 2).
    # shellcheck disable=SC2206
    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
    pids=()
    for JOB in $(seq "${nj}"); do
      {
        id=$((JOB - 1))
        gpuid=${gpuid_list_array[$id]}

        export CUDA_VISIBLE_DEVICES=${gpuid}
        python ../../../funasr/bin/inference.py \
          --config-path="${exp_dir}/exp/${model_dir}" \
          --config-name="config.yaml" \
          ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \
          ++tokenizer_conf.token_list="${token_list}" \
          ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
          ++input="${_logdir}/keys.${JOB}.scp" \
          ++output_dir="${inference_dir}/${JOB}" \
          ++device="${inference_device}" \
          ++ncpu=1 \
          ++disable_log=true \
          ++batch_size="${inference_batch_size}" &> "${_logdir}/log.${JOB}.txt"
      } &
      pids+=("$!")
    done

    # A bare 'wait' discards the jobs' exit codes; wait on each job so a
    # failed decode stops the recipe instead of scoring partial output.
    failed=0
    for JOB in $(seq "${nj}"); do
      wait "${pids[$((JOB - 1))]}" || failed=1
    done
    if [ "${failed}" -ne 0 ]; then
      echo "$0: inference failed for ${dset}, see ${_logdir}/log.*.txt" >&2
      exit 1
    fi

    mkdir -p "${inference_dir}/1best_recog"
    for f in token score text; do
      # Merge an output type only when it was produced. Job 1 is probed
      # explicitly rather than through a loop variable left over from above.
      if [ -f "${inference_dir}/1/1best_recog/${f}" ]; then
        for JOB in $(seq "${nj}"); do
          cat "${inference_dir}/${JOB}/1best_recog/${f}"
        done | sort -k1 > "${inference_dir}/1best_recog/${f}"
      fi
    done

    echo "Computing WER ..."
    python utils/postprocess_text_zh.py "${inference_dir}/1best_recog/text" \
      "${inference_dir}/1best_recog/text.proc"
    python utils/postprocess_text_zh.py "${data_dir}/text" \
      "${inference_dir}/1best_recog/text.ref"
    python utils/compute_wer.py "${inference_dir}/1best_recog/text.ref" \
      "${inference_dir}/1best_recog/text.proc" \
      "${inference_dir}/1best_recog/text.cer"
    tail -n 3 "${inference_dir}/1best_recog/text.cer"
  done

fi
\ No newline at end of file
examples/aishell/branchformer/utils
0 → 120000
View file @
431278fa
../paraformer/utils
\ No newline at end of file
examples/aishell/conformer/README.md
0 → 100644
View file @
431278fa
# Conformer Result
## Training Config
- Feature info: using 80-dim fbank, global CMVN, speed perturbation (0.9, 1.0, 1.1), SpecAugment
- Train info: lr 5e-4, batch_size 25000, 2 GPUs (Tesla V100), acc_grad 1, 50 epochs
- Train config: conf/train_asr_transformer.yaml
- LM config: LM was not used
- Model size: 46M
## Results (CER)
| testset | CER(%) |
|:-----------:|:-------:|
| dev | 4.42 |
| test | 4.87 |
\ No newline at end of file
examples/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml
0 → 100644
View file @
431278fa
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model
:
Conformer
model_conf
:
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
# encoder
encoder
:
ConformerEncoder
encoder_conf
:
output_size
:
256
# dimension of attention
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
attention_dropout_rate
:
0.0
input_layer
:
conv2d
# encoder architecture type
normalize_before
:
true
pos_enc_layer_type
:
rel_pos
selfattention_layer_type
:
rel_selfattn
activation_type
:
swish
macaron_style
:
true
use_cnn_module
:
true
cnn_module_kernel
:
15
# decoder
decoder
:
TransformerDecoder
decoder_conf
:
attention_heads
:
4
linear_units
:
2048
num_blocks
:
6
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# frontend related
frontend
:
WavFrontend
frontend_conf
:
fs
:
16000
window
:
hamming
n_mels
:
80
frame_length
:
25
frame_shift
:
10
lfr_m
:
1
lfr_n
:
1
specaug
:
SpecAug
specaug_conf
:
apply_time_warp
:
true
time_warp_window
:
5
time_warp_mode
:
bicubic
apply_freq_mask
:
true
freq_mask_width_range
:
-
0
-
30
num_freq_mask
:
2
apply_time_mask
:
true
time_mask_width_range
:
-
0
-
40
num_time_mask
:
2
train_conf
:
accum_grad
:
1
grad_clip
:
5
max_epoch
:
150
keep_nbest_models
:
10
log_interval
:
50
optim
:
adam
optim_conf
:
lr
:
0.0005
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
30000
dataset
:
AudioDataset
dataset_conf
:
index_ds
:
IndexDSJsonl
batch_sampler
:
EspnetStyleBatchSampler
batch_type
:
length
# example or length
batch_size
:
25000
# if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
max_token_length
:
2048
# filter samples if source_token_len+target_token_len > max_token_length,
buffer_size
:
1024
shuffle
:
True
num_workers
:
4
preprocessor_speech
:
SpeechPreprocessSpeedPerturb
preprocessor_speech_conf
:
speed_perturb
:
[
0.9
,
1.0
,
1.1
]
tokenizer
:
CharTokenizer
tokenizer_conf
:
unk_symbol
:
<unk>
ctc_conf
:
dropout_rate
:
0.0
ctc_type
:
builtin
reduce
:
true
ignore_nan_grad
:
true
normalize
:
null
examples/aishell/conformer/conf/conformer_rwkv.yaml
0 → 100644
View file @
431278fa
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model
:
Conformer
model_conf
:
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
# encoder
encoder
:
ConformerEncoder
encoder_conf
:
output_size
:
256
# dimension of attention
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
attention_dropout_rate
:
0.0
input_layer
:
conv2d
# encoder architecture type
normalize_before
:
true
pos_enc_layer_type
:
rel_pos
selfattention_layer_type
:
rel_selfattn
activation_type
:
swish
macaron_style
:
true
use_cnn_module
:
true
cnn_module_kernel
:
15
# decoder
decoder
:
TransformerRWKVDecoder
decoder_conf
:
attention_heads
:
4
linear_units
:
2048
num_blocks
:
6
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
input_layer
:
embed
rwkv_cfg
:
n_embd
:
256
dropout
:
0
head_size_a
:
64
ctx_len
:
512
dim_att
:
256
#${model_conf.rwkv_cfg.n_embd}
dim_ffn
:
null
head_size_divisor
:
4
n_layer
:
6
pre_ffn
:
0
ln0
:
false
ln1
:
false
init_rwkv
:
true
# frontend related
frontend
:
WavFrontend
frontend_conf
:
fs
:
16000
window
:
hamming
n_mels
:
80
frame_length
:
25
frame_shift
:
10
lfr_m
:
1
lfr_n
:
1
specaug
:
SpecAug
specaug_conf
:
apply_time_warp
:
true
time_warp_window
:
5
time_warp_mode
:
bicubic
apply_freq_mask
:
true
freq_mask_width_range
:
-
0
-
30
num_freq_mask
:
2
apply_time_mask
:
true
time_mask_width_range
:
-
0
-
40
num_time_mask
:
2
train_conf
:
accum_grad
:
1
grad_clip
:
5
max_epoch
:
150
keep_nbest_models
:
10
log_interval
:
50
optim
:
adam
optim_conf
:
lr
:
0.0005
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
30000
dataset
:
AudioDataset
dataset_conf
:
index_ds
:
IndexDSJsonl
batch_sampler
:
EspnetStyleBatchSampler
batch_type
:
length
# example or length
batch_size
:
25000
# if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
max_token_length
:
2048
# filter samples if source_token_len+target_token_len > max_token_length,
buffer_size
:
1024
shuffle
:
True
num_workers
:
4
preprocessor_speech
:
SpeechPreprocessSpeedPerturb
preprocessor_speech_conf
:
speed_perturb
:
[
0.9
,
1.0
,
1.1
]
tokenizer
:
CharTokenizer
tokenizer_conf
:
unk_symbol
:
<unk>
ctc_conf
:
dropout_rate
:
0.0
ctc_type
:
builtin
reduce
:
true
ignore_nan_grad
:
true
normalize
:
null
examples/aishell/conformer/demo_infer.sh
0 → 120000
View file @
431278fa
../paraformer/demo_infer.sh
\ No newline at end of file
examples/aishell/conformer/demo_train_or_finetune.sh
0 → 120000
View file @
431278fa
../paraformer/demo_train_or_finetune.sh
\ No newline at end of file
examples/aishell/conformer/local/aishell_data_prep.sh
0 → 100755
View file @
431278fa
#!/bin/bash
# Copyright 2017 Xingyu Na
# Apache 2.0
#
# Prepare AISHELL wav.scp and text files for the train, dev and test splits.
#
# Usage: aishell_data_prep.sh <audio-path> <text-path> <output-path>
#   <audio-path>   directory searched recursively for *.wav files; the split
#                  is chosen by matching wav/train, wav/dev, wav/test in the
#                  file path
#   <text-path>    directory containing aishell_transcript_v0.8.txt
#   <output-path>  results go to <output-path>/data/{train,dev,test}/{wav.scp,text};
#                  intermediates are kept in <output-path>/data/local/
#
# Must be run from a directory where utils/filter_scp.pl is reachable.
# Exits 1 on bad arguments, missing inputs, or any failed preparation step.

#. ./path.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 <audio-path> <text-path> <output-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
  exit 1
fi

aishell_audio_dir=$1
aishell_text=$2/aishell_transcript_v0.8.txt
output_dir=$3

train_dir=$output_dir/data/local/train
dev_dir=$output_dir/data/local/dev
test_dir=$output_dir/data/local/test
tmp_dir=$output_dir/data/local/tmp

# Validate the inputs before creating anything, so that a bad invocation does
# not leave an empty output tree behind.
if [ ! -d "$aishell_audio_dir" ] || [ ! -f "$aishell_text" ]; then
  echo "Error: $0 requires an existing <audio-path> directory and the file <text-path>/aishell_transcript_v0.8.txt"
  exit 1
fi

mkdir -p "$train_dir" "$dev_dir" "$test_dir" "$tmp_dir" || exit 1

# Find the wav files and assign them to train, dev and test by path.
find "$aishell_audio_dir" -iname "*.wav" > "$tmp_dir/wav.flist" || exit 1
n=$(wc -l < "$tmp_dir/wav.flist")
# A count mismatch is only reported; preparation continues regardless.
if [ "$n" -ne 141925 ]; then
  echo "Warning: expected 141925 data files, found $n"
fi

grep -i "wav/train" "$tmp_dir/wav.flist" > "$train_dir/wav.flist" || exit 1
grep -i "wav/dev" "$tmp_dir/wav.flist" > "$dev_dir/wav.flist" || exit 1
grep -i "wav/test" "$tmp_dir/wav.flist" > "$test_dir/wav.flist" || exit 1

rm -r "$tmp_dir"

# Transcriptions preparation
for dir in "$train_dir" "$dev_dir" "$test_dir"; do
  echo "Preparing $dir transcriptions"
  # Utterance id = file name with ".wav" removed.
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{print $NF}' \
    > "$dir/utt.list" || exit 1
  paste -d ' ' "$dir/utt.list" "$dir/wav.flist" > "$dir/wav.scp_all" || exit 1
  # Select the transcript lines for this split's utterances, then rebuild
  # utt.list from the selected lines so wav.scp is filtered by the same ids.
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$aishell_text" \
    > "$dir/transcripts.txt" || exit 1
  awk '{print $1}' "$dir/transcripts.txt" > "$dir/utt.list" || exit 1
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/wav.scp_all" \
    | sort -u > "$dir/wav.scp" || exit 1
  sort -u "$dir/transcripts.txt" > "$dir/text" || exit 1
done

mkdir -p "$output_dir/data/train" "$output_dir/data/dev" "$output_dir/data/test" || exit 1

for f in wav.scp text; do
  cp "$train_dir/$f" "$output_dir/data/train/$f" || exit 1
  cp "$dev_dir/$f" "$output_dir/data/dev/$f" || exit 1
  cp "$test_dir/$f" "$output_dir/data/test/$f" || exit 1
done

echo "$0: AISHELL data preparation succeeded"
exit 0
examples/aishell/conformer/local/download_and_untar.sh
0 → 100755
View file @
431278fa
#!/usr/bin/env bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
#
# Download one AISHELL archive with wget and unpack it.
#
# Usage: download_and_untar.sh [--remove-archive] <data-base> <url-base> <corpus-part>
#   <data-base>    existing directory that receives <corpus-part>.tgz and its
#                  extracted contents
#   <url-base>     URL prefix; the archive is fetched from <url-base>/<corpus-part>.tgz
#   <corpus-part>  data_aishell or resource_aishell
#
# A marker file <data-base>/<corpus-part>/.complete is written after every
# extraction step has succeeded; when it already exists the script exits 0
# without doing anything. Exits 1 on any error.

remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  # Stop here: continuing with the wrong number of arguments would run the
  # remaining steps on empty or shifted values.
  exit 1
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

# Work with an absolute path from here on. The script changes directory
# several times below, after which a relative <data-base> would no longer
# name the same location.
data=$(cd "$data" && pwd) || exit 1
archive=$data/$part.tgz

part_ok=false
list="data_aishell resource_aishell"
for x in $list; do
  if [ "$part" == "$x" ]; then
    part_ok=true
  fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1
fi

if [ -f "$data/$part/.complete" ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

# sizes of the archive files in bytes.
sizes="15582913665 1246920"

# An existing archive is reused only when its size matches one of the known
# sizes; otherwise it is deleted and downloaded again below.
if [ -f "$archive" ]; then
  size=$(wc -c < "$archive")
  size=${size//[[:space:]]/} # some wc implementations pad the count
  size_ok=false
  for s in $sizes; do
    if [ "$s" == "$size" ]; then
      size_ok=true
    fi
  done
  if ! $size_ok; then
    echo "$0: removing existing file $archive because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm "$archive"
  else
    echo "$archive exists and appears to be complete."
  fi
fi

if [ ! -f "$archive" ]; then
  if ! command -v wget > /dev/null; then
    echo "$0: wget is not installed."
    exit 1
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  cd "$data" || exit 1
  # NOTE(review): --no-check-certificate disables TLS certificate validation
  # for the download; kept as in the original, confirm it is still needed.
  if ! wget --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url"
    exit 1
  fi
fi

cd "$data" || exit 1

if ! tar -xvzf "$part.tgz"; then
  echo "$0: error un-tarring archive $archive"
  exit 1
fi

# data_aishell contains further *.tar.gz archives under wav/; unpack each one
# and delete it once it has been extracted successfully.
if [ "$part" == "data_aishell" ]; then
  cd "$data/$part/wav" || exit 1
  for wav in ./*.tar.gz; do
    [ -e "$wav" ] || continue # the glob matched nothing
    echo "Extracting wav from $wav"
    if ! tar -zxf "$wav"; then
      echo "$0: error un-tarring archive $wav"
      exit 1
    fi
    rm "$wav"
  done
fi

# Written only now, so an interrupted or failed nested extraction is retried
# on the next run instead of being reported as already complete.
touch "$data/$part/.complete"

echo "$0: Successfully downloaded and un-tarred $archive"

if $remove_archive; then
  echo "$0: removing $archive file since --remove-archive option was supplied."
  rm "$archive"
fi

exit 0
examples/aishell/conformer/run.sh
0 → 100755
View file @
431278fa
#!/usr/bin/env bash
# AISHELL recipe driver. Runs the stages numbered stage..stop_stage:
#   -1 data download         0 data preparation      1 CMVN generation
#    2 dictionary            3 LM training (no-op)   4 ASR training
#    5 inference and scoring
# The variables assigned before utils/parse_options.sh is sourced are the
# recipe's options; they are kept as plain assignments with their original
# names.
# Run from the example directory: local/, utils/, conf/ and ../../../funasr
# are referenced by relative path.

CUDA_VISIBLE_DEVICES="0,1"

# general configuration
feats_dir="../DATA" # feature output directory
exp_dir=$(pwd)
lang=zh
token_type=char
stage=0
stop_stage=5

# feature configuration
nj=32

# NOTE(review): stage 5 takes its GPU branch only when this is exactly "cuda";
# any other value, including "cuda:0", goes through the branch that assigns
# device id -1 to every job -- confirm that is intended.
inference_device="cuda" #"cpu", "cuda:0", "cuda:1"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=1

# data
raw_data=../raw_data
data_url=www.openslr.org/resources/33

# exp tag
tag="exp1"
workspace=$(pwd)

master_port=12345

. utils/parse_options.sh || exit 1

# Strict mode from here on: exit on error (-e), on use of an undefined
# variable (-u) and on a failure anywhere in a pipeline (pipefail).
set -e
set -u
set -o pipefail

train_set=train
valid_set=dev
test_sets="dev test"

config=conformer_12e_6d_2048_256.yaml
model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"

if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
  echo "stage -1: Data Download"
  mkdir -p "${raw_data}"
  local/download_and_untar.sh "${raw_data}" "${data_url}" data_aishell
  local/download_and_untar.sh "${raw_data}" "${data_url}" resource_aishell
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  echo "stage 0: Data preparation"
  # Data preparation
  local/aishell_data_prep.sh "${raw_data}/data_aishell/wav" \
    "${raw_data}/data_aishell/transcript" "${feats_dir}"
  for x in train dev test; do
    x_dir="${feats_dir}/data/${x}"
    cp "${x_dir}/text" "${x_dir}/text.org"
    # Keep field 1 (the utterance id) and remove every space from the rest of
    # the line, then re-tokenize the result with text2token.py.
    paste -d " " \
      <(cut -f 1 -d " " "${x_dir}/text.org") \
      <(cut -f 2- -d " " "${x_dir}/text.org" | tr -d " ") \
      > "${x_dir}/text"
    utils/text2token.py -n 1 -s 1 "${x_dir}/text" > "${x_dir}/text.org"
    mv "${x_dir}/text.org" "${x_dir}/text"

    # convert wav.scp text to jsonl
    # The single quotes are part of the argument value and are passed through
    # literally, exactly as in the original command line.
    scp_file_list_arg="++scp_file_list='[\"${x_dir}/wav.scp\",\"${x_dir}/text\"]'"
    python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
      ++data_type_list='["source", "target"]' \
      ++jsonl_file_out="${x_dir}/audio_datasets.jsonl" \
      "${scp_file_list_arg}"
  done
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  echo "stage 1: Feature and CMVN Generation"
  # No backslash after the last argument: a trailing continuation would pull
  # the following 'fi' into this command.
  python ../../../funasr/bin/compute_audio_cmvn.py \
    --config-path "${workspace}/conf" \
    --config-name "${config}" \
    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json"
fi

token_list="${feats_dir}/data/${lang}_token_list/${token_type}/tokens.txt"
echo "dictionary: ${token_list}"
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  echo "stage 2: Dictionary Preparation"
  mkdir -p "${feats_dir}/data/${lang}_token_list/${token_type}/"

  echo "make a dictionary"
  # File layout: <blank>, <s>, </s>, the sorted unique tokens of the training
  # text, and finally <unk>.
  echo "<blank>" > "${token_list}"
  echo "<s>" >> "${token_list}"
  echo "</s>" >> "${token_list}"
  utils/text2token.py -s 1 -n 1 --space "" "${feats_dir}/data/${train_set}/text" \
    | cut -f 2- -d " " \
    | tr " " "\n" \
    | sort \
    | uniq \
    | grep -a -v -e '^\s*$' \
    | awk '{print $0}' >> "${token_list}"
  echo "<unk>" >> "${token_list}"
fi

# LM Training Stage
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  echo "stage 3: LM Training"
fi

# ASR Training Stage
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  echo "stage 4: ASR Training"

  mkdir -p "${exp_dir}/exp/${model_dir}"
  current_time=$(date "+%Y-%m-%d_%H-%M")
  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
  echo "log_file: ${log_file}"

  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
  # One training process per comma-separated entry in CUDA_VISIBLE_DEVICES.
  gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
  torchrun \
    --nnodes 1 \
    --nproc_per_node "${gpu_num}" \
    --master_port "${master_port}" \
    ../../../funasr/bin/train.py \
    --config-path "${workspace}/conf" \
    --config-name "${config}" \
    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
    ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
    ++tokenizer_conf.token_list="${token_list}" \
    ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
    ++output_dir="${exp_dir}/exp/${model_dir}" &> "${log_file}"
fi

# Testing Stage
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
  echo "stage 5: Inference"

  if [ "${inference_device}" == "cuda" ]; then
    # One job per listed GPU.
    nj=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
  else
    # Any other device value: keep nj jobs and give each one the id -1.
    inference_batch_size=1
    CUDA_VISIBLE_DEVICES=""
    for JOB in $(seq "${nj}"); do
      CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
    done
  fi

  for dset in ${test_sets}; do

    inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}"
    _logdir="${inference_dir}/logdir"
    echo "inference_dir: ${inference_dir}"

    mkdir -p "${_logdir}"
    data_dir="${feats_dir}/data/${dset}"
    key_file="${data_dir}/${inference_scp}"

    # One keys.<JOB>.scp per job; an array keeps each path a single argument.
    split_scps=()
    for JOB in $(seq "${nj}"); do
      split_scps+=("${_logdir}/keys.${JOB}.scp")
    done
    utils/split_scp.pl "${key_file}" "${split_scps[@]}"

    # Intentionally unquoted: word splitting turns "0,1,2" into (0 1 2).
    # shellcheck disable=SC2206
    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
    pids=()
    for JOB in $(seq "${nj}"); do
      {
        id=$((JOB - 1))
        gpuid=${gpuid_list_array[$id]}

        export CUDA_VISIBLE_DEVICES=${gpuid}
        python ../../../funasr/bin/inference.py \
          --config-path="${exp_dir}/exp/${model_dir}" \
          --config-name="config.yaml" \
          ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \
          ++tokenizer_conf.token_list="${token_list}" \
          ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
          ++input="${_logdir}/keys.${JOB}.scp" \
          ++output_dir="${inference_dir}/${JOB}" \
          ++device="${inference_device}" \
          ++ncpu=1 \
          ++disable_log=true \
          ++batch_size="${inference_batch_size}" &> "${_logdir}/log.${JOB}.txt"
      } &
      pids+=("$!")
    done

    # A bare 'wait' discards the jobs' exit codes; wait on each job so a
    # failed decode stops the recipe instead of scoring partial output.
    failed=0
    for JOB in $(seq "${nj}"); do
      wait "${pids[$((JOB - 1))]}" || failed=1
    done
    if [ "${failed}" -ne 0 ]; then
      echo "$0: inference failed for ${dset}, see ${_logdir}/log.*.txt" >&2
      exit 1
    fi

    mkdir -p "${inference_dir}/1best_recog"
    for f in token score text; do
      # Merge an output type only when it was produced. Job 1 is probed
      # explicitly rather than through a loop variable left over from above.
      if [ -f "${inference_dir}/1/1best_recog/${f}" ]; then
        for JOB in $(seq "${nj}"); do
          cat "${inference_dir}/${JOB}/1best_recog/${f}"
        done | sort -k1 > "${inference_dir}/1best_recog/${f}"
      fi
    done

    echo "Computing WER ..."
    python utils/postprocess_text_zh.py "${inference_dir}/1best_recog/text" \
      "${inference_dir}/1best_recog/text.proc"
    python utils/postprocess_text_zh.py "${data_dir}/text" \
      "${inference_dir}/1best_recog/text.ref"
    python utils/compute_wer.py "${inference_dir}/1best_recog/text.ref" \
      "${inference_dir}/1best_recog/text.proc" \
      "${inference_dir}/1best_recog/text.cer"
    tail -n 3 "${inference_dir}/1best_recog/text.cer"
  done

fi
examples/aishell/conformer/utils
0 → 120000
View file @
431278fa
../paraformer/utils
\ No newline at end of file
examples/aishell/e_branchformer/README.md
0 → 100644
View file @
431278fa
# E-Branchformer Result
## Training Config
- Feature info: raw speech input, 80-dim fbank extracted online, global CMVN, speed perturbation (0.9, 1.0, 1.1), SpecAugment
- Train info: lr 0.001, batch_size 10000, 4 GPUs (Tesla V100), acc_grad 1, 180 epochs
- Train config: conf/train_asr_e_branchformer.yaml
- LM config: LM was not used
## Results (CER)
| testset | CER(%) |
|:-----------:|:-------:|
| dev | 4.10 |
| test | 4.52 |
\ No newline at end of file
examples/aishell/e_branchformer/conf/e_branchformer_12e_6d_2048_256.yaml
0 → 100644
View file @
431278fa
# Example model configuration file; adjust the values to your own needs.
# To list every registered component name:
#   from funasr.register import tables
#   tables.print()

# network architecture
model: Branchformer
model_conf:
  ctc_weight: 0.3
  lsm_weight: 0.1  # label smoothing option
  length_normalized_loss: false

# encoder
encoder: EBranchformerEncoder
encoder_conf:
  output_size: 256
  attention_heads: 4
  attention_layer_type: rel_selfattn
  pos_enc_layer_type: rel_pos
  rel_pos_type: latest
  cgmlp_linear_units: 1024
  cgmlp_conv_kernel: 31
  use_linear_after_conv: false
  gate_activation: identity
  num_blocks: 12
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: conv2d
  layer_drop_rate: 0.0
  linear_units: 1024
  positionwise_layer_type: linear
  use_ffn: true
  macaron_ffn: true
  merge_conv_kernel: 31

# decoder
decoder: TransformerDecoder
decoder_conf:
  attention_heads: 4
  linear_units: 2048
  num_blocks: 6
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.
  src_attention_dropout_rate: 0.

# frontend related
frontend: WavFrontend
frontend_conf:
  fs: 16000
  window: hamming
  n_mels: 80
  frame_length: 25
  frame_shift: 10
  dither: 0.0
  lfr_m: 1
  lfr_n: 1

specaug: SpecAug
specaug_conf:
  apply_time_warp: true
  time_warp_window: 5
  time_warp_mode: bicubic
  apply_freq_mask: true
  freq_mask_width_range: [0, 30]
  num_freq_mask: 2
  apply_time_mask: true
  time_mask_width_range: [0, 40]
  num_time_mask: 2

train_conf:
  accum_grad: 1
  grad_clip: 5
  max_epoch: 180
  keep_nbest_models: 10
  log_interval: 50

optim: adam
optim_conf:
  lr: 0.001
  weight_decay: 0.000001

scheduler: warmuplr
scheduler_conf:
  warmup_steps: 35000

dataset: AudioDataset
dataset_conf:
  index_ds: IndexDSJsonl
  batch_sampler: EspnetStyleBatchSampler
  batch_type: length  # example or length
  # With batch_type "example", batch_size counts samples; with "length" it is
  # source_token_len + target_token_len.
  batch_size: 10000
  # Samples with source_token_len + target_token_len > max_token_length are
  # filtered out.
  max_token_length: 2048
  buffer_size: 1024
  shuffle: True
  num_workers: 4
  preprocessor_speech: SpeechPreprocessSpeedPerturb
  preprocessor_speech_conf:
    speed_perturb:
      - 0.9
      - 1.0
      - 1.1

tokenizer: CharTokenizer
tokenizer_conf:
  unk_symbol: <unk>

ctc_conf:
  dropout_rate: 0.0
  ctc_type: builtin
  reduce: true
  ignore_nan_grad: true

normalize: null
examples/aishell/e_branchformer/demo_infer.sh
0 → 120000
View file @
431278fa
../paraformer/demo_infer.sh
\ No newline at end of file
examples/aishell/e_branchformer/demo_train_or_finetune.sh
0 → 120000
View file @
431278fa
../paraformer/demo_train_or_finetune.sh
\ No newline at end of file
examples/aishell/e_branchformer/local/aishell_data_prep.sh
0 → 100755
View file @
431278fa
#!/bin/bash

# Copyright 2017 Xingyu Na
# Apache 2.0

#. ./path.sh || exit 1;

# Prepares wav.scp and text for the AISHELL train/dev/test splits.
#
# Arguments:
#   $1 - directory searched recursively for *.wav files
#   $2 - directory containing aishell_transcript_v0.8.txt
#   $3 - output root; results go to $3/data/{train,dev,test}
# Requires utils/filter_scp.pl relative to the current directory.
# Exits 1 on wrong usage, missing inputs, or a failed grep/cp step.

if [ $# != 3 ]; then
  echo "Usage: $0 <audio-path> <text-path> <output-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
  exit 1
fi

aishell_audio_dir=$1
aishell_text=$2/aishell_transcript_v0.8.txt
output_dir=$3

train_dir=$output_dir/data/local/train
dev_dir=$output_dir/data/local/dev
test_dir=$output_dir/data/local/test
tmp_dir=$output_dir/data/local/tmp

mkdir -p "$train_dir"
mkdir -p "$dev_dir"
mkdir -p "$test_dir"
mkdir -p "$tmp_dir"

# data directory check
if [ ! -d "$aishell_audio_dir" ] || [ ! -f "$aishell_text" ]; then
  echo "Error: $0 requires two directory arguments"
  exit 1
fi

# find wav audio file for train, dev and test resp.
find "$aishell_audio_dir" -iname "*.wav" >"$tmp_dir/wav.flist"
# Arithmetic expansion strips any padding some wc implementations emit.
n=$(($(wc -l <"$tmp_dir/wav.flist")))
if [ "$n" -ne 141925 ]; then
  # Only a warning: preparation continues with whatever was found.
  echo "Warning: expected 141925 data files, found $n"
fi

# Each split must contain at least one file, otherwise grep fails and we stop.
grep -i "wav/train" "$tmp_dir/wav.flist" >"$train_dir/wav.flist" || exit 1
grep -i "wav/dev" "$tmp_dir/wav.flist" >"$dev_dir/wav.flist" || exit 1
grep -i "wav/test" "$tmp_dir/wav.flist" >"$test_dir/wav.flist" || exit 1

rm -r "$tmp_dir"

# Transcriptions preparation
for dir in "$train_dir" "$dev_dir" "$test_dir"; do
  echo "Preparing $dir transcriptions"
  # Utterance id = file name without directory and without ".wav".
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{print $NF}' >"$dir/utt.list"
  paste -d ' ' "$dir/utt.list" "$dir/wav.flist" >"$dir/wav.scp_all"
  # Keep only transcripts whose id has audio ...
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$aishell_text" >"$dir/transcripts.txt"
  # ... then keep only audio whose id has a transcript.
  awk '{print $1}' "$dir/transcripts.txt" >"$dir/utt.list"
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/wav.scp_all" | sort -u >"$dir/wav.scp"
  sort -u "$dir/transcripts.txt" >"$dir/text"
done

mkdir -p "$output_dir/data/train" "$output_dir/data/dev" "$output_dir/data/test"

for f in wav.scp text; do
  cp "$train_dir/$f" "$output_dir/data/train/$f" || exit 1
  cp "$dev_dir/$f" "$output_dir/data/dev/$f" || exit 1
  cp "$test_dir/$f" "$output_dir/data/test/$f" || exit 1
done

echo "$0: AISHELL data preparation succeeded"
exit 0
examples/aishell/e_branchformer/local/download_and_untar.sh
0 → 100755
View file @
431278fa
#!/usr/bin/env bash

# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0

# Downloads <url-base>/<corpus-part>.tgz into <data-base> with wget (unless an
# archive of a known size is already there), un-tars it, and for data_aishell
# also extracts the per-speaker *.tar.gz files under <corpus-part>/wav.
#
# Arguments:
#   [--remove-archive] - delete the .tgz after a successful extraction
#   $1 - existing directory to download into
#   $2 - URL base
#   $3 - corpus part: data_aishell or resource_aishell
# Exits 0 on success or when <corpus-part>/.complete already exists,
# 1 on usage errors or a failed download/extraction.

remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  # Stop here instead of falling through with empty arguments.
  exit 1
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

# The script changes directory below and keeps using $data afterwards, so a
# relative <data-base> must be made absolute first.
data=$(cd "$data" && pwd) || exit 1

part_ok=false
list="data_aishell resource_aishell"
for x in $list; do
  if [ "$part" == "$x" ]; then
    part_ok=true
  fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1
fi

if [ -f "$data/$part/.complete" ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

# sizes of the archive files in bytes.
sizes="15582913665 1246920"

if [ -f "$data/$part.tgz" ]; then
  # Byte count of the existing archive; arithmetic expansion strips padding.
  size=$(($(wc -c <"$data/$part.tgz")))
  size_ok=false
  for s in $sizes; do
    if [ "$s" == "$size" ]; then
      size_ok=true
    fi
  done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm "$data/$part.tgz"
  else
    echo "$data/$part.tgz exists and appears to be complete."
  fi
fi

if [ ! -f "$data/$part.tgz" ]; then
  if ! command -v wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  cd "$data" || exit 1
  if ! wget --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url"
    exit 1
  fi
fi

cd "$data" || exit 1

if ! tar -xvzf "$part.tgz"; then
  echo "$0: error un-tarring archive $data/$part.tgz"
  exit 1
fi

touch "$data/$part/.complete"

if [ "$part" == "data_aishell" ]; then
  cd "$data/$part/wav" || exit 1
  for wav in ./*.tar.gz; do
    # Skip the literal pattern when nothing matches.
    [ -e "$wav" ] || continue
    echo "Extracting wav from $wav"
    # The inner archive is deleted only if its extraction succeeded.
    tar -zxf "$wav" && rm "$wav"
  done
fi

echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"

if $remove_archive; then
  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
  rm "$data/$part.tgz"
fi

exit 0
Prev
1
2
3
4
5
6
7
8
9
…
40
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment