ModelZoo / Conformer_pytorch / Commits / a7785cc6

Commit a7785cc6, authored Mar 26, 2024 by Sugon_ldc

    delete soft link

Parent: 9a2a05ca
Changes: 162
Showing 20 changed files with 3168 additions and 1 deletion (+3168 / -1)
examples/aishell/s0/tools                                       (+0, -1)
examples/aishell/s0/tools/alignment.sh                          (+49, -0)
examples/aishell/s0/tools/analyze_dataset.py                    (+248, -0)
examples/aishell/s0/tools/cmvn_kaldi2json.py                    (+37, -0)
examples/aishell/s0/tools/combine_data.sh                       (+146, -0)
examples/aishell/s0/tools/compute-cer.py                        (+532, -0)
examples/aishell/s0/tools/compute-wer.py                        (+500, -0)
examples/aishell/s0/tools/compute_cmvn_stats.py                 (+141, -0)
examples/aishell/s0/tools/compute_fbank_feats.py                (+128, -0)
examples/aishell/s0/tools/copy_data_dir.sh                      (+147, -0)
examples/aishell/s0/tools/data/remove_dup_utts.sh               (+66, -0)
examples/aishell/s0/tools/data/split_scp.pl                     (+246, -0)
examples/aishell/s0/tools/decode.sh                             (+89, -0)
examples/aishell/s0/tools/feat_to_shape.sh                      (+73, -0)
examples/aishell/s0/tools/filter_scp.pl                         (+87, -0)
examples/aishell/s0/tools/fix_data_dir.sh                       (+217, -0)
examples/aishell/s0/tools/flake8_hook.py                        (+13, -0)
examples/aishell/s0/tools/format_data.sh                        (+166, -0)
examples/aishell/s0/tools/fst/add_lex_disambig.pl               (+195, -0)
examples/aishell/s0/tools/fst/compile_lexicon_token_fst.sh      (+88, -0)
examples/aishell/s0/tools  (deleted symlink, mode 120000 → 0)

    ../../../tools/
    \ No newline at end of file
examples/aishell/s0/tools/alignment.sh  (0 → 100644)

#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;

stage=0 # start from 0 if you need to start from data preparation
stop_stage=0
nj=16
feat_dir=raw_wav
dict=data/dict/lang_char.txt

dir=exp/
config=$dir/train.yaml
checkpoint=
checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt
config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml
set=
ali_format=$feat_dir/$set/format.data
ali_format=format.data
ali_result=$dir/ali

. tools/parse_options.sh || exit 1;

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  nj=32
  # Prepare required data for ctc alignment
  echo "Prepare data, prepare required format"
  for x in $set; do
    tools/format_data.sh --nj ${nj} \
      --feat-type wav --feat $feat_dir/$x/wav.scp \
      $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp
  done
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # Test model, please specify the model you want to use by --checkpoint
  python wenet/bin/alignment_deprecated.py --gpu -1 \
    --config $config \
    --input_file $ali_format \
    --checkpoint $checkpoint \
    --batch_size 1 \
    --dict $dict \
    --result_file $ali_result
fi
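Because the script sources tools/parse_options.sh, the variables at the top can be overridden from the command line in the usual Kaldi/WeNet style. A sketch of an invocation, with checkpoint and config paths purely illustrative:

  bash tools/alignment.sh --stage 0 --stop_stage 0 \
    --config exp/transformer/train.yaml \
    --checkpoint exp/transformer/avg_20.pt \
    --dict data/dict/lang_char.txt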
examples/aishell/s0/tools/analyze_dataset.py  (0 → 100755)

#!/usr/bin/env python3
# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Analyze Dataset, Duration/TextLength/Speed etc.
Usage:
    . ./path.sh && python3 tools/analyze_dataset.py \
        --data_type "shard" \
        --data_list data/test/data.list \
        --output_dir exp/analyze_test \
        --num_thread 32
"""

import os
import json
import math
import time
import numpy
import logging
import librosa
import tarfile
import argparse
import torchaudio
import multiprocessing

from wenet.utils.file_utils import read_lists
from wenet.dataset.processor import AUDIO_FORMAT_SETS


def get_args():
    parser = argparse.ArgumentParser(description='Analyze dataset')
    parser.add_argument('--data_type',
                        default='wav_scp',
                        choices=['wav_scp', 'raw', 'shard'],
                        help='dataset type')
    parser.add_argument('--output_dir',
                        type=str,
                        default="exp",
                        help='write info to output dir')
    parser.add_argument('--data_list',
                        default=None,
                        help='used in raw/shard mode')
    parser.add_argument('--wav_scp',
                        default=None,
                        help='used in wav_scp mode')
    parser.add_argument('--text',
                        default=None,
                        help='used in wav_scp mode')
    parser.add_argument('--num_thread',
                        type=int,
                        default=4,
                        help='number of threads')
    args = parser.parse_args()
    print(args)
    return args


def analyze(datas, output_file, thread_id):
    with open(output_file, "w", encoding='utf8') as f:
        for i, data in enumerate(datas):
            if type(data['wav']) is numpy.ndarray:
                y, sample_rate = data['wav'], data['sample_rate']
                data['wav'] = "None"  # NOTE(xcsong): Do not save wav.
            elif type(data['wav']) is str:
                y, sample_rate = librosa.load(data['wav'], sr=16000)
            data['dur'] = len(y) / sample_rate
            data['txt_length'] = len(data['txt'])
            data['speed'] = data['txt_length'] / data['dur']
            # Trim the beginning and ending silence
            _, index = librosa.effects.trim(y, top_db=30)
            data['leading_sil'] = librosa.get_duration(
                y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0
            data['trailing_sil'] = librosa.get_duration(
                y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0
            data_str = json.dumps(data, ensure_ascii=False)
            f.write("{}\n".format(data_str))
            if thread_id == 0 and i % 100 == 0:
                logging.info("\tThread-{}: processed {}/{}".format(
                    thread_id, i, len(datas)))


def read_tar(file):
    try:
        with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream:
            prev_prefix = None
            data = {}
            valid = True
            for tarinfo in stream:
                name = tarinfo.name
                pos = name.rfind('.')
                assert pos > 0
                prefix, postfix = name[:pos], name[pos + 1:]
                if prev_prefix is not None and prefix != prev_prefix:
                    data['key'] = prev_prefix
                    if valid:
                        yield data
                    data = {}
                    valid = True
                with stream.extractfile(tarinfo) as file_obj:
                    try:
                        if postfix == 'txt':
                            data['txt'] = file_obj.read().decode('utf8').strip()
                        elif postfix in AUDIO_FORMAT_SETS:
                            waveform, sample_rate = torchaudio.load(file_obj)
                            # single channel
                            data['wav'] = waveform.numpy()[0, :]
                            data['sample_rate'] = sample_rate
                        else:
                            data[postfix] = file_obj.read()
                    except Exception as ex:
                        valid = False
                        logging.warning('error: {} when parse {}'.format(ex, name))
                prev_prefix = prefix
            # The last data in tar
            if prev_prefix is not None:
                data['key'] = prev_prefix
                yield data
    except Exception as ex:
        logging.warning('tar_file error: {} when processing {}'.format(ex, file))


def main():
    start_time = time.time()
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.output_dir + "/partition", exist_ok=True)
    datas = [[] for i in range(args.num_thread)]

    logging.info("Stage-1: Loading data.list OR wav.scp...")
    if args.data_type == "shard":
        assert args.data_list is not None
        lists = read_lists(args.data_list)
        # partition
        total = 0
        for line in lists:
            for data in read_tar(line):
                datas[total % args.num_thread].append(data)
                total = total + 1
    elif args.data_type == "raw":
        assert args.data_list is not None
        lists = read_lists(args.data_list)
        # partition
        for i, line in enumerate(lists):
            data = json.loads(line)
            datas[i % args.num_thread].append(data)
    elif args.data_type == "wav_scp":
        assert args.wav_scp is not None
        assert args.text is not None
        wavs, texts = {}, {}
        # wavs
        for line in read_lists(args.wav_scp):
            line = line.strip().split()
            wavs[line[0]] = line[1]
        # texts
        for line in read_lists(args.text):
            line = line.strip().split(maxsplit=1)
            texts[line[0]] = line[1]
        sorted(wavs)
        sorted(texts)
        # partition
        for i, (key1, key2) in enumerate(zip(wavs, texts)):
            assert key1 == key2
            datas[i % args.num_thread].append(
                {'key': key1, "wav": wavs[key1], "txt": texts[key1]})

    logging.info("Stage-2: Start Analyze")
    # threads
    pool = multiprocessing.Pool(processes=args.num_thread)
    for i in range(args.num_thread):
        output_file = os.path.join(args.output_dir, "partition",
                                   "part-{}".format(i))
        pool.apply_async(analyze, (datas[i], output_file, i))
    pool.close()
    pool.join()

    logging.info("Stage-3: Sort and Write Result")
    datas = []
    for i in range(args.num_thread):
        output_file = os.path.join(args.output_dir, "partition",
                                   "part-{}".format(i))
        with open(output_file, "r", encoding='utf8') as f:
            for line in f.readlines():
                data = json.loads(line)
                datas.append(data)
    total_dur = sum([x['dur'] for x in datas])
    total_len = sum([x['txt_length'] for x in datas])
    total_leading_sil = sum([x['leading_sil'] for x in datas])
    total_trailing_sil = sum([x['trailing_sil'] for x in datas])
    num_datas = len(datas)
    names = ['key', 'dur', 'txt_length', 'speed', 'leading_sil', 'trailing_sil']
    units = ['', 's', '', 'char/s', 'ms', 'ms']
    avgs = [0, total_dur / num_datas, total_len / num_datas,
            total_len / total_dur, total_leading_sil / num_datas,
            total_trailing_sil / num_datas]
    stds = [0,
            sum([(x['dur'] - avgs[1])**2 for x in datas]),
            sum([(x['txt_length'] - avgs[2])**2 for x in datas]),
            sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]),
            sum([(x['leading_sil'] - avgs[4])**2 for x in datas]),
            sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])]
    stds = [math.sqrt(x / num_datas) for x in stds]
    parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min']
    index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75),
             int(num_datas * 0.50), int(num_datas * 0.25), 0]
    with open(args.output_dir + "/analyze_result_brief", "w",
              encoding='utf8') as f:
        for i, (name, unit, avg, std) in enumerate(zip(names, units, avgs, stds)):
            if name == 'key':
                continue
            f.write("==================\n")
            datas.sort(key=lambda x: x[name])
            for p, j in zip(parts, index):
                f.write("{} {}: {:.3f} {} (wav_id: {})\n".format(
                    p, name, datas[j][name], unit, datas[j]['key']))
            f.write("avg {}: {:.3f} {}\n".format(name, avg, unit))
            f.write("std {}: {:.3f}\n".format(name, std))
    os.system("cat {}".format(args.output_dir + "/analyze_result_brief"))

    datas.sort(key=lambda x: x['dur'])
    with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f:
        for data in datas:
            f.write("{}\n".format(json.dumps(data, ensure_ascii=False)))
    end_time = time.time()
    logging.info("Time Cost: {:.3f}s".format(end_time - start_time))


if __name__ == '__main__':
    main()
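Besides the shard-mode usage shown in the docstring, the argument parser also accepts a plain wav.scp/text pair. A sketch of that mode, with file paths purely illustrative:

  python3 tools/analyze_dataset.py --data_type wav_scp \
    --wav_scp data/test/wav.scp --text data/test/text \
    --output_dir exp/analyze_test --num_thread 16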
examples/aishell/s0/tools/cmvn_kaldi2json.py  (0 → 100755)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging
import sys
import json


def kaldi2json(kaldi_cmvn_file):
    means = []
    variance = []
    with open(kaldi_cmvn_file, 'r') as fid:
        # kaldi binary file start with '\0B'
        if fid.read(2) == '\0B':
            logging.error('kaldi cmvn binary file is not supported, please '
                          'recompute it by: compute-cmvn-stats --binary=false '
                          ' scp:feats.scp global_cmvn')
            sys.exit(1)
        fid.seek(0)
        arr = fid.read().split()
        assert (arr[0] == '[')
        assert (arr[-2] == '0')
        assert (arr[-1] == ']')
        feat_dim = int((len(arr) - 2 - 2) / 2)
        for i in range(1, feat_dim + 1):
            means.append(float(arr[i]))
        count = float(arr[feat_dim + 1])
        for i in range(feat_dim + 2, 2 * feat_dim + 2):
            variance.append(float(arr[i]))
    cmvn_info = {'mean_stat:': means, 'var_stat': variance, 'frame_num': count}
    return cmvn_info


if __name__ == '__main__':
    with open(sys.argv[2], 'w') as fout:
        cmvn = kaldi2json(sys.argv[1])
        fout.write(json.dumps(cmvn))
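The script takes the text-format Kaldi CMVN file as its first positional argument and writes JSON to the second; an example call, with file names purely illustrative:

  python3 tools/cmvn_kaldi2json.py data/train/global_cmvn.kaldi data/train/global_cmvn.json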
examples/aishell/s0/tools/combine_data.sh  (0 → 100755)

#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#           2014  David Snyder

# This script combines the data from multiple source directories into
# a single destination directory.

# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
# about what these directories contain.

# Begin configuration section.
extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
skip_fix=false # skip the fix_data_dir.sh in the end
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi

if [ $# -lt 2 ]; then
  echo "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..."
  echo "Note, files that don't appear in all source dirs will not be combined,"
  echo "with the exception of utt2uniq and segments, which are created where necessary."
  exit 1
fi

dest=$1;
shift;

first_src=$1;

rm -r $dest 2>/dev/null
mkdir -p $dest;

export LC_ALL=C

for dir in $*; do
  if [ ! -f $dir/utt2spk ]; then
    echo "$0: no such file $dir/utt2spk"
    exit 1;
  fi
done

# Check that frame_shift are compatible, where present together with features.
dir_with_frame_shift=
for dir in $*; do
  if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then
    if [[ $dir_with_frame_shift ]] &&
          ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then
      echo "$0:error: different frame_shift in directories $dir and " \
           "$dir_with_frame_shift. Cannot combine features."
      exit 1;
    fi
    dir_with_frame_shift=$dir
  fi
done

# W.r.t. utt2uniq file the script has different behavior compared to other files
# it is not compulsary for it to exist in src directories, but if it exists in
# even one it should exist in all. We will create the files where necessary
has_utt2uniq=false
for in_dir in $*; do
  if [ -f $in_dir/utt2uniq ]; then
    has_utt2uniq=true
    break
  fi
done

if $has_utt2uniq; then
  # we are going to create an utt2uniq file in the destdir
  for in_dir in $*; do
    if [ ! -f $in_dir/utt2uniq ]; then
      # we assume that utt2uniq is a one to one mapping
      cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}'
    else
      cat $in_dir/utt2uniq
    fi
  done | sort -k1 > $dest/utt2uniq
  echo "$0: combined utt2uniq"
else
  echo "$0 [info]: not combining utt2uniq as it does not exist"
fi
# some of the old scripts might provide utt2uniq as an extrafile, so just remove it
extra_files=$(echo "$extra_files" |sed -e "s/utt2uniq//g")

# segments are treated similarly to utt2uniq. If it exists in some, but not all
# src directories, then we generate segments where necessary.
has_segments=false
for in_dir in $*; do
  if [ -f $in_dir/segments ]; then
    has_segments=true
    break
  fi
done

if $has_segments; then
  for in_dir in $*; do
    if [ ! -f $in_dir/segments ]; then
      echo "$0 [info]: will generate missing segments for $in_dir" 1>&2
      utils/data/get_segments_for_data.sh $in_dir
    else
      cat $in_dir/segments
    fi
  done | sort -k1 > $dest/segments
  echo "$0: combined segments"
else
  echo "$0 [info]: not combining segments as it does not exist"
fi

for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
  exists_somewhere=false
  absent_somewhere=false
  for d in $*; do
    if [ -f $d/$file ]; then
      exists_somewhere=true
    else
      absent_somewhere=true
    fi
  done

  if ! $absent_somewhere; then
    set -o pipefail
    ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1;
    set +o pipefail
    echo "$0: combined $file"
  else
    if ! $exists_somewhere; then
      echo "$0 [info]: not combining $file as it does not exist"
    else
      echo "$0 [info]: **not combining $file as it does not exist everywhere**"
    fi
  fi
done

tools/utt2spk_to_spk2utt.pl < $dest/utt2spk > $dest/spk2utt

if [[ $dir_with_frame_shift ]]; then
  cp $dir_with_frame_shift/frame_shift $dest
fi

if ! $skip_fix ; then
  tools/fix_data_dir.sh $dest || exit 1;
fi

exit 0
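Following the usage string above, a typical call combining two source directories (directory names are illustrative) would be:

  tools/combine_data.sh data/train_combined data/train_set1 data/train_set2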
examples/aishell/s0/tools/compute-cer.py  (0 → 100755)

    (diff collapsed on the original page; file content not shown)
examples/aishell/s0/tools/compute-wer.py  (0 → 100755)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re, sys, unicodedata
import codecs

remove_tag = True
spacelist = [' ', '\t', '\r', '\n']
puncts = ['!', ',', '?',
          '、', '。', '!', ',', ';', '?',
          ':', '「', '」', '︰', '『', '』', '《', '》']


def characterize(string):
    res = []
    i = 0
    while i < len(string):
        char = string[i]
        if char in puncts:
            i += 1
            continue
        cat1 = unicodedata.category(char)
        # https://unicodebook.readthedocs.io/unicode.html#unicode-categories
        if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist:  # space or not assigned
            i += 1
            continue
        if cat1 == 'Lo':  # letter-other
            res.append(char)
            i += 1
        else:
            # some input looks like: <unk><noise>, we want to separate it to two words.
            sep = ' '
            if char == '<':
                sep = '>'
            j = i + 1
            while j < len(string):
                c = string[j]
                if ord(c) >= 128 or (c in spacelist) or (c == sep):
                    break
                j += 1
            if j < len(string) and string[j] == '>':
                j += 1
            res.append(string[i:j])
            i = j
    return res


def stripoff_tags(x):
    if not x:
        return ''
    chars = []
    i = 0
    T = len(x)
    while i < T:
        if x[i] == '<':
            while i < T and x[i] != '>':
                i += 1
            i += 1
        else:
            chars.append(x[i])
            i += 1
    return ''.join(chars)


def normalize(sentence, ignore_words, cs, split=None):
    """ sentence, ignore_words are both in unicode
    """
    new_sentence = []
    for token in sentence:
        x = token
        if not cs:
            x = x.upper()
        if x in ignore_words:
            continue
        if remove_tag:
            x = stripoff_tags(x)
        if not x:
            continue
        if split and x in split:
            new_sentence += split[x]
        else:
            new_sentence.append(x)
    return new_sentence


class Calculator:
    def __init__(self):
        self.data = {}
        self.space = []
        self.cost = {}
        self.cost['cor'] = 0
        self.cost['sub'] = 1
        self.cost['del'] = 1
        self.cost['ins'] = 1

    def calculate(self, lab, rec):
        # Initialization
        lab.insert(0, '')
        rec.insert(0, '')
        while len(self.space) < len(lab):
            self.space.append([])
        for row in self.space:
            for element in row:
                element['dist'] = 0
                element['error'] = 'non'
            while len(row) < len(rec):
                row.append({'dist': 0, 'error': 'non'})
        for i in range(len(lab)):
            self.space[i][0]['dist'] = i
            self.space[i][0]['error'] = 'del'
        for j in range(len(rec)):
            self.space[0][j]['dist'] = j
            self.space[0][j]['error'] = 'ins'
        self.space[0][0]['error'] = 'non'
        for token in lab:
            if token not in self.data and len(token) > 0:
                self.data[token] = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        for token in rec:
            if token not in self.data and len(token) > 0:
                self.data[token] = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        # Computing edit distance
        for i, lab_token in enumerate(lab):
            for j, rec_token in enumerate(rec):
                if i == 0 or j == 0:
                    continue
                min_dist = sys.maxsize
                min_error = 'none'
                dist = self.space[i - 1][j]['dist'] + self.cost['del']
                error = 'del'
                if dist < min_dist:
                    min_dist = dist
                    min_error = error
                dist = self.space[i][j - 1]['dist'] + self.cost['ins']
                error = 'ins'
                if dist < min_dist:
                    min_dist = dist
                    min_error = error
                if lab_token == rec_token:
                    dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
                    error = 'cor'
                else:
                    dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
                    error = 'sub'
                if dist < min_dist:
                    min_dist = dist
                    min_error = error
                self.space[i][j]['dist'] = min_dist
                self.space[i][j]['error'] = min_error
        # Tracing back
        result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        i = len(lab) - 1
        j = len(rec) - 1
        while True:
            if self.space[i][j]['error'] == 'cor':  # correct
                if len(lab[i]) > 0:
                    self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
                    self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
                    result['all'] = result['all'] + 1
                    result['cor'] = result['cor'] + 1
                result['lab'].insert(0, lab[i])
                result['rec'].insert(0, rec[j])
                i = i - 1
                j = j - 1
            elif self.space[i][j]['error'] == 'sub':  # substitution
                if len(lab[i]) > 0:
                    self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
                    self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
                    result['all'] = result['all'] + 1
                    result['sub'] = result['sub'] + 1
                result['lab'].insert(0, lab[i])
                result['rec'].insert(0, rec[j])
                i = i - 1
                j = j - 1
            elif self.space[i][j]['error'] == 'del':  # deletion
                if len(lab[i]) > 0:
                    self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
                    self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
                    result['all'] = result['all'] + 1
                    result['del'] = result['del'] + 1
                result['lab'].insert(0, lab[i])
                result['rec'].insert(0, "")
                i = i - 1
            elif self.space[i][j]['error'] == 'ins':  # insertion
                if len(rec[j]) > 0:
                    self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
                    result['ins'] = result['ins'] + 1
                result['lab'].insert(0, "")
                result['rec'].insert(0, rec[j])
                j = j - 1
            elif self.space[i][j]['error'] == 'non':  # starting point
                break
            else:  # shouldn't reach here
                print('this should not happen , i = {i} , j = {j} , error = {error}'.format(
                    i=i, j=j, error=self.space[i][j]['error']))
        return result

    def overall(self):
        result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        for token in self.data:
            result['all'] = result['all'] + self.data[token]['all']
            result['cor'] = result['cor'] + self.data[token]['cor']
            result['sub'] = result['sub'] + self.data[token]['sub']
            result['ins'] = result['ins'] + self.data[token]['ins']
            result['del'] = result['del'] + self.data[token]['del']
        return result

    def cluster(self, data):
        result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        for token in data:
            if token in self.data:
                result['all'] = result['all'] + self.data[token]['all']
                result['cor'] = result['cor'] + self.data[token]['cor']
                result['sub'] = result['sub'] + self.data[token]['sub']
                result['ins'] = result['ins'] + self.data[token]['ins']
                result['del'] = result['del'] + self.data[token]['del']
        return result

    def keys(self):
        return list(self.data.keys())


def width(string):
    return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)


def default_cluster(word):
    unicode_names = [unicodedata.name(char) for char in word]
    for i in reversed(range(len(unicode_names))):
        if unicode_names[i].startswith('DIGIT'):  # 1
            unicode_names[i] = 'Number'  # 'DIGIT'
        elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
              unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
            # 明 / 郎
            unicode_names[i] = 'Mandarin'  # 'CJK IDEOGRAPH'
        elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
              unicode_names[i].startswith('LATIN SMALL LETTER')):
            # A / a
            unicode_names[i] = 'English'  # 'LATIN LETTER'
        elif unicode_names[i].startswith('HIRAGANA LETTER'):  # は こ め
            unicode_names[i] = 'Japanese'  # 'GANA LETTER'
        elif (unicode_names[i].startswith('AMPERSAND') or
              unicode_names[i].startswith('APOSTROPHE') or
              unicode_names[i].startswith('COMMERCIAL AT') or
              unicode_names[i].startswith('DEGREE CELSIUS') or
              unicode_names[i].startswith('EQUALS SIGN') or
              unicode_names[i].startswith('FULL STOP') or
              unicode_names[i].startswith('HYPHEN-MINUS') or
              unicode_names[i].startswith('LOW LINE') or
              unicode_names[i].startswith('NUMBER SIGN') or
              unicode_names[i].startswith('PLUS SIGN') or
              unicode_names[i].startswith('SEMICOLON')):
            # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
            del unicode_names[i]
        else:
            return 'Other'
    if len(unicode_names) == 0:
        return 'Other'
    if len(unicode_names) == 1:
        return unicode_names[0]
    for i in range(len(unicode_names) - 1):
        if unicode_names[i] != unicode_names[i + 1]:
            return 'Other'
    return unicode_names[0]


def usage():
    print("compute-wer.py : compute word error rate (WER) and align recognition results and references.")
    print("         usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer")


if __name__ == '__main__':
    if len(sys.argv) == 1:
        usage()
        sys.exit(0)
    calculator = Calculator()
    cluster_file = ''
    ignore_words = set()
    tochar = False
    verbose = 1
    padding_symbol = ' '
    case_sensitive = False
    max_words_per_line = sys.maxsize
    split = None
    while len(sys.argv) > 3:
        a = '--maxw='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):]
            del sys.argv[1]
            max_words_per_line = int(b)
            continue
        a = '--rt='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            remove_tag = (b == 'true') or (b != '0')
            continue
        a = '--cs='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            case_sensitive = (b == 'true') or (b != '0')
            continue
        a = '--cluster='
        if sys.argv[1].startswith(a):
            cluster_file = sys.argv[1][len(a):]
            del sys.argv[1]
            continue
        a = '--splitfile='
        if sys.argv[1].startswith(a):
            split_file = sys.argv[1][len(a):]
            del sys.argv[1]
            split = dict()
            with codecs.open(split_file, 'r', 'utf-8') as fh:
                for line in fh:  # line in unicode
                    words = line.strip().split()
                    if len(words) >= 2:
                        split[words[0]] = words[1:]
            continue
        a = '--ig='
        if sys.argv[1].startswith(a):
            ignore_file = sys.argv[1][len(a):]
            del sys.argv[1]
            with codecs.open(ignore_file, 'r', 'utf-8') as fh:
                for line in fh:  # line in unicode
                    line = line.strip()
                    if len(line) > 0:
                        ignore_words.add(line)
            continue
        a = '--char='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            tochar = (b == 'true') or (b != '0')
            continue
        a = '--v='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            verbose = 0
            try:
                verbose = int(b)
            except:
                if b == 'true' or b != '0':
                    verbose = 1
            continue
        a = '--padding-symbol='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            if b == 'space':
                padding_symbol = ' '
            elif b == 'underline':
                padding_symbol = '_'
            continue
        if True or sys.argv[1].startswith('-'):
            # ignore invalid switch
            del sys.argv[1]
            continue

    if not case_sensitive:
        ig = set([w.upper() for w in ignore_words])
        ignore_words = ig

    default_clusters = {}
    default_words = {}

    ref_file = sys.argv[1]
    hyp_file = sys.argv[2]
    rec_set = {}
    if split and not case_sensitive:
        newsplit = dict()
        for w in split:
            words = split[w]
            for i in range(len(words)):
                words[i] = words[i].upper()
            newsplit[w.upper()] = words
        split = newsplit

    with codecs.open(hyp_file, 'r', 'utf-8') as fh:
        for line in fh:
            if tochar:
                array = characterize(line)
            else:
                array = line.strip().split()
            if len(array) == 0:
                continue
            fid = array[0]
            rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)

    # compute error rate on the interaction of reference file and hyp file
    for line in open(ref_file, 'r', encoding='utf-8'):
        if tochar:
            array = characterize(line)
        else:
            array = line.rstrip('\n').split()
        if len(array) == 0:
            continue
        fid = array[0]
        if fid not in rec_set:
            continue
        lab = normalize(array[1:], ignore_words, case_sensitive, split)
        rec = rec_set[fid]
        if verbose:
            print('\nutt: %s' % fid)

        for word in rec + lab:
            if word not in default_words:
                default_cluster_name = default_cluster(word)
                if default_cluster_name not in default_clusters:
                    default_clusters[default_cluster_name] = {}
                if word not in default_clusters[default_cluster_name]:
                    default_clusters[default_cluster_name][word] = 1
                default_words[word] = default_cluster_name

        result = calculator.calculate(lab, rec)
        if verbose:
            if result['all'] != 0:
                wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
            else:
                wer = 0.0
            print('WER: %4.2f %%' % wer, end=' ')
            print('N=%d C=%d S=%d D=%d I=%d' %
                  (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
            space = {}
            space['lab'] = []
            space['rec'] = []
            for idx in range(len(result['lab'])):
                len_lab = width(result['lab'][idx])
                len_rec = width(result['rec'][idx])
                length = max(len_lab, len_rec)
                space['lab'].append(length - len_lab)
                space['rec'].append(length - len_rec)
            upper_lab = len(result['lab'])
            upper_rec = len(result['rec'])
            lab1, rec1 = 0, 0
            while lab1 < upper_lab or rec1 < upper_rec:
                if verbose > 1:
                    print('lab(%s):' % fid.encode('utf-8'), end=' ')
                else:
                    print('lab:', end=' ')
                lab2 = min(upper_lab, lab1 + max_words_per_line)
                for idx in range(lab1, lab2):
                    token = result['lab'][idx]
                    print('{token}'.format(token=token), end='')
                    for n in range(space['lab'][idx]):
                        print(padding_symbol, end='')
                    print(' ', end='')
                print()
                if verbose > 1:
                    print('rec(%s):' % fid.encode('utf-8'), end=' ')
                else:
                    print('rec:', end=' ')
                rec2 = min(upper_rec, rec1 + max_words_per_line)
                for idx in range(rec1, rec2):
                    token = result['rec'][idx]
                    print('{token}'.format(token=token), end='')
                    for n in range(space['rec'][idx]):
                        print(padding_symbol, end='')
                    print(' ', end='')
                print('\n', end='\n')
                lab1 = lab2
                rec1 = rec2

    if verbose:
        print('===========================================================================')
        print()

    result = calculator.overall()
    if result['all'] != 0:
        wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
    else:
        wer = 0.0
    print('Overall -> %4.2f %%' % wer, end=' ')
    print('N=%d C=%d S=%d D=%d I=%d' %
          (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
    if not verbose:
        print()

    if verbose:
        for cluster_id in default_clusters:
            result = calculator.cluster([k for k in default_clusters[cluster_id]])
            if result['all'] != 0:
                wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
            else:
                wer = 0.0
            print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
            print('N=%d C=%d S=%d D=%d I=%d' %
                  (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
        if len(cluster_file) > 0:  # compute separated WERs for word clusters
            cluster_id = ''
            cluster = []
            for line in open(cluster_file, 'r', encoding='utf-8'):
                for token in line.rstrip('\n').split():
                    # end of cluster reached, like </Keyword>
                    if token[0:2] == '</' and token[len(token) - 1] == '>' and \
                       token.lstrip('</').rstrip('>') == cluster_id:
                        result = calculator.cluster(cluster)
                        if result['all'] != 0:
                            wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
                        else:
                            wer = 0.0
                        print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
                        print('N=%d C=%d S=%d D=%d I=%d' %
                              (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
                        cluster_id = ''
                        cluster = []
                    # begin of cluster reached, like <Keyword>
                    elif token[0] == '<' and token[len(token) - 1] == '>' and \
                         cluster_id == '':
                        cluster_id = token.lstrip('<').rstrip('>')
                        cluster = []
                    # general terms, like WEATHER / CAR / ...
                    else:
                        cluster.append(token)
        print()
        print('===========================================================================')
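Matching the usage() text above and the call made by decode.sh later in this commit, a character-level scoring run looks like this (file names are illustrative):

  python3 tools/compute-wer.py --char=1 --v=1 data/test/text exp/test/text > exp/test/wer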
examples/aishell/s0/tools/compute_cmvn_stats.py  (0 → 100755)

#!/usr/bin/env python3
# encoding: utf-8

import sys
import argparse
import json
import codecs
import yaml

import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
from torch.utils.data import Dataset, DataLoader

torchaudio.set_audio_backend("sox_io")


class CollateFunc(object):
    ''' Collate function for AudioDataset
    '''
    def __init__(self, feat_dim, resample_rate):
        self.feat_dim = feat_dim
        self.resample_rate = resample_rate
        pass

    def __call__(self, batch):
        mean_stat = torch.zeros(self.feat_dim)
        var_stat = torch.zeros(self.feat_dim)
        number = 0
        for item in batch:
            value = item[1].strip().split(",")
            assert len(value) == 3 or len(value) == 1
            wav_path = value[0]
            sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate
            resample_rate = sample_rate
            # len(value) == 3 means segmented wav.scp,
            # len(value) == 1 means original wav.scp
            if len(value) == 3:
                start_frame = int(float(value[1]) * sample_rate)
                end_frame = int(float(value[2]) * sample_rate)
                waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                    filepath=wav_path,
                    num_frames=end_frame - start_frame,
                    frame_offset=start_frame)
            else:
                waveform, sample_rate = torchaudio.load(item[1])
            waveform = waveform * (1 << 15)
            if self.resample_rate != 0 and self.resample_rate != sample_rate:
                resample_rate = self.resample_rate
                waveform = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)

            mat = kaldi.fbank(waveform,
                              num_mel_bins=self.feat_dim,
                              dither=0.0,
                              energy_floor=0.0,
                              sample_frequency=resample_rate)
            mean_stat += torch.sum(mat, axis=0)
            var_stat += torch.sum(torch.square(mat), axis=0)
            number += mat.shape[0]
        return number, mean_stat, var_stat


class AudioDataset(Dataset):
    def __init__(self, data_file):
        self.items = []
        with codecs.open(data_file, 'r', encoding='utf-8') as f:
            for line in f:
                arr = line.strip().split()
                self.items.append((arr[0], arr[1]))

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='extract CMVN stats')
    parser.add_argument('--num_workers',
                        default=0,
                        type=int,
                        help='num of subprocess workers for processing')
    parser.add_argument('--train_config',
                        default='',
                        help='training yaml conf')
    parser.add_argument('--in_scp', default=None, help='wav scp file')
    parser.add_argument('--out_cmvn',
                        default='global_cmvn',
                        help='global cmvn file')

    doc = "Print log after every log_interval audios are processed."
    parser.add_argument("--log_interval", type=int, default=1000, help=doc)
    args = parser.parse_args()

    with open(args.train_config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)
    feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins']
    resample_rate = 0
    if 'resample_conf' in configs['dataset_conf']:
        resample_rate = configs['dataset_conf']['resample_conf']['resample_rate']
        print('using resample and new sample rate is {}'.format(resample_rate))

    collate_func = CollateFunc(feat_dim, resample_rate)
    dataset = AudioDataset(args.in_scp)
    batch_size = 20
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             sampler=None,
                             num_workers=args.num_workers,
                             collate_fn=collate_func)
    with torch.no_grad():
        all_number = 0
        all_mean_stat = torch.zeros(feat_dim)
        all_var_stat = torch.zeros(feat_dim)
        wav_number = 0
        for i, batch in enumerate(data_loader):
            number, mean_stat, var_stat = batch
            all_mean_stat += mean_stat
            all_var_stat += var_stat
            all_number += number
            wav_number += batch_size

            if wav_number % args.log_interval == 0:
                print(f'processed {wav_number} wavs, {all_number} frames',
                      file=sys.stderr,
                      flush=True)

    cmvn_info = {
        'mean_stat': list(all_mean_stat.tolist()),
        'var_stat': list(all_var_stat.tolist()),
        'frame_num': all_number
    }

    with open(args.out_cmvn, 'w') as fout:
        fout.write(json.dumps(cmvn_info))
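Using the options defined in the argument parser above, a typical invocation might be (the config and data paths are illustrative):

  python3 tools/compute_cmvn_stats.py --num_workers 16 \
    --train_config conf/train_conformer.yaml \
    --in_scp data/train/wav.scp --out_cmvn data/train/global_cmvn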
examples/aishell/s0/tools/compute_fbank_feats.py  (0 → 100644)

# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging

import torchaudio
import torchaudio.compliance.kaldi as kaldi

import wenet.dataset.kaldi_io as kaldi_io

# The "sox" backends are deprecated and will be removed in 0.9.0 release.
# So here we use sox_io backend
torchaudio.set_audio_backend("sox_io")


def parse_opts():
    parser = argparse.ArgumentParser(description='training your network')
    parser.add_argument('--num_mel_bins',
                        default=80,
                        type=int,
                        help='Number of triangular mel-frequency bins')
    parser.add_argument('--frame_length',
                        type=int,
                        default=25,
                        help='Frame length in milliseconds')
    parser.add_argument('--frame_shift',
                        type=int,
                        default=10,
                        help='Frame shift in milliseconds')
    parser.add_argument('--dither',
                        type=int,
                        default=0.0,
                        help='Dithering constant (0.0 means no dither)')
    parser.add_argument('--segments', default=None, help='segments file')
    parser.add_argument('wav_scp', help='wav scp file')
    parser.add_argument('out_ark', help='output ark file')
    parser.add_argument('out_scp', help='output scp file')
    args = parser.parse_args()
    return args


# wav format: <key> <wav_path>
def load_wav_scp(wav_scp_file):
    wav_list = []
    with open(wav_scp_file, 'r', encoding='utf8') as fin:
        for line in fin:
            arr = line.strip().split()
            assert len(arr) == 2
            wav_list.append((arr[0], arr[1]))
    return wav_list


# wav format: <key> <wav_path>
def load_wav_scp_dict(wav_scp_file):
    wav_dict = {}
    with open(wav_scp_file, 'r', encoding='utf8') as fin:
        for line in fin:
            arr = line.strip().split()
            assert len(arr) == 2
            wav_dict[arr[0]] = arr[1]
    return wav_dict


# Segments format: <key> <wav_key> <start> <end>
def load_wav_segments(wav_scp_file, segments_file):
    wav_dict = load_wav_scp_dict(wav_scp_file)
    audio_list = []
    with open(segments_file, 'r', encoding='utf8') as fin:
        for line in fin:
            arr = line.strip().split()
            assert len(arr) == 4
            key = arr[0]
            wav_file = wav_dict[arr[1]]
            start = float(arr[2])
            end = float(arr[3])
            audio_list.append((key, wav_file, start, end))
    return audio_list


if __name__ == '__main__':
    args = parse_opts()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    if args.segments is None:
        audio_list = load_wav_scp(args.wav_scp)
    else:
        audio_list = load_wav_segments(args.wav_scp, args.segments)

    count = 0
    with open(args.out_ark, 'wb') as ark_fout, \
            open(args.out_scp, 'w', encoding='utf8') as scp_fout:
        for item in audio_list:
            if len(item) == 2:
                key, wav_path = item
                waveform, sample_rate = torchaudio.load_wav(wav_path)
            else:
                assert len(item) == 4
                key, wav_path, start, end = item
                sample_rate = torchaudio.info(wav_path).sample_rate
                frame_offset = int(start * sample_rate)
                num_frames = int((end - start) * sample_rate)
                waveform, sample_rate = torchaudio.load_wav(wav_path, frame_offset, num_frames)

            mat = kaldi.fbank(waveform,
                              num_mel_bins=args.num_mel_bins,
                              frame_length=args.frame_length,
                              frame_shift=args.frame_shift,
                              dither=args.dither,
                              energy_floor=0.0,
                              sample_frequency=sample_rate)
            mat = mat.detach().numpy()
            kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout)
            count += 1
            if count % 10000 == 0:
                logging.info('Progress {}/{}'.format(count, len(audio_list)))
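With the positional arguments defined in parse_opts() above, an example run might look like this (data paths are illustrative):

  python3 tools/compute_fbank_feats.py --num_mel_bins 80 --frame_length 25 --frame_shift 10 \
    data/train/wav.scp data/train/feats.ark data/train/feats.scp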
examples/aishell/s0/tools/copy_data_dir.sh  (0 → 100644)

#!/bin/bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
#  feats.scp
#  wav.scp
#  vad.scp
#  spk2utt
#  utt2spk
#  text
#
# It copies to another directory, possibly adding a specified prefix or a suffix
# to the utterance and/or speaker names.  Note, the recording-ids stay the same.
#

# begin configuration section
spk_prefix=
utt_prefix=
spk_suffix=
utt_suffix=
validate_opts=   # should rarely be needed.
# end configuration section

. utils/parse_options.sh

if [ $# != 2 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <destdir>"
  echo "e.g.:"
  echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
  echo "Options"
  echo "   --spk-prefix=<prefix>     # Prefix for speaker ids, default empty"
  echo "   --utt-prefix=<prefix>     # Prefix for utterance ids, default empty"
  echo "   --spk-suffix=<suffix>     # Suffix for speaker ids, default empty"
  echo "   --utt-suffix=<suffix>     # Suffix for utterance ids, default empty"
  exit 1;
fi

export LC_ALL=C

srcdir=$1
destdir=$2

if [ ! -f $srcdir/utt2spk ]; then
  echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
  exit 1;
fi

if [ "$destdir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <destdir> to be different."
  exit 1
fi

set -e;

mkdir -p $destdir

cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map

if [ ! -f $srcdir/utt2uniq ]; then
  if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then
    cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
  fi
else
  cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
fi

cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
  utils/apply_map.pl -f 2 $destdir/spk_map > $destdir/utt2spk

utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt

if [ -f $srcdir/feats.scp ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map < $srcdir/feats.scp > $destdir/feats.scp
fi

if [ -f $srcdir/vad.scp ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map < $srcdir/vad.scp > $destdir/vad.scp
fi

if [ -f $srcdir/segments ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map < $srcdir/segments > $destdir/segments
  cp $srcdir/wav.scp $destdir
else
  # no segments->wav indexed by utt.
  if [ -f $srcdir/wav.scp ]; then
    utils/apply_map.pl -f 1 $destdir/utt_map < $srcdir/wav.scp > $destdir/wav.scp
  fi
fi

if [ -f $srcdir/reco2file_and_channel ]; then
  cp $srcdir/reco2file_and_channel $destdir/
fi

if [ -f $srcdir/text ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map < $srcdir/text > $destdir/text
fi
if [ -f $srcdir/utt2dur ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map < $srcdir/utt2dur > $destdir/utt2dur
fi
if [ -f $srcdir/utt2num_frames ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map < $srcdir/utt2num_frames > $destdir/utt2num_frames
fi
if [ -f $srcdir/reco2dur ]; then
  if [ -f $srcdir/segments ]; then
    cp $srcdir/reco2dur $destdir/reco2dur
  else
    utils/apply_map.pl -f 1 $destdir/utt_map < $srcdir/reco2dur > $destdir/reco2dur
  fi
fi
if [ -f $srcdir/spk2gender ]; then
  utils/apply_map.pl -f 1 $destdir/spk_map < $srcdir/spk2gender > $destdir/spk2gender
fi
if [ -f $srcdir/cmvn.scp ]; then
  utils/apply_map.pl -f 1 $destdir/spk_map < $srcdir/cmvn.scp > $destdir/cmvn.scp
fi
for f in frame_shift stm glm ctm; do
  if [ -f $srcdir/$f ]; then
    cp $srcdir/$f $destdir
  fi
done

rm $destdir/spk_map $destdir/utt_map

echo "$0: copied data from $srcdir to $destdir"

for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do
  if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then
    echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to"
    echo " ... $destdir/.backup/$f"
    mkdir -p $destdir/.backup
    mv $destdir/$f $destdir/.backup/
  fi
done

[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats"
[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text"

echo $validate_opts
echo $destdir
utils/validate_data_dir.sh $validate_opts $destdir
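As in the usage text above, for example:

  tools/copy_data_dir.sh --spk-prefix=1- --utt-prefix=1- data/train data/train_1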
examples/aishell/s0/tools/data/remove_dup_utts.sh  (0 → 100755)

#!/usr/bin/env bash

# Script taken from kaldi repo:
# https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/utils/data/remove_dup_utts.sh

# Remove excess utterances once they appear more than a specified
# number of times with the same transcription, in a data set.
# E.g. useful for removing excess "uh-huh" from training.

if [ $# != 3 ]; then
  echo "Usage: remove_dup_utts.sh max-count <src-data-dir> <dest-data-dir>"
  echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup"
  echo "This script is used to filter out utterances that have from over-represented"
  echo "transcriptions (such as 'uh-huh'), by limiting the number of repetitions of"
  echo "any given word-sequence to a specified value.  It's often used to get"
  echo "subsets for early stages of training."
  exit 1;
fi

maxcount=$1
srcdir=$2
destdir=$3
mkdir -p $destdir

[ ! -f $srcdir/text ] && echo "$0: Invalid input directory $srcdir" && exit 1;

! mkdir -p $destdir && echo "$0: could not create directory $destdir" && exit 1;

! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1;

cp $srcdir/* $destdir

cat $srcdir/text | \
  perl -e '
  $maxcount = shift @ARGV;
  @all = ();
  $p1 = 103349; $p2 = 71147; $k = 0;
  sub random { # our own random number generator: predictable.
    $k = ($k + $p1) % $p2;
    return ($k / $p2);
  }
  while(<>) {
    push @all, $_;
    @A = split(" ", $_);
    shift @A;
    $text = join(" ", @A);
    $count{$text} ++;
  }
  foreach $line (@all) {
    @A = split(" ", $line);
    shift @A;
    $text = join(" ", @A);
    $n = $count{$text};
    if ($n < $maxcount || random() < ($maxcount / $n)) {
      print $line;
    }
  }' $maxcount > $destdir/text

echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`"

# Not doing these steps as this script doesn't exist
# + the calling script already validates data
#echo "Using fix_data_dir.sh to reconcile the other files."
#utils/fix_data_dir.sh $destdir
#rm -r $destdir/.backup

exit 0
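As in the usage text above:

  tools/data/remove_dup_utts.sh 10 data/train data/train_nodup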
examples/aishell/s0/tools/data/split_scp.pl
0 → 100755
View file @
a7785cc6
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# See ../../COPYING for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each but.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can. If you use the utt2spk
# option it will make sure these chunks coincide with speaker boundaries. In
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]
use
warnings
;
$num_jobs
=
0
;
$job_id
=
0
;
$utt2spk_file
=
"";
$one_based
=
0
;
for
(
$x
=
1
;
$x
<=
3
&&
@ARGV
>
0
;
$x
++
)
{
if
(
$ARGV
[
0
]
eq
"
-j
")
{
shift
@ARGV
;
$num_jobs
=
shift
@ARGV
;
$job_id
=
shift
@ARGV
;
}
if
(
$ARGV
[
0
]
=~
/--utt2spk=(.+)/
)
{
$utt2spk_file
=
$
1
;
shift
;
}
if
(
$ARGV
[
0
]
eq
'
--one-based
')
{
$one_based
=
1
;
shift
@ARGV
;
}
}
if
(
$num_jobs
!=
0
&&
(
$num_jobs
<
0
||
$job_id
-
$one_based
<
0
||
$job_id
-
$one_based
>=
$num_jobs
))
{
die
"
$0: Invalid job number/index values for '-j
$num_jobs
$job_id
"
.
(
$one_based
?
"
--one-based
"
:
"")
.
"
'
\n
"
}
$one_based
and
$job_id
--
;
if
((
$num_jobs
==
0
&&
@ARGV
<
2
)
||
(
$num_jobs
>
0
&&
(
@ARGV
<
1
||
@ARGV
>
2
)))
{
die
"
Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
... where 0 <= job-id < num-jobs, or 1 <= job-id <- num-jobs if --one-based.
\n
";
}
$error
=
0
;
$inscp
=
shift
@ARGV
;
if
(
$num_jobs
==
0
)
{
# without -j option
@OUTPUTS
=
@ARGV
;
}
else
{
for
(
$j
=
0
;
$j
<
$num_jobs
;
$j
++
)
{
if
(
$j
==
$job_id
)
{
if
(
@ARGV
>
0
)
{
push
@OUTPUTS
,
$ARGV
[
0
];
}
else
{
push
@OUTPUTS
,
"
-
";
}
}
else
{
push
@OUTPUTS
,
"
/dev/null
";
}
}
}
if
(
$utt2spk_file
ne
"")
{
# We have the --utt2spk option...
open
(
$u_fh
,
'
<
',
$utt2spk_file
)
||
die
"
$0: Error opening utt2spk file
$utt2spk_file
: $!
\n
";
while
(
<
$u_fh
>
)
{
@A
=
split
;
@A
==
2
||
die
"
$0: Bad line
$_
in utt2spk file
$utt2spk_file
\n
";
(
$u
,
$s
)
=
@A
;
$utt2spk
{
$u
}
=
$s
;
}
close
$u_fh
;
open
(
$i_fh
,
'
<
',
$inscp
)
||
die
"
$0: Error opening input scp file
$inscp
: $!
\n
";
@spkrs
=
();
while
(
<
$i_fh
>
)
{
@A
=
split
;
if
(
@A
==
0
)
{
die
"
$0: Empty or space-only line in scp file
$inscp
\n
";
}
$u
=
$A
[
0
];
$s
=
$utt2spk
{
$u
};
defined
$s
||
die
"
$0: No utterance
$u
in utt2spk file
$utt2spk_file
\n
";
if
(
!
defined
$spk_count
{
$s
})
{
push
@spkrs
,
$s
;
$spk_count
{
$s
}
=
0
;
$spk_data
{
$s
}
=
[]
;
# ref to new empty array.
}
$spk_count
{
$s
}
++
;
push
@
{
$spk_data
{
$s
}},
$_
;
}
# Now split as equally as possible ..
# First allocate spks to files by allocating an approximately
  # equal number of speakers.
  $numspks = @spkrs;   # number of speakers.
  $numscps = @OUTPUTS; # number of output files.
  if ($numspks < $numscps) {
    die "$0: Refusing to split data because number of speakers $numspks " .
        "is less than the number of output .scp files $numscps\n";
  }
  for ($scpidx = 0; $scpidx < $numscps; $scpidx++) {
    $scparray[$scpidx] = []; # [] is array reference.
  }
  for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
    $scpidx = int(($spkidx * $numscps) / $numspks);
    $spk = $spkrs[$spkidx];
    push @{$scparray[$scpidx]}, $spk;
    $scpcount[$scpidx] += $spk_count{$spk};
  }

  # Now will try to reassign beginning + ending speakers
  # to different scp's and see if it gets more balanced.
  # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
  # We can show that if considering changing just 2 scp's, we minimize
  # this by minimizing the squared difference in sizes.  This is
  # equivalent to minimizing the absolute difference in sizes.  This
  # shows this method is bound to converge.
  $changed = 1;
  while ($changed) {
    $changed = 0;
    for ($scpidx = 0; $scpidx < $numscps; $scpidx++) {
      # First try to reassign ending spk of this scp.
      if ($scpidx < $numscps - 1) {
        $sz = @{$scparray[$scpidx]};
        if ($sz > 0) {
          $spk = $scparray[$scpidx]->[$sz - 1];
          $count = $spk_count{$spk};
          $nutt1 = $scpcount[$scpidx];
          $nutt2 = $scpcount[$scpidx + 1];
          if (abs(($nutt2 + $count) - ($nutt1 - $count))
              < abs($nutt2 - $nutt1)) {
            # Would decrease size-diff by reassigning spk...
            $scpcount[$scpidx + 1] += $count;
            $scpcount[$scpidx] -= $count;
            pop @{$scparray[$scpidx]};
            unshift @{$scparray[$scpidx + 1]}, $spk;
            $changed = 1;
          }
        }
      }
      if ($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
        $spk = $scparray[$scpidx]->[0];
        $count = $spk_count{$spk};
        $nutt1 = $scpcount[$scpidx - 1];
        $nutt2 = $scpcount[$scpidx];
        if (abs(($nutt2 - $count) - ($nutt1 + $count))
            < abs($nutt2 - $nutt1)) {
          # Would decrease size-diff by reassigning spk...
          $scpcount[$scpidx - 1] += $count;
          $scpcount[$scpidx] -= $count;
          shift @{$scparray[$scpidx]};
          push @{$scparray[$scpidx - 1]}, $spk;
          $changed = 1;
        }
      }
    }
  }

  # Now print out the files...
  for ($scpidx = 0; $scpidx < $numscps; $scpidx++) {
    $scpfile = $OUTPUTS[$scpidx];
    ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
                     : open($f_fh, '>&', \*STDOUT))
      || die "$0: Could not open scp file $scpfile for writing: $!\n";
    $count = 0;
    if (@{$scparray[$scpidx]} == 0) {
      print STDERR "$0: Error: split_scp.pl producing empty .scp file " .
                   "$scpfile (too many splits and too few speakers?)\n";
      $error = 1;
    } else {
      foreach $spk (@{$scparray[$scpidx]}) {
        print $f_fh @{$spk_data{$spk}};
        $count += $spk_count{$spk};
      }
      $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
    }
    close($f_fh);
  }
} else {
  # This block is the "normal" case where there is no --utt2spk
  # option and we just break into equal size chunks.

  open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";

  $numscps = @OUTPUTS;  # size of array.
  @F = ();
  while (<$i_fh>) {
    push @F, $_;
  }
  $numlines = @F;
  if ($numlines == 0) {
    print STDERR "$0: error: empty input scp file $inscp\n";
    $error = 1;
  }
  $linesperscp = int($numlines / $numscps); # the "whole part"..
  $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
  $remainder = $numlines - ($linesperscp * $numscps);
  ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
  # [just doing int() rounds down].
  $n = 0;
  for ($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
    $scpfile = $OUTPUTS[$scpidx];
    ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
                     : open($o_fh, '>&', \*STDOUT))
      || die "$0: Could not open scp file $scpfile for writing: $!\n";
    for ($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
      print $o_fh $F[$n++];
    }
    close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
  }
  $n == $numlines || die "$n != $numlines [code error]";
}

exit($error);
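A brief editor's aside on the comment in the rebalancing loop above (not part of the script): for two adjacent chunks whose sizes n1 and n2 have a fixed sum, n1^2 + n2^2 = ((n1 + n2)^2 + (n1 - n2)^2) / 2, so minimizing the pair's sum of squared sizes is the same as minimizing |n1 - n2|. Every accepted move strictly decreases this non-negative integer quantity, which is why the while loop is guaranteed to terminate.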
examples/aishell/s0/tools/decode.sh  0 → 100755
#!/usr/bin/env bash
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Author: binbinzhang@mobvoi.com (Binbin Zhang)

export GLOG_logtostderr=1
export GLOG_v=2

set -e

nj=1
chunk_size=-1
ctc_weight=0.0
reverse_weight=0.0
rescoring_weight=1.0
# For CTC WFST based decoding
fst_path=
dict_path=
acoustic_scale=1.0
beam=15.0
lattice_beam=12.0
min_active=200
max_active=7000
blank_skip_thresh=1.0
length_penalty=0.0

. tools/parse_options.sh || exit 1;

if [ $# != 5 ]; then
  echo "Usage: $0 [options] <wav.scp> <label_file> <model_file> <unit_file> <output_dir>"
  exit 1;
fi

if ! which decoder_main > /dev/null; then
  echo "decoder_main is not built, please go to runtime/libtorch to build it."
  exit 1;
fi

scp=$1
label_file=$2
model_file=$3
unit_file=$4
dir=$5

mkdir -p $dir/split${nj}

# Step 1. Split wav.scp
split_scps=""
for n in $(seq ${nj}); do
  split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp"
done
tools/data/split_scp.pl ${scp} ${split_scps}

# Step 2. Parallel decoding
wfst_decode_opts=
if [ ! -z $fst_path ]; then
  wfst_decode_opts="--fst_path $fst_path"
  wfst_decode_opts="$wfst_decode_opts --beam $beam"
  wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path"
  wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam"
  wfst_decode_opts="$wfst_decode_opts --max_active $max_active"
  wfst_decode_opts="$wfst_decode_opts --min_active $min_active"
  wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale"
  wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh"
  wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty"
  echo $wfst_decode_opts > $dir/config
fi
for n in $(seq ${nj}); do
{
  decoder_main \
    --rescoring_weight $rescoring_weight \
    --ctc_weight $ctc_weight \
    --reverse_weight $reverse_weight \
    --chunk_size $chunk_size \
    --wav_scp ${dir}/split${nj}/wav.${n}.scp \
    --model_path $model_file \
    --unit_path $unit_file \
    $wfst_decode_opts \
    --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log
} &
done
wait

# Step 3. Merge files
for n in $(seq ${nj}); do
  cat ${dir}/split${nj}/${n}.text
done > ${dir}/text
tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf

# Step 4. Compute WER
python3 tools/compute-wer.py --char=1 --v=1 \
  $label_file $dir/text > $dir/wer
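A minimal invocation sketch for the script above, assuming decoder_main has been built under runtime/libtorch; all paths here are illustrative placeholders following the usage message:

  tools/decode.sh --nj 8 --ctc_weight 0.5 --chunk_size -1 \
    data/test/wav.scp data/test/text exp/conformer/final.zip \
    data/dict/lang_char.txt exp/conformer/decode_test
  # merged hypotheses land in <output_dir>/text, the averaged RTF in
  # <output_dir>/rtf, and the WER report in <output_dir>/wer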
examples/aishell/s0/tools/feat_to_shape.sh  0 → 100755
#!/bin/bash

# Begin configuration section.
nj=4
cmd=run.pl
verbose=0
filetype=""
preprocess_conf=""
# End configuration section.

help_message=$(cat << EOF
Usage: $0 [options] <input-scp> <output-scp> [<log-dir>]
e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log
Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --filetype <mat|hdf5|sound.hdf5>                 # Specify the format of feats file
  --preprocess-conf <json>                         # Apply preprocess to feats when creating shape.scp
  --verbose <num>                                  # Default: 0
EOF
)

echo "$0 $*" 1>&2 # Print the command line for logging

. parse_options.sh || exit 1;

if [ $# -lt 2 ] || [ $# -gt 3 ]; then
    echo "${help_message}" 1>&2
    exit 1;
fi

set -euo pipefail

scp=$1
outscp=$2
data=$(dirname ${scp})
if [ $# -eq 3 ]; then
    logdir=$3
else
    logdir=${data}/log
fi
mkdir -p ${logdir}

split_scps=""
for n in $(seq ${nj}); do
    split_scps="${split_scps} ${logdir}/feats.${n}.scp"
done

utils/split_scp.pl ${scp} ${split_scps}

if [ -n "${preprocess_conf}" ]; then
    preprocess_opt="--preprocess-conf ${preprocess_conf}"
else
    preprocess_opt=""
fi
if [ -n "${filetype}" ]; then
    filetype_opt="--filetype ${filetype}"
else
    filetype_opt=""
fi

${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \
    feat-to-len --verbose=${verbose} \
    scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp

feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -)

# concatenate the .scp files together.
for n in $(seq ${nj}); do
    sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp
done > ${outscp}

rm -f ${logdir}/feats.*.scp 2>/dev/null
examples/aishell/s0/tools/filter_scp.pl  0 → 100755
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
#                     Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch

$exclude = 0;
$field = 1;
$shifted = 0;

do {
  $shifted = 0;
  if ($ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
    $shifted = 1;
  }
  if ($ARGV[0] eq "-f") {
    $field = $ARGV[1];
    shift @ARGV;
    shift @ARGV;
    $shifted = 1
  }
} while ($shifted);

if (@ARGV < 1 || @ARGV > 2) {
  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp\n" .
      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
      "Note: only the first field of each line in id_list matters.  With --exclude, prints\n" .
      "only the lines that were *not* in id_list.\n" .
      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
      "-f option, add 1 to the argument.\n" .
      "See also: utils/filter_scp.pl .\n";
}

$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while (<F>) {
  @A = split;
  @A >= 1 || die "Invalid id-list file line $_";
  $seen{$A[0]} = 1;
}

if ($field == 1) { # Treat this as special case, since it is common.
  while (<>) {
    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
    # $1 is what we filter on.
    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
      print $_;
    }
  }
} else {
  while (<>) {
    @A = split;
    @A > 0 || die "Invalid scp file line $_";
    @A >= $field || die "Invalid scp file line $_";
    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
      print $_;
    }
  }
}

# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
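In the data-directory scripts that follow, filter_scp.pl is mostly used to keep two utterance-indexed files in sync. A hedged example with illustrative paths:

  # keep only the transcripts whose utterance-id also appears in feats.scp
  tools/filter_scp.pl data/train/feats.scp data/train/text > data/train/text.filtered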
examples/aishell/s0/tools/fix_data_dir.sh  0 → 100755
#!/bin/bash

# This script makes sure that only the segments present in
# all of "feats.scp", "wav.scp" [if present], segments [if present]
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup

cmd="$@"

utt_extra_files=
spk_extra_files=

. tools/parse_options.sh

if [ $# != 1 ]; then
  echo "Usage: utils/data/fix_data_dir.sh <data-dir>"
  echo "e.g.: utils/data/fix_data_dir.sh data/train"
  echo "This script helps ensure that the various files in a data directory"
  echo "are correctly sorted and filtered, for example removing utterances"
  echo "that have no features (if feats.scp is present)"
  exit 1
fi

data=$1

if [ -f $data/images.scp ]; then
  image/fix_data_dir.sh $cmd
  exit $?
fi

mkdir -p $data/.backup

[ ! -d $data ] && echo "$0: no such directory $data" && exit 1;

[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1;

set -e -o pipefail -u

tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

export LC_ALL=C

function check_sorted {
  file=$1
  sort -k1,1 -u <$file >$file.tmp
  if ! cmp -s $file $file.tmp; then
    echo "$0: file $1 is not in sorted order or not unique, sorting it"
    mv $file.tmp $file
  else
    rm $file.tmp
  fi
}

for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
    reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do
  if [ -f $data/$x ]; then
    cp $data/$x $data/.backup/$x
    check_sorted $data/$x
  fi
done

function filter_file {
  filter=$1
  file_to_filter=$2
  cp $file_to_filter ${file_to_filter}.tmp
  tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter
  if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then
    length1=$(cat ${file_to_filter}.tmp | wc -l)
    length2=$(cat ${file_to_filter} | wc -l)
    if [ $length1 -ne $length2 ]; then
      echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter."
    fi
  fi
  rm $file_to_filter.tmp
}

function filter_recordings {
  # We call this once before the stage when we filter on utterance-id, and once
  # after.

  if [ -f $data/segments ]; then
    # We have a segments file -> we need to filter this and the file wav.scp, and
    # reco2file_and_utt, if it exists, to make sure they have the same list of
    # recording-ids.

    if [ ! -f $data/wav.scp ]; then
      echo "$0: $data/segments exists but not $data/wav.scp"
      exit 1;
    fi
    awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings
    n1=$(cat $tmpdir/recordings | wc -l)
    [ ! -s $tmpdir/recordings ] && \
      echo "Empty list of recordings (bad file $data/segments)?" && exit 1;
    tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp
    mv $tmpdir/recordings.tmp $tmpdir/recordings

    cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments
    filter_file $tmpdir/recordings $data/segments
    cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments
    rm $data/segments.tmp

    filter_file $tmpdir/recordings $data/wav.scp
    [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel
    [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur
    true
  fi
}

function filter_speakers {
  # throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
  tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt

  cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
  for s in cmvn.scp spk2gender; do
    f=$data/$s
    if [ -f $f ]; then
      filter_file $f $tmpdir/speakers
    fi
  done

  filter_file $tmpdir/speakers $data/spk2utt
  tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk

  for s in cmvn.scp spk2gender $spk_extra_files; do
    f=$data/$s
    if [ -f $f ]; then
      filter_file $tmpdir/speakers $f
    fi
  done
}

function filter_utts {
  cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
  echo "$(cat $tmpdir/utts | wc -l)"

  ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \
    echo "utt2spk is not in sorted order (fix this yourself)" && exit 1;

  ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \
    echo "utt2spk is not in sorted order when sorted first on speaker-id " && \
    echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;

  ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \
    echo "spk2utt is not in sorted order (fix this yourself)" && exit 1;

  if [ -f $data/utt2uniq ]; then
    ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \
      echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1;
  fi

  maybe_wav=
  maybe_reco2dur=
  [ ! -f $data/segments ] && maybe_wav=wav.scp  # wav indexed by utts only if segments does not exist.
  [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur  # reco2dur indexed by utts

  maybe_utt2dur=
  if [ -f $data/utt2dur ]; then
    cat $data/utt2dur | \
      awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1
    maybe_utt2dur=utt2dur.ok
  fi

  maybe_utt2num_frames=
  if [ -f $data/utt2num_frames ]; then
    cat $data/utt2num_frames | \
      awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1
    maybe_utt2num_frames=utt2num_frames.ok
  fi

  for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do
    if [ -f $data/$x ]; then
      tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
      echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)"
      mv $tmpdir/utts.tmp $tmpdir/utts
      # echo "$tmpdir/utts"
    fi
  done
  rm $data/utt2dur.ok 2>/dev/null || true
  rm $data/utt2num_frames.ok 2>/dev/null || true

  [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \
    rm $tmpdir/utts && exit 1;

  if [ -f $data/utt2spk ]; then
    new_nutts=$(cat $tmpdir/utts | wc -l)
    old_nutts=$(cat $data/utt2spk | wc -l)
    if [ $new_nutts -ne $old_nutts ]; then
      echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts"
    else
      echo "fix_data_dir.sh: kept all $old_nutts utterances."
    fi
  fi

  for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do
    if [ -f $data/$x ]; then
      cp $data/$x $data/.backup/$x
      if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then
        tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x
      fi
    fi
  done
}

filter_recordings
filter_speakers
filter_utts
filter_speakers
filter_recordings

tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt

echo "fix_data_dir.sh: old files are kept in $data/.backup"
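A small usage sketch (paths are illustrative). Since the originals are copied into .backup before filtering, the dropped utterance-ids can be recovered with a plain diff afterwards:

  tools/fix_data_dir.sh data/train
  diff <(awk '{print $1}' data/train/.backup/utt2spk) \
       <(awk '{print $1}' data/train/utt2spk)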
examples/aishell/s0/tools/flake8_hook.py  0 → 100755
#!/usr/bin/env python3
# encoding: utf-8

import sys

from flake8.main import git

if __name__ == '__main__':
    sys.exit(
        git.hook(
            strict=True,
            lazy=git.config_for('lazy'),
        )
    )
examples/aishell/s0/tools/format_data.sh  0 → 100755
#!/bin/bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#                Mobvoi Corporation (Author: Di Wu)
# Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

echo "$0 $*" >&2  # Print the command line for logging
. ./path.sh

nj=1
cmd=run.pl
nlsyms=""
lang=""
feat=""
feat_type="kaldi"
oov="<unk>"
bpecode=""
allow_one_column=false
raw=""
verbose=0
trans_type=char
filetype=""
preprocess_conf=""
category=""
out=""  # If omitted, write in stdout

help_message=$(cat << EOF
Usage: $0 <data-dir> <dict>
e.g. $0 data/train data/lang_1char/train_units.txt
Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --feat <feat-scp>                                # feat.scp or feat1.scp,feat2.scp,...
  --feat-type <feat-type>                          # kaldi or wav
  --oov <oov-word>                                 # Default: <unk>
  --out <outputfile>                               # If omitted, write in stdout
  --filetype <mat|hdf5|sound.hdf5>                 # Specify the format of feats file
  --preprocess-conf <json>                         # Apply preprocess to feats when creating shape.scp
  --verbose <num>                                  # Default: 0
EOF
)
. tools/parse_options.sh

if [ $# != 2 ]; then
    echo "${help_message}" 1>&2
    exit 1;
fi

set -euo pipefail

dir=$1
dic=$2
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
#trap 'rm -rf ${tmpdir}' EXIT

# 1. Create scp files for inputs
#    These are not necessary in decoding mode, so make them optional
input=
if [ -n "${feat}" ]; then
    _feat_scps=$(echo "${feat}" | tr ',' ' ')
    read -r -a feat_scps <<< $_feat_scps
    num_feats=${#feat_scps[@]}

    for (( i=1; i<=num_feats; i++ )); do
        feat=${feat_scps[$((i-1))]}
        mkdir -p ${tmpdir}/input_${i}
        input+="input_${i} "
        cat ${feat} > ${tmpdir}/input_${i}/feat.scp

        # Dump in the "legacy" style JSON format
        if [ -n "${filetype}" ]; then
            awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \
                > ${tmpdir}/input_${i}/filetype.scp
        fi

        if [ ${feat_type} == "kaldi" ]; then
            tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \
                --filetype "${filetype}" \
                --preprocess-conf "${preprocess_conf}" \
                --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp
        elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then
            if [ -f $dir/segments ]; then
                # used for segmented wav.scp
                awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur
            fi
            if [ ! -f $dir/utt2dur ]; then
                tools/wav_to_duration.sh --nj ${nj} \
                    ${feat} ${tmpdir}/input_${i}/shape.scp
            # use the existing utt2dur as shape.scp directly
            else
                cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp
            fi
        fi
    done
fi

# 2. Create scp files for outputs
mkdir -p ${tmpdir}/output
if [ -n "${bpecode}" ]; then
    if [ "${trans_type}" == "cn_char_en_bpe" ]; then
        tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} \
            > ${tmpdir}/output/token.scp
    else
        paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d " " ${dir}/text \
            | tools/spm_encode --model=${bpecode} --output_format=piece) \
            > ${tmpdir}/output/token.scp
    fi
elif [ -n "${nlsyms}" ]; then
    tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} \
        > ${tmpdir}/output/token.scp
elif [ -n "${raw}" ]; then
    cat $dir/text > ${tmpdir}/output/token.scp
else
    tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} \
        > ${tmpdir}/output/token.scp
fi
< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
odim=$(cat ${dic} | wc -l)
< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp

cat ${dir}/text > ${tmpdir}/output/text.scp

# 3. Create scp files for the others
mkdir -p ${tmpdir}/other
if [ -n "${lang}" ]; then
    awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp
fi

if [ -n "${category}" ]; then
    awk -v category=${category} '{print $1 " " category}' ${dir}/text \
        > ${tmpdir}/other/category.scp
fi
#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp

# 4. Merge scp files into one file
opts=""
for intype in ${input} output other; do
    if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then
        continue
    fi

    if [ ${intype} != other ]; then
        opts+="--${intype%_*}-scps "
    else
        opts+="--scps "
    fi

    for x in "${tmpdir}/${intype}"/*.scp; do
        k=$(basename ${x} .scp)
        if [ ${k} = shape ]; then
            opts+="shape:${x}:shape "
        else
            opts+="${k}:${x} "
        fi
    done
done

if ${allow_one_column}; then
    opts+="--allow-one-column true "
else
    opts+="--allow-one-column false "
fi

if [ -n "${out}" ]; then
    opts+="-O ${out} "
fi

tools/merge_scp2txt.py --verbose ${verbose} ${opts}

#rm -fr ${tmpdir}
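A hedged example of the raw-wav path through the script above; paths and dictionary are illustrative, and the output goes to stdout because --out is omitted:

  tools/format_data.sh --nj 16 --feat-type wav --feat data/train/wav.scp \
      data/train data/dict/lang_char.txt > data/train/format.data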
examples/aishell/s0/tools/fst/add_lex_disambig.pl  0 → 100755
#!/usr/bin/env perl
# Copyright 2010-2011  Microsoft Corporation
#           2013-2016  Johns Hopkins University (author: Daniel Povey)
#                2015  Hainan Xu
#                2015  Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

$pron_probs = 0;
$sil_probs = 0;
$first_allowed_disambig = 1;

for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if ($ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--first-allowed-disambig") {
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    shift @ARGV;
    shift @ARGV;
  }
}

if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
      "This script adds disambiguation symbols to a lexicon in order to\n" .
      "make decoding graphs determinizable; it adds pseudo-phone\n" .
      "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
      "to ensure that all pronunciations are different, and that none\n" .
      "is a prefix of another.\n" .
      "It prints to the standard output the number of the largest-numbered" .
      " disambiguation symbol that was used.\n" .
      "\n" .
      "Options:  --pron-probs   Expect pronunciation probabilities in the 2nd field\n" .
      "          --sil-probs    [should be with --pron-probs option]\n" .
      "                         Expect 3 extra fields after the pron-probs, for aspects of\n" .
      "                         the silence probability model\n" .
      "          --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
      "                         that this script is allowed to add.  By default this is\n" .
      "                         #1, but you can set this to a larger value using this option.\n" .
      "e.g.:\n" .
      " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}

$lexfn = shift @ARGV;
$lexoutfn = shift @ARGV;

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1) Read in the lexicon.
@L = ( );
while (<L>) {
  @A = split(" ", $_);
  push @L, join(" ", @A);
}

# (2) Work out the count of each phone-sequence in the
# lexicon.

foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  if ($pron_probs) {
    $p = shift @A;
    if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
  }
  if ($sil_probs) {
    $silp = shift @A;
    if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
    $correction = shift @A;
    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
    $correction = shift @A;
    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
  }
  if (!(@A)) {
    die "Bad lexicon line $1, no phone in phone list";
  }
  $count{join(" ", @A)}++;
}

# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).

foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  if ($pron_probs) {
    shift @A; # remove pron-prob.
  }
  if ($sil_probs) {
    shift @A; # Remove silprob
    shift @A; # Remove silprob
  }
  while (@A > 0) {
    pop @A; # Remove last phone
    $issubseq{join(" ", @A)} = 1;
  }
}

# (4) For each entry in the lexicon:
# if the phone sequence is unique and is not a
# prefix of another word, no disambig symbol.
# Else output #1, or #2, #3, ... if the same phone-seq
# has already been assigned a disambig symbol.

open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;

foreach $l (@L) {
  @A = split(" ", $l);
  $word = shift @A;

  if ($pron_probs) {
    $pron_prob = shift @A;
  }
  if ($sil_probs) {
    $sil_word_prob = shift @A;
    $word_sil_correction = shift @A;
    $prev_nonsil_correction = shift @A
  }

  $phnseq = join(" ", @A);
  if (!defined $issubseq{$phnseq} && $count{$phnseq} == 1) {
    ; # Do nothing.
  } else {
    if ($phnseq eq "") {
      # need disambig symbols for the empty string
      # that are not used anywhere else.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};

      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++; # Get a number that has not been used yet for
                         # this phone sequence.
      }
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }
  if ($pron_probs) {
    if ($sil_probs) {
      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
    } else {
      print O "$word\t$pron_prob\t$phnseq\n";
    }
  } else {
    print O "$word\t$phnseq\n";
  }
}

print $max_disambig . "\n";
examples/aishell/s0/tools/fst/compile_lexicon_token_fst.sh  0 → 100755
#!/bin/bash
# Copyright 2015 Yajie Miao (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
# phoneme and character-based lexicons.

set -eo pipefail
. tools/parse_options.sh

if [ $# -ne 3 ]; then
  echo "usage: tools/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
  echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo "lexicon.txt units.txt"
  echo "options: "
  exit 1;
fi

srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir

[ -f path.sh ] && . ./path.sh

export LC_ALL=C

cp $srcdir/units.txt $dir

# Add probabilities to lexicon entries. There is in fact no point in doing this here since all the entries have 1.0.
# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;

# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
ndisambig=$[$ndisambig+1];

( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list

# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>,
# the actual model unit, and the disambiguation symbols.
cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
(echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt

# ctc_token_fst_corrected is too big and too slow for character-based Chinese modeling,
# so we use ctc_token_fst_compact here
tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \
  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
  fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

# Encode the words with indices. Will be used in lexicon and language model FST compiling.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`

tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
  --keep_isymbols=false --keep_osymbols=false | \
  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;

echo "Lexicon and token FSTs compiling succeeded"
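A sketch of how the script above is typically driven, following its own usage message (directory names are illustrative):

  tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang
  # expected outputs under data/lang: tokens.txt, words.txt, T.fst and L.fst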