ModelZoo / Conformer_pytorch · Commits

Commit a7785cc6
Authored Mar 26, 2024 by Sugon_ldc
Commit message: delete soft link
Parent: 9a2a05ca
Changes: 162

Showing 20 changed files with 2300 additions and 0 deletions (+2300 -0)
examples/aishell/s0/tools/perturb_data_dir_speed.sh   +116  -0
examples/aishell/s0/tools/reduce_data_dir.sh          +59   -0
examples/aishell/s0/tools/remove_longshortdata.py     +61   -0
examples/aishell/s0/tools/segment.py                  +35   -0
examples/aishell/s0/tools/setup_anaconda.sh           +68   -0
examples/aishell/s0/tools/sph2wav.sh                  +60   -0
examples/aishell/s0/tools/spk2utt_to_utt2spk.pl       +27   -0
examples/aishell/s0/tools/spm_decode                  +49   -0
examples/aishell/s0/tools/spm_encode                  +99   -0
examples/aishell/s0/tools/spm_train                   +13   -0
examples/aishell/s0/tools/subset_data_dir.sh          +192  -0
examples/aishell/s0/tools/subset_scp.pl               +105  -0
examples/aishell/s0/tools/sym2int.pl                  +104  -0
examples/aishell/s0/tools/text2token.py               +171  -0
examples/aishell/s0/tools/utt2spk_to_spk2utt.pl       +38   -0
examples/aishell/s0/tools/validate_data_dir.sh        +383  -0
examples/aishell/s0/tools/validate_dict_dir.pl        +531  -0
examples/aishell/s0/tools/validate_text.pl            +136  -0
examples/aishell/s0/tools/wav2dur.py                  +26   -0
examples/aishell/s0/tools/wav_to_duration.sh          +27   -0
examples/aishell/s0/tools/perturb_data_dir_speed.sh (new file, mode 100755)

#!/bin/bash

# 2020 @kamo-naoyuki
# This file was copied from Kaldi and
# I deleted parts related to wav duration
# because we shouldn't use kaldi's command here
# and we don't need the files actually.

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
#           2014  Tom Ko
#           2018  Emotech LTD (author: Pawel Swietojanski)
# Apache 2.0

# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
#  wav.scp
#  spk2utt
#  utt2spk
#  text
#
# It generates the files which are used for perturbing the speed of the original data.

export LC_ALL=C
set -euo pipefail

if [[ $# != 3 ]]; then
  echo "Usage: perturb_data_dir_speed.sh <warping-factor> <srcdir> <destdir>"
  echo "e.g.:"
  echo " $0 0.9 data/train_si284 data/train_si284p"
  exit 1
fi

factor=$1
srcdir=$2
destdir=$3
label="sp"
spk_prefix="${label}${factor}-"
utt_prefix="${label}${factor}-"

#check is sox on the path
! command -v sox &>/dev/null && echo "sox: command not found" && exit 1;

if [[ ! -f ${srcdir}/utt2spk ]]; then
  echo "$0: no such file ${srcdir}/utt2spk"
  exit 1;
fi

if [[ ${destdir} == "${srcdir}" ]]; then
  echo "$0: this script requires <srcdir> and <destdir> to be different."
  exit 1
fi

mkdir -p "${destdir}"

<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map"
<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map"
<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map"

if [[ ! -f ${srcdir}/utt2uniq ]]; then
  <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq"
else
  <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq"
fi

<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \
  utils/apply_map.pl -f 2 "${destdir}"/spk_map > "${destdir}"/utt2spk
utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk > "${destdir}"/spk2utt

if [[ -f ${srcdir}/segments ]]; then
  utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \
    utils/apply_map.pl -f 2 "${destdir}"/reco_map | \
    awk -v factor="${factor}" \
      '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \
      > "${destdir}"/segments

  utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' |
    # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename"
    awk -v factor="${factor}" \
      '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
        else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" }
        else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \
      > "${destdir}"/wav.scp

  if [[ -f ${srcdir}/reco2file_and_channel ]]; then
    utils/apply_map.pl -f 1 "${destdir}"/reco_map \
      <"${srcdir}"/reco2file_and_channel > "${destdir}"/reco2file_and_channel
  fi
else
  # no segments->wav indexed by utterance.
  if [[ -f ${srcdir}/wav.scp ]]; then
    utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' |
      # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename"
      awk -v factor="${factor}" \
        '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
          else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" }
          else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \
        > "${destdir}"/wav.scp
  fi
fi

if [[ -f ${srcdir}/text ]]; then
  utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text > "${destdir}"/text
fi
if [[ -f ${srcdir}/spk2gender ]]; then
  utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender > "${destdir}"/spk2gender
fi
if [[ -f ${srcdir}/utt2lang ]]; then
  utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang > "${destdir}"/utt2lang
fi

rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null
echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}"

utils/validate_data_dir.sh --no-feats --no-text "${destdir}"
examples/aishell/s0/tools/reduce_data_dir.sh (new file, mode 100755)

#!/bin/bash

# koried, 10/29/2012

# Reduce a data set based on a list of turn-ids

help_message="usage: $0 srcdir turnlist destdir"

if [ $1 == "--help" ]; then
  echo "${help_message}"
  exit 0;
fi

if [ $# != 3 ]; then
  echo "${help_message}"
  exit 1;
fi

srcdir=$1
reclist=$2
destdir=$3

if [ ! -f ${srcdir}/utt2spk ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1;
fi

function do_filtering {
  # assumes the utt2spk and spk2utt files already exist.
  [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp
  [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp
  [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text
  [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames
  [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender
  [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp
  if [ -f ${srcdir}/segments ]; then
    utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments
    awk '{print $2;}' ${destdir}/segments | sort | uniq >${destdir}/reco  # recordings.
    # The next line would override the command above for wav.scp, which would be incorrect.
    [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp
    [ -f ${srcdir}/reco2file_and_channel ] && \
      utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel
    # Filter the STM file for proper sclite scoring (this will also remove the comments lines)
    [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/stm >${destdir}/stm
    rm ${destdir}/reco
  fi
  srcutts=$(wc -l <${srcdir}/utt2spk)
  destutts=$(wc -l <${destdir}/utt2spk)
  echo "Reduced #utt from $srcutts to $destutts"
}

mkdir -p ${destdir}

# filter the utt2spk based on the set of recordings
utils/filter_scp.pl ${reclist} <${srcdir}/utt2spk >${destdir}/utt2spk

utils/utt2spk_to_spk2utt.pl <${destdir}/utt2spk >${destdir}/spk2utt

do_filtering;
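A usage sketch (file names are hypothetical): the second argument is a text file with one utterance/turn id per line; only those entries are kept in the destination directory.

# keep only the utterances listed in exp/keep_list.txt
tools/reduce_data_dir.sh data/train exp/keep_list.txt data/train_reduced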
examples/aishell/s0/tools/remove_longshortdata.py (new file, mode 100755)

#!/usr/bin/env python3
# encoding: utf-8

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='remove too long or too short data in format.data')
    parser.add_argument('--data_file', type=str, help='input format data')
    parser.add_argument('--output_data_file', type=str, help='output format data')
    parser.add_argument('--min_input_len', type=float, default=0,
                        help='minimum input seq length, in seconds for raw wav, \
                              in frame numbers for feature data')
    parser.add_argument('--max_input_len', type=float, default=20,
                        help='maximum output seq length, in seconds for raw wav, \
                              in frame numbers for feature data')
    parser.add_argument('--min_output_len', type=float, default=0,
                        help='minimum input seq length, in modeling units')
    parser.add_argument('--max_output_len', type=float, default=500,
                        help='maximum output seq length, in modeling units')
    parser.add_argument('--min_output_input_ratio', type=float, default=0.05,
                        help='minimum output seq length/output seq length ratio')
    parser.add_argument('--max_output_input_ratio', type=float, default=10,
                        help='maximum output seq length/output seq length ratio')
    args = parser.parse_args()

    data_file = args.data_file
    output_data_file = args.output_data_file
    min_input_len = args.min_input_len
    max_input_len = args.max_input_len
    min_output_len = args.min_output_len
    max_output_len = args.max_output_len
    min_output_input_ratio = args.min_output_input_ratio
    max_output_input_ratio = args.max_output_input_ratio

    with open(data_file, 'r') as f, open(output_data_file, 'w') as fout:
        for l in f:
            l = l.strip()
            if l:
                items = l.strip().split('\t')
                token_shape = items[6]
                feature_shape = items[2]
                feat_len = float(feature_shape.split(':')[1].split(',')[0])
                token_len = float(token_shape.split(':')[1].split(',')[0])
                condition = [feat_len > min_input_len,
                             feat_len < max_input_len,
                             token_len > min_output_len,
                             token_len < max_output_len,
                             token_len / feat_len > min_output_input_ratio,
                             token_len / feat_len < max_output_input_ratio,
                             ]
                if all(condition):
                    fout.write('{}\n'.format(l))
                    continue
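A usage sketch (paths are hypothetical): format.data is the tab-separated list whose third and seventh fields carry the feature and token shapes that the filter reads.

python3 tools/remove_longshortdata.py \
  --data_file raw_wav/train/format.data \
  --output_data_file raw_wav/train/format.data.filtered \
  --max_input_len 20 --max_output_len 500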
examples/aishell/s0/tools/segment.py (new file, mode 100755)

#!/usr/bin/env python3
# Copyright (c) 2021 Mobvoi Inc. (Di Wu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='generate segmented wav.scp')
    parser.add_argument('--segments', required=True, help='segments file')
    parser.add_argument('--input', required=True,
                        help='origin wav.scp that not segmented')
    parser.add_argument('--output', required=True,
                        help='output segmented wav.scp')
    wav_dic = {}
    args = parser.parse_args()
    ori_wav = args.input
    segment_file = args.segments
    wav_scp = args.output
    with open(ori_wav, 'r') as ori:
        for l in ori:
            item = l.strip().split()
            wav_dic[item[0]] = item[1]
    with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement:
        for l in sgement:
            item = l.strip().split()
            if item[1] in wav_dic:
                item[1] = wav_dic[item[1]]
                f.write("{} {},{},{}\n".format(item[0], item[1], item[2],
                                               item[3]))
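A usage sketch (paths are hypothetical): combine an unsegmented wav.scp with a Kaldi-style segments file into the comma-separated segmented scp that sph2wav.sh later consumes.

python3 tools/segment.py --segments data/train/segments \
  --input data/train/wav_ori.scp --output data/train/wav_segments.scp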
examples/aishell/s0/tools/setup_anaconda.sh (new file, mode 100755)

#!/usr/bin/env bash

# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet)

set -euo pipefail

if [ -z "${PS1:-}" ]; then
    PS1=__dummy__
fi

CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh

if [ $# -gt 4 ]; then
    echo "Usage: $0 [output] [conda-env-name] [python-version>]"
    exit 1;
elif [ $# -eq 3 ]; then
    output_dir="$1"
    name="$2"
    PYTHON_VERSION="$3"
elif [ $# -eq 2 ]; then
    output_dir="$1"
    name="$2"
    PYTHON_VERSION=""
elif [ $# -eq 1 ]; then
    output_dir="$1"
    name=""
    PYTHON_VERSION=""
elif [ $# -eq 0 ]; then
    output_dir=venv
    name=""
    PYTHON_VERSION=""
fi

if [ -e activate_python.sh ]; then
    echo "Warning: activate_python.sh already exists. It will be overwritten"
fi

if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then
    if [ ! -e miniconda.sh ]; then
        wget --tries=3 "${CONDA_URL}" -O miniconda.sh
    fi
    bash miniconda.sh -b -p "${output_dir}"
fi

# shellcheck disable=SC1090
source "${output_dir}/etc/profile.d/conda.sh"
conda deactivate

# If the env already exists, skip recreation
if [ -n "${name}" ] && ! conda activate ${name}; then
    conda create -yn "${name}"
fi
conda activate ${name}

if [ -n "${PYTHON_VERSION}" ]; then
    conda install -y conda "python=${PYTHON_VERSION}"
else
    conda install -y conda
fi
conda install -y pip setuptools

cat << EOF > activate_python.sh
#!/usr/bin/env bash
# THIS FILE IS GENERATED BY tools/setup_anaconda.sh
if [ -z "\${PS1:-}" ]; then
    PS1=__dummy__
fi
. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name}
EOF
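A usage sketch (argument values are illustrative): install Miniconda under venv, create an environment (the env name and Python version below are assumptions, not part of the commit), then source the generated activation file from the same directory.

bash tools/setup_anaconda.sh venv wenet 3.8
. ./activate_python.sh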
examples/aishell/s0/tools/sph2wav.sh (new file, mode 100755)

#!/bin/bash
# convert sph scp to segmented wav scp

nj=1
. tools/parse_options.sh || exit 1;

inscp=$1
segments=$2
outscp=$3
data=$(dirname ${inscp})

if [ $# -eq 4 ]; then
  logdir=$4
else
  logdir=${data}/log
fi
mkdir -p ${logdir}

sph2pipe_version="v2.5"
if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then
  echo "Download sph2pipe_${sph2pipe_version} ......"
  wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \
    wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \
  tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools
  cd tools/sph2pipe_${sph2pipe_version}/ && \
    gcc -o sph2pipe *.c -lm
  cd -
fi
sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe
[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
sox=`which sox`
[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1;

cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2);
  printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \
  sort > $data/wav_ori.scp || exit 1;

tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp
sed -i 's/ /,/g' $data/wav_segments.scp
sed -i 's/#/ /g' $data/wav_segments.scp

rm -f $logdir/wav_*.slice
rm -f $logdir/*.log
split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_

for slice in `ls $logdir/wav_*.slice`; do
{
  name=`basename -s .slice $slice`
  mkdir -p ${data}/wavs/${name}
  cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \
    -v logdir=$logdir -v name=$name '{
      during=$4-$3
      cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during;
      system(cmd)
      printf("%s %s/%s.wav\n", $1, data, $1);
    }' | \
    sort > ${data}/wavs_${name}.scp || exit 1;
} &
done
wait
cat ${data}/wavs_*.scp > $outscp
rm ${data}/wavs_*.scp
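A usage sketch (paths are hypothetical): convert an SPH-based scp plus a segments file into per-segment wav files and a merged scp; --nj is parsed by tools/parse_options.sh and controls how many slices are converted in parallel.

tools/sph2wav.sh --nj 8 data/train/sph.scp data/train/segments data/train/wav.scp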
examples/aishell/s0/tools/spk2utt_to_utt2spk.pl (new file, mode 100755)

#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

while(<>){
  @A = split(" ", $_);
  @A > 1 || die "Invalid line in spk2utt file: $_";
  $s = shift @A;
  foreach $u ( @A ) {
    print "$u $s\n";
  }
}
examples/aishell/s0/tools/spm_decode (new file, mode 100755)

#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# https://github.com/pytorch/fairseq/blob/master/LICENSE

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import sys

import sentencepiece as spm


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True,
                        help="sentencepiece model to use for decoding")
    parser.add_argument("--input", default=None, help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.input_format == "piece":
        def decode(l):
            return "".join(sp.DecodePieces(l))
    elif args.input_format == "id":
        def decode(l):
            return "".join(sp.DecodeIds(l))
    else:
        raise NotImplementedError

    def tok2int(tok):
        # remap reference-side <unk> (represented as <<unk>>) to 0
        return int(tok) if tok != "<<unk>>" else 0

    if args.input is None:
        h = sys.stdin
    else:
        h = open(args.input, "r", encoding="utf-8")
    for line in h:
        print(decode(line.split()))


if __name__ == "__main__":
    main()
examples/aishell/s0/tools/spm_encode (new file, mode 100755)

#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in
# https://github.com/pytorch/fairseq/blob/master/LICENSE

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import contextlib
import sys

import sentencepiece as spm


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True,
                        help="sentencepiece model to use for encoding")
    parser.add_argument("--inputs", nargs="+", default=['-'],
                        help="input files to filter/encode")
    parser.add_argument("--outputs", nargs="+", default=['-'],
                        help="path to save encoded outputs")
    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
    parser.add_argument("--min-len", type=int, metavar="N",
                        help="filter sentence pairs with fewer than N tokens")
    parser.add_argument("--max-len", type=int, metavar="N",
                        help="filter sentence pairs with more than N tokens")
    args = parser.parse_args()

    assert len(args.inputs) == len(args.outputs), \
        "number of input and output paths should match"

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":
        def encode(l):
            return sp.EncodeAsPieces(l)
    elif args.output_format == "id":
        def encode(l):
            return list(map(str, sp.EncodeAsIds(l)))
    else:
        raise NotImplementedError

    if args.min_len is not None or args.max_len is not None:
        def valid(line):
            return (
                (args.min_len is None or len(line) >= args.min_len)
                and (args.max_len is None or len(line) <= args.max_len)
            )
    else:
        def valid(lines):
            return True

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(open(input, "r", encoding="utf-8"))
            if input != "-" else sys.stdin
            for input in args.inputs
        ]
        outputs = [
            stack.enter_context(open(output, "w", encoding="utf-8"))
            if output != "-" else sys.stdout
            for output in args.outputs
        ]

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)


if __name__ == "__main__":
    main()
examples/aishell/s0/tools/spm_train (new file, mode 100755)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# https://github.com/pytorch/fairseq/blob/master/LICENSE

import sys

import sentencepiece as spm

if __name__ == "__main__":
    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
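The three spm_* wrappers chain together; a sketch with illustrative file names (spm_train simply forwards its arguments to SentencePiece, so the flags below are standard SentencePiece options, not something defined in this commit):

tools/spm_train --input=data/train/text.en --model_prefix=data/lang_char/bpe \
  --vocab_size=5000 --model_type=unigram
tools/spm_encode --model data/lang_char/bpe.model --output_format piece \
  < data/train/text.en > data/train/text.bpe
tools/spm_decode --model data/lang_char/bpe.model --input_format piece \
  < data/train/text.bpe > data/train/text.restored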
examples/aishell/s0/tools/subset_data_dir.sh (new file, mode 100755)

#!/usr/bin/env bash
# Copyright 2010-2011  Microsoft Corporation
#           2012-2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script operates on a data directory, such as in data/train/.
# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data
# for what these directories contain.

# This script creates a subset of that data, consisting of some specified
# number of utterances.  (The selected utterances are distributed evenly
# throughout the file, by the program ./subset_scp.pl).

# There are six options, none compatible with any other.

# If you give the --per-spk option, it will attempt to select the supplied
# number of utterances for each speaker (typically you would supply a much
# smaller number in this case).

# If you give the --speakers option, it selects a subset of n randomly
# selected speakers.

# If you give the --shortest option, it will give you the n shortest utterances.

# If you give the --first option, it will just give you the n first utterances.

# If you give the --last option, it will just give you the n last utterances.

# If you give the --spk-list or --utt-list option, it reads the
# speakers/utterances to keep from <speaker-list-file>/<utt-list-file>" (note,
# in this case there is no <num-utt> positional parameter; see usage message.)

shortest=false
perspk=false
speakers=false
first_opt=
spk_list=
utt_list=
expect_args=3

case $1 in
  --first|--last) first_opt=$1; shift ;;
  --per-spk) perspk=true; shift ;;
  --shortest) shortest=true; shift ;;
  --speakers) speakers=true; shift ;;
  --spk-list) shift; spk_list=$1; shift; expect_args=2 ;;
  --utt-list) shift; utt_list=$1; shift; expect_args=2 ;;
  --*) echo "$0: invalid option '$1'"; exit 1
esac

if [ $# != $expect_args ]; then
  echo "Usage:"
  echo "  subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
  echo "  subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
  echo "  subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>"
  echo "By default, randomly selects <num-utt> utterances from the data directory."
  echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
  echo "With --per-spk, selects <num-utt> utterances per speaker, if available."
  echo "With --first, selects the first <num-utt> utterances"
  echo "With --last, selects the last <num-utt> utterances"
  echo "With --shortest, selects the shortest <num-utt> utterances."
  echo "With --spk-list, reads the speakers to keep from <speaker-list-file>"
  echo "With --utt-list, reads the utterances to keep from <utt-list-file>"
  exit 1;
fi

srcdir=$1
if [[ $spk_list || $utt_list ]]; then
  numutt=
  destdir=$2
else
  numutt=$2
  destdir=$3
fi

export LC_ALL=C

if [ ! -f $srcdir/utt2spk ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1
fi

if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
  echo "$0: cannot subset to more utterances than you originally had."
  exit 1
fi

if $shortest && [ ! -f $srcdir/feats.scp ]; then
  echo "$0: you selected --shortest but no feats.scp exist."
  exit 1
fi

mkdir -p $destdir || exit 1

if [[ $spk_list ]]; then
  tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
  tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
elif [[ $utt_list ]]; then
  tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
  tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
elif $speakers; then
  tools/shuffle_list.pl < $srcdir/spk2utt |
    awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' |
    sort > $destdir/spk2utt
  tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
elif $perspk; then
  awk '{ n='$numutt'; printf("%s ",$1);
         skip=1; while(n*(skip+1) <= NF-1) { skip++; }
         for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); }
         printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
  tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
else
  if $shortest; then
    # Select $numutt shortest utterances.
    . ./path.sh
    feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
    sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt > $destdir/tmp.uttlist
    tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk > $destdir/utt2spk
    rm $destdir/tmp.uttlist $destdir/tmp.len
  else
    # Select $numutt random utterances.
    tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
  fi
  tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
fi

# Perform filtering. utt2spk and spk2utt files already exist by this point.
# Filter by utterance.
[ -f $srcdir/feats.scp ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
[ -f $srcdir/vad.scp ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp
[ -f $srcdir/utt2lang ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang
[ -f $srcdir/utt2dur ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur
[ -f $srcdir/utt2num_frames ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames
[ -f $srcdir/utt2uniq ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq
[ -f $srcdir/wav.scp ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
[ -f $srcdir/utt2warp ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp
[ -f $srcdir/text ] && tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text

# Filter by speaker.
[ -f $srcdir/spk2warp ] && tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp
[ -f $srcdir/spk2gender ] && tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
[ -f $srcdir/cmvn.scp ] && tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp

# Filter by recording-id.
if [ -f $srcdir/segments ]; then
  tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
  # Recording-ids are in segments.
  awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco
  # The next line overrides the command above for wav.scp, which would be incorrect.
  #[ -f $srcdir/wav.scp ] &&
  #  tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
else
  # No segments; recording-ids are in wav.scp.
  awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco
fi
[ -f $srcdir/reco2file_and_channel ] &&
  tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
[ -f $srcdir/reco2dur ] &&
  tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
# Filter the STM file for proper sclite scoring.
# Copy over the comments from STM file.
[ -f $srcdir/stm ] &&
  (grep "^;;" $srcdir/stm; tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm
rm $destdir/reco

# Copy frame_shift if present.
[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir

srcutts=$(wc -l <$srcdir/utt2spk)
destutts=$(wc -l <$destdir/utt2spk)
echo "$0: reducing #utt from $srcutts to $destutts"
exit 0
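Usage sketches (directory and list names are illustrative):

# 1000 utterances spread evenly through data/train
tools/subset_data_dir.sh data/train 1000 data/train_1k
# the 500 shortest utterances (requires feats.scp)
tools/subset_data_dir.sh --shortest data/train 500 data/train_short500
# keep only the speakers listed in conf/dev_spk.list
tools/subset_data_dir.sh --spk-list conf/dev_spk.list data/train data/dev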
examples/aishell/s0/tools/subset_scp.pl (new file, mode 100755)

#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This program selects a subset of N elements in the scp.
# By default, it selects them evenly from throughout the scp, in order to avoid
# selecting too many from the same speaker.  It prints them on the standard
# output.
# With the option --first, it just selects the N first utterances.
# With the option --last, it just selects the N last utterances.

# Last modified by JHU & HKUST @2013

$quiet = 0;
$first = 0;
$last = 0;

if (@ARGV > 0 && $ARGV[0] eq "--quiet") {
  shift;
  $quiet = 1;
}
if (@ARGV > 0 && $ARGV[0] eq "--first") {
  shift;
  $first = 1;
}
if (@ARGV > 0 && $ARGV[0] eq "--last") {
  shift;
  $last = 1;
}

if(@ARGV < 2 ) {
  die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
      " --quiet  causes it to not die if N < num lines in scp.\n" .
      " --first and --last make it equivalent to head or tail.\n" .
      "See also: filter_scp.pl\n";
}

$N = shift @ARGV;
if($N == 0) {
  die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
}
$inscp = shift @ARGV;
open(I, "<$inscp") || die "Opening input scp file $inscp";

@F = ();
while(<I>) {
  push @F, $_;
}
$numlines = @F;
if($N > $numlines) {
  if ($quiet) {
    $N = $numlines;
  } else {
    die "You requested from subset_scp.pl more elements than available: $N > $numlines";
  }
}

sub select_n {
  my ($start, $end, $num_needed) = @_;
  my $diff = $end - $start;
  if ($num_needed > $diff) {
    die "select_n: code error";
  }
  if ($diff == 1 ) {
    if ($num_needed > 0) {
      print $F[$start];
    }
  } else {
    my $halfdiff = int($diff/2);
    my $halfneeded = int($num_needed/2);
    select_n($start, $start+$halfdiff, $halfneeded);
    select_n($start+$halfdiff, $end, $num_needed - $halfneeded);
  }
}

if ( ! $first && ! $last) {
  if ($N > 0) {
    select_n(0, $numlines, $N);
  }
} else {
  if ($first) { # --first option: same as head.
    for ($n = 0; $n < $N; $n++) {
      print $F[$n];
    }
  } else { # --last option: same as tail.
    for ($n = @F - $N; $n < @F; $n++) {
      print $F[$n];
    }
  }
}
examples/aishell/s0/tools/sym2int.pl (new file, mode 100755)

#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

$ignore_oov = 0;

for($x = 0; $x < 2; $x++) {
  if ($ARGV[0] eq "--map-oov") {
    shift @ARGV;
    $map_oov = shift @ARGV;
    if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
      # disallow '-f', the empty string and anything ending in words.txt as the
      # OOV symbol because these are likely command-line errors.
      die "the --map-oov option requires an argument";
    }
  }
  if ($ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1;
      $field_end = $field_spec - 1;
    }
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;    # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;      # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ]  [-f <field-range> ]\n" .
    "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
}

open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while(<F>) {
  @A = split(" ", $_);
  @A == 2 || die "bad line in symbol table file: $_";
  $sym2int{$A[0]} = $A[1] + 0;
}

if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
  if (!defined $sym2int{$map_oov}) {
    die "OOV symbol $map_oov not defined.";
  }
  $map_oov = $sym2int{$map_oov};
}

$num_warning = 0;
$max_warning = 20;

while (<>) {
  @A = split(" ", $_);
  @B = ();
  for ($n = 0; $n < @A; $n++) {
    $a = $A[$n];
    if ( (!defined $field_begin || $n >= $field_begin)
         && (!defined $field_end || $n <= $field_end)) {
      $i = $sym2int{$a};
      if (!defined ($i)) {
        if (defined $map_oov) {
          if ($num_warning++ < $max_warning) {
            print STDERR "sym2int.pl: replacing $a with $map_oov\n";
            if ($num_warning == $max_warning) {
              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
            }
          }
          $i = $map_oov;
        } else {
          $pos = $n + 1;
          die "sym2int.pl: undefined symbol $a (in position $pos)\n";
        }
      }
      $a = $i;
    }
    push @B, $a;
  }
  print join(" ", @B);
  print "\n";
}

if ($num_warning > 0) {
  print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
}

exit(0);
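A usage sketch (file names are hypothetical): map the words in fields 2 onward of a Kaldi-style text file to integers using a words.txt symbol table, sending out-of-vocabulary words to <unk>.

tools/sym2int.pl --map-oov '<unk>' -f 2- data/lang/words.txt < data/train/text > data/train/text.int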
examples/aishell/s0/tools/text2token.py (new file, mode 100755)

#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

from __future__ import print_function
from __future__ import unicode_literals

import argparse
import codecs
import re
import sys

is_python2 = sys.version_info[0] == 2


def exist_or_not(i, match_pos):
    start_pos = None
    end_pos = None
    for pos in match_pos:
        if pos[0] <= i < pos[1]:
            start_pos = pos[0]
            end_pos = pos[1]
            break
    return start_pos, end_pos


def seg_char(sent):
    pattern = re.compile(r'([\u4e00-\u9fa5])')
    chars = pattern.split(sent)
    chars = [w for w in chars if len(w.strip()) > 0]
    return chars


def get_parser():
    parser = argparse.ArgumentParser(
        description='convert raw text to tokenized text',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--nchar', '-n', default=1, type=int,
                        help='number of characters to split, i.e., \
                        aabb -> a a b b with -n 1 and aa bb with -n 2')
    parser.add_argument('--skip-ncols', '-s', default=0, type=int,
                        help='skip first n columns')
    parser.add_argument('--space', default='<space>', type=str,
                        help='space symbol')
    parser.add_argument('--bpe-model', '-m', default=None, type=str,
                        help='bpe model for english part')
    parser.add_argument('--non-lang-syms', '-l', default=None, type=str,
                        help='list of non-linguistic symobles,'
                        ' e.g., <NOISE> etc.')
    parser.add_argument('text', type=str, default=False, nargs='?',
                        help='input text')
    parser.add_argument('--trans_type', '-t', type=str, default="char",
                        choices=["char", "phn", "cn_char_en_bpe"],
                        help="""Transcript type. char/phn. e.g., for TIMIT
                             FADG0_SI1279 -
                             If trans_type is char, read from
                             SI1279.WRD file -> "bricks are an alternative"
                             Else if trans_type is phn,
                             read from SI1279.PHN file ->
                             "sil b r ih sil k s aa r er n aa l
                             sil t er n ih sil t ih v sil" """)
    return parser


def main():
    parser = get_parser()
    args = parser.parse_args()

    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f:
            nls = [x.rstrip() for x in f.readlines()]
            rs = [re.compile(re.escape(x)) for x in nls]
    if args.bpe_model is not None:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.load(args.bpe_model)

    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(
            sys.stdin if is_python2 else sys.stdin.buffer)

    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer)
    line = f.readline()
    n = args.nchar
    while line:
        x = line.split()
        print(' '.join(x[:args.skip_ncols]), end=" ")
        a = ' '.join(x[args.skip_ncols:])

        # get all matched positions
        match_pos = []
        for r in rs:
            i = 0
            while i >= 0:
                m = r.search(a, i)
                if m:
                    match_pos.append([m.start(), m.end()])
                    i = m.end()
                else:
                    break
        if len(match_pos) > 0:
            chars = []
            i = 0
            while i < len(a):
                start_pos, end_pos = exist_or_not(i, match_pos)
                if start_pos is not None:
                    chars.append(a[start_pos:end_pos])
                    i = end_pos
                else:
                    chars.append(a[i])
                    i += 1
            a = chars

        if (args.trans_type == "phn"):
            a = a.split(" ")
        elif args.trans_type == "cn_char_en_bpe":
            b = seg_char(a)
            a = []
            for j in b:
                # we use "▁" to instead of blanks among english words
                # warning: here is "▁", not "_"
                for l in j.strip().split("▁"):
                    if not l.encode('UTF-8').isalpha():
                        a.append(l)
                    else:
                        for k in sp.encode_as_pieces(l):
                            a.append(k)
        else:
            a = [a[j:j + n] for j in range(0, len(a), n)]

        a_flat = []
        for z in a:
            a_flat.append("".join(z))

        a_chars = [z.replace(' ', args.space) for z in a_flat]
        if (args.trans_type == "phn"):
            a_chars = [z.replace("sil", args.space) for z in a_chars]
        print(' '.join(a_chars))
        line = f.readline()


if __name__ == '__main__':
    main()
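A usage sketch (paths are hypothetical): character-tokenize a Kaldi-style text file, skipping the utterance-id column and protecting non-linguistic symbols listed in a side file.

python3 tools/text2token.py -s 1 -n 1 -l data/lang_char/nlsyms.txt \
  data/train/text > data/train/text.tkn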
examples/aishell/s0/tools/utt2spk_to_spk2utt.pl (new file, mode 100755)

#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# converts an utt2spk file to a spk2utt file.
# Takes input from the stdin or from a file argument;
# output goes to the standard out.

if ( @ARGV > 1 ) {
  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}

while(<>){
  @A = split(" ", $_);
  @A == 2 || die "Invalid line in utt2spk file: $_";
  ($u,$s) = @A;
  if(!$seen_spk{$s}) {
    $seen_spk{$s} = 1;
    push @spklist, $s;
  }
  push (@{$spk_hash{$s}}, "$u");
}
foreach $s (@spklist) {
  $l = join(' ',@{$spk_hash{$s}});
  print "$s $l\n";
}
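Usage sketch: the two Perl helpers are inverses of each other (file names below are illustrative).

tools/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt
tools/spk2utt_to_utt2spk.pl data/train/spk2utt > data/train/utt2spk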
examples/aishell/s0/tools/validate_data_dir.sh (new file, mode 100755)

#!/bin/bash

cmd="$@"

no_feats=false
no_wav=false
no_text=false
no_spk_sort=false

for x in `seq 4`; do
  if [ "$1" == "--no-feats" ]; then
    no_feats=true
    shift;
  fi
  if [ "$1" == "--no-text" ]; then
    no_text=true
    shift;
  fi
  if [ "$1" == "--no-wav" ]; then
    no_wav=true
    shift;
  fi
  if [ "$1" == "--no-spk-sort" ]; then
    no_spk_sort=true
    shift;
  fi
done

if [ $# -ne 1 ]; then
  echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] <data-dir>"
  echo "The --no-xxx options mean that the script does not require "
  echo "xxx.scp to be present, but it will check it if it is present."
  echo "--no-spk-sort means that the script does not require the utt2spk to be "
  echo "sorted by the speaker-id in addition to being sorted by utterance-id."
  echo "By default, utt2spk is expected to be sorted by both, which can be "
  echo "achieved by making the speaker-id prefixes of the utterance-ids"
  echo "e.g.: $0 data/train"
  exit 1;
fi

data=$1

if [ ! -d $data ]; then
  echo "$0: no such directory $data"
  exit 1;
fi

if [ -f $data/images.scp ]; then
  cmd=${cmd/--no-wav/}  # remove --no-wav if supplied
  image/validate_data_dir.sh $cmd
  exit $?
fi

for f in spk2utt utt2spk; do
  if [ ! -f $data/$f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
  if [ ! -s $data/$f ]; then
    echo "$0: empty file $f"
    exit 1;
  fi
done

! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
  echo "$0: $data/utt2spk has wrong format." && exit;

ns=$(wc -l <$data/spk2utt)
if [ "$ns" == 1 ]; then
  echo "$0: WARNING: you have only one speaker.  This probably a bad idea."
  echo "   Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
  echo "   for more information."
fi

tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

export LC_ALL=C

function check_sorted_and_uniq {
  ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
  ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \
    echo "$0: file $1 is not in sorted order or has duplicates" && exit 1;
}

function partial_diff {
  diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6)
  n1=`cat $1 | wc -l`
  n2=`cat $2 | wc -l`
  echo "[Lengths are $1=$n1 versus $2=$n2]"
}

check_sorted_and_uniq $data/utt2spk

if ! $no_spk_sort; then
  ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \
    echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \
    echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
fi

check_sorted_and_uniq $data/spk2utt

! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \
  <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \
  echo "$0: spk2utt and utt2spk do not seem to match" && exit 1;

cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts

if [ ! -f $data/text ] && ! $no_text; then
  echo "$0: no such file $data/text (if this is by design, specify --no-text)"
  exit 1;
fi

num_utts=`cat $tmpdir/utts | wc -l`
if [ -f $data/text ]; then
  tools/validate_text.pl $data/text || exit 1;
  check_sorted_and_uniq $data/text
  text_len=`cat $data/text | wc -l`
  illegal_sym_list="<s> </s> #0"
  for x in $illegal_sym_list; do
    if grep -w "$x" $data/text > /dev/null; then
      echo "$0: Error: in $data, text contains illegal symbol $x"
      exit 1;
    fi
  done
  awk '{print $1}' < $data/text > $tmpdir/utts.txt
  if ! cmp -s $tmpdir/utts{,.txt}; then
    echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts{,.txt}
    exit 1;
  fi
fi

if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then
  echo "$0: in directory $data, segments file exists but no wav.scp"
  exit 1;
fi

if [ ! -f $data/wav.scp ] && ! $no_wav; then
  echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
  exit 1;
fi

if [ -f $data/wav.scp ]; then
  check_sorted_and_uniq $data/wav.scp

  if grep -E -q '^\S+\s+~' $data/wav.scp; then
    # note: it's not a good idea to have any kind of tilde in wav.scp, even if
    # part of a command, as it would cause compatibility problems if run by
    # other users, but this used to be not checked for so we let it slide unless
    # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which
    # would definitely cause problems as the fopen system call does not do
    # tilde expansion.
    echo "$0: Please do not use tilde (~) in your wav.scp."
    exit 1;
  fi

  if [ -f $data/segments ]; then

    check_sorted_and_uniq $data/segments
    # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
    ! cat $data/segments | \
      awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \
      echo "$0: badly formatted segments file" && exit 1;

    segments_len=`cat $data/segments | wc -l`
    if [ -f $data/text ]; then
      ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \
        echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \
        echo "$0: Lengths are $segments_len vs $num_utts" && \
        exit 1
    fi

    cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings
    awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav
    if ! cmp -s $tmpdir/recordings{,.wav}; then
      echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/recordings{,.wav}
      exit 1;
    fi
    if [ -f $data/reco2file_and_channel ]; then
      # this file is needed only for ctm scoring; it's indexed by recording-id.
      check_sorted_and_uniq $data/reco2file_and_channel
      ! cat $data/reco2file_and_channel | \
        awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
                if ( NF == 3 && $3 == "1" ) {
                  warning_issued = 1;
                } else {
                  print "Bad line ", $0; exit 1;
                }
              }
            }
            END {
              if (warning_issued == 1) {
                print "The channel should be marked as A or B, not 1! You should change it ASAP! "
              }
            }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
      cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc
      if ! cmp -s $tmpdir/recordings{,.r2fc}; then
        echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel"
        echo "$0: differ, partial diff is:"
        partial_diff $tmpdir/recordings{,.r2fc}
        exit 1;
      fi
    fi
  else
    # No segments file -> assume wav.scp indexed by utterance.
    cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav
    if ! cmp -s $tmpdir/utts{,.wav}; then
      echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/utts{,.wav}
      exit 1;
    fi

    if [ -f $data/reco2file_and_channel ]; then
      # this file is needed only for ctm scoring; it's indexed by recording-id.
      check_sorted_and_uniq $data/reco2file_and_channel
      ! cat $data/reco2file_and_channel | \
        awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
                if ( NF == 3 && $3 == "1" ) {
                  warning_issued = 1;
                } else {
                  print "Bad line ", $0; exit 1;
                }
              }
            }
            END {
              if (warning_issued == 1) {
                print "The channel should be marked as A or B, not 1! You should change it ASAP! "
              }
            }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
      cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc
      if ! cmp -s $tmpdir/utts{,.r2fc}; then
        echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel"
        echo "$0: differ, partial diff is:"
        partial_diff $tmpdir/utts{,.r2fc}
        exit 1;
      fi
    fi
  fi
fi

if [ ! -f $data/feats.scp ] && ! $no_feats; then
  echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
  exit 1;
fi

if [ -f $data/feats.scp ]; then
  check_sorted_and_uniq $data/feats.scp
  cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats
  if ! cmp -s $tmpdir/utts{,.feats}; then
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts{,.feats}
    exit 1;
  fi
fi

if [ -f $data/cmvn.scp ]; then
  check_sorted_and_uniq $data/cmvn.scp
  cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn
  cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
  if ! cmp -s $tmpdir/speakers{,.cmvn}; then
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers{,.cmvn}
    exit 1;
  fi
fi

if [ -f $data/spk2gender ]; then
  check_sorted_and_uniq $data/spk2gender
  ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \
    echo "$0: Mal-formed spk2gender file" && exit 1;
  cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender
  cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
  if ! cmp -s $tmpdir/speakers{,.spk2gender}; then
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers{,.spk2gender}
    exit 1;
  fi
fi

if [ -f $data/spk2warp ]; then
  check_sorted_and_uniq $data/spk2warp
  ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
    echo "$0: Mal-formed spk2warp file" && exit 1;
  cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp
  cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
  if ! cmp -s $tmpdir/speakers{,.spk2warp}; then
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers{,.spk2warp}
    exit 1;
  fi
fi

if [ -f $data/utt2warp ]; then
  check_sorted_and_uniq $data/utt2warp
  ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
    echo "$0: Mal-formed utt2warp file" && exit 1;
  cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp
  cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
  if ! cmp -s $tmpdir/utts{,.utt2warp}; then
    echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts{,.utt2warp}
    exit 1;
  fi
fi

# check some optionally-required things
for f in vad.scp utt2lang utt2uniq; do
  if [ -f $data/$f ]; then
    check_sorted_and_uniq $data/$f
    if ! cmp -s <(awk '{print $1}' $data/utt2spk) \
       <(awk '{print $1}' $data/$f); then
      echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
      exit 1;
    fi
  fi
done

if [ -f $data/utt2dur ]; then
  check_sorted_and_uniq $data/utt2dur
  cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur
  if ! cmp -s $tmpdir/utts{,.utt2dur}; then
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts{,.utt2dur}
    exit 1;
  fi
  cat $data/utt2dur | \
    awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1
fi

if [ -f $data/utt2num_frames ]; then
  check_sorted_and_uniq $data/utt2num_frames
  cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames
  if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts{,.utt2num_frames}
    exit 1
  fi
  awk <$data/utt2num_frames '{
    if (NF != 2 || !($2 > 0) || $2 != int($2)) {
      print "Bad line utt2num_frames:" NR ":" $0
      exit 1 } }' || exit 1
fi

if [ -f $data/reco2dur ]; then
  check_sorted_and_uniq $data/reco2dur
  cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur
  if [ -f $tmpdir/recordings ]; then
    if ! cmp -s $tmpdir/recordings{,.reco2dur}; then
      echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/recordings{,.reco2dur}
      exit 1;
    fi
  else
    if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then
      echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/{utts,recordings.reco2dur}
      exit 1;
    fi
  fi
  cat $data/reco2dur | \
    awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1
fi

echo "$0: Successfully validated data-directory $data"
examples/aishell/s0/tools/validate_dict_dir.pl
0 → 100755
View file @
a7785cc6
#!/usr/bin/env perl
# Apache 2.0.
# Copyright 2012 Guoguo Chen
# 2015 Daniel Povey
# 2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
#
# Validation script for 'dict' directories (e.g. data/local/dict)
# this function reads the opened file (supplied as a first
# parameter) into an array of lines. For each
# line, it tests whether it's a valid utf-8 compatible
# line. If all lines are valid utf-8, it returns the lines
# decoded as utf-8, otherwise it assumes the file's encoding
# is one of those 1-byte encodings, such as ISO-8859-x
# or Windows CP-X.
# Please recall we do not really care about
# the actually encoding, we just need to
# make sure the length of the (decoded) string
# is correct (to make the output formatting looking right).
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;
  my $raw_text;
  my $lineno = 0;
  my $file = shift;

  while (<$file>) {
    $raw_text = $_;
    last unless $raw_text;
    if ($is_utf_compatible) {
      my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) };
      $is_utf_compatible = $is_utf_compatible && defined($decoded_text);
      push @unicode_lines, $decoded_text;
    } else {
      #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n";
      ;
    }
    push @raw_lines, $raw_text;
    $lineno += 1;
  }

  if (!$is_utf_compatible) {
    return (0, @raw_lines);
  } else {
    return (1, @unicode_lines);
  }
}

# check if the given unicode string contain unicode whitespaces
# other than the usual four: TAB, LF, CR and SPACE
sub validate_utf8_whitespaces {
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) {
    my $current_line = $unicode_lines->[$i];
    if ((substr $current_line, -1) ne "\n") {
      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
      return 1;
    }
    my @A = split(" ", $current_line);
    my $utt_id = $A[0];
    # we replace TAB, LF, CR, and SPACE
    # this is to simplify the test
    if ($current_line =~ /\x{000d}/) {
      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
      return 1;
    }
    $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
    if ($current_line =~ /\s/) {
      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
      return 1;
    }
  }
  return 0;
}

# checks if the text in the file (supplied as the argument) is utf-8 compatible
# if yes, checks if it contains only allowed whitespaces. If no, then does not
# do anything. The function seeks to the original position in the file after
# reading the text.
sub check_allowed_whitespace {
  my $file = shift;
  my $pos = tell($file);
  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);
  seek($file, $pos, SEEK_SET);
  if ($is_utf) {
    my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
    print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
    if ($has_invalid_whitespaces) {
      print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
      return 0;
    } else {
      print "--> text contains only allowed whitespaces\n";
    }
  } else {
    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
  }
  return 1;
}

if (@ARGV != 1) {
  die "Usage: validate_dict_dir.pl <dict-dir>\n" .
      "e.g.: validate_dict_dir.pl data/local/dict\n";
}

$dict = shift @ARGV;
$dict =~ s:/$::;

$exit = 0;
$success = 1;  # this is re-set each time we read a file.

sub set_to_fail { $exit = 1; $success = 0; }
# Checking silence_phones.txt -------------------------------
print "Checking $dict/silence_phones.txt ...\n";
if (-z "$dict/silence_phones.txt") {
  print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;
}
if (!open(S, "<$dict/silence_phones.txt")) {
  print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;
}
$idx = 1;
%silence = ();
$crlf = 1;
print "--> reading $dict/silence_phones.txt\n";
check_allowed_whitespace(\*S) || set_to_fail();
while (<S>) {
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
  }
  foreach (0 .. @col-1) {
    my $p = $col[$_];
    if ($silence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
    } else {
      $silence{$p} = 1;
    }
    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
    # problems with word-position-dependent systems, and <eps> is obviously
    # confusable with epsilon.
    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>") {
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx ++;
}
close(S);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
print "\n";
# Checking optional_silence.txt -------------------------------
print "Checking $dict/optional_silence.txt ...\n";
if (-z "$dict/optional_silence.txt") {
  print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;
}
if (!open(OS, "<$dict/optional_silence.txt")) {
  print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;
}
$idx = 1;
$success = 1;
$crlf = 1;
print "--> reading $dict/optional_silence.txt\n";
check_allowed_whitespace(\*OS) or exit 1;
while (<OS>) {
  chomp;
  my @col = split(" ", $_);
  if ($idx > 1 or @col > 1) {
    set_to_fail();
    print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
  } elsif (!$silence{$col[0]}) {
    set_to_fail();
    print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
  }
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  $idx ++;
}
close(OS);
$success == 0 || print "--> $dict/optional_silence.txt is OK\n";
print "\n";
# Checking nonsilence_phones.txt -------------------------------
print "Checking $dict/nonsilence_phones.txt ...\n";
if (-z "$dict/nonsilence_phones.txt") {
  print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;
}
if (!open(NS, "<$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;
}
$idx = 1;
%nonsilence = ();
$success = 1;
$crlf = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
check_allowed_whitespace(\*NS) or set_to_fail();
while (<NS>) {
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
  }
  foreach (0 .. @col-1) {
    my $p = $col[$_];
    if ($nonsilence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
    } else {
      $nonsilence{$p} = 1;
    }
    # phones that start with the pound sign/hash may be mistaken for
    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
    # problems with word-position-dependent systems, and <eps> is obviously
    # confusable with epsilon.
    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>") {
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx ++;
}
close(NS);
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";
# Checking disjoint -------------------------------
sub intersect {
  my ($a, $b) = @_;
  @itset = ();
  %itset = ();
  foreach (keys %$a) {
    if (exists $b->{$_} and !$itset{$_}) {
      push(@itset, $_);
      $itset{$_} = 1;
    }
  }
  return @itset;
}

print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if (@itset == 0) {
  print "--> disjoint property is OK.\n";
} else {
  set_to_fail();
  print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
  foreach (@itset) { print "$_ "; }
  print "\n";
}
print "\n";
sub check_lexicon {
  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
  print "Checking $lex\n";
  !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail();
  my %seen_line = {};
  $idx = 1;
  $success = 1;
  $crlf = 1;
  print "--> reading $lex\n";
  check_allowed_whitespace(\*L) or set_to_fail();
  while (<L>) {
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    if (defined $seen_line{$_}) {
      print "--> ERROR: line '$_' of $lex is repeated\n";
      set_to_fail();
    }
    $seen_line{$_} = 1;
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
      set_to_fail();
    }
    my @col = split(" ", $_);
    $word = shift @col;
    if (!defined $word) {
      print "--> ERROR: empty lexicon line in $lex\n";
      set_to_fail();
    }
    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
      set_to_fail();
    }
    for ($n = 0; $n < $num_prob_cols; $n++) {
      $prob = shift @col;
      if (!($prob > 0.0 && $prob <= 1.0)) {
        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
        set_to_fail();
      }
    }
    for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
    if (@col == 0) {
      print "--> ERROR: lexicon.txt contains word $word with empty ";
      print "pronunciation.\n";
      set_to_fail();
    }
    foreach (0 .. @col-1) {
      if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
        print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt ";
        print "(line $idx)\n";
        set_to_fail();
      }
    }
    $idx ++;
  }
  close(L);
  $success == 0 || print "--> $lex is OK\n";
  print "\n";
}
if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
if (-f "$dict/lexiconp_silprob.txt") {
  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
  # exist.
  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
  if (-f "$dict/silprob.txt") {
    !open(SP, "<$dict/silprob.txt") &&
      print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail();
    $crlf = 1;
    while (<SP>) {
      if ($crlf == 1 && m/\r/) {
        print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
        set_to_fail();
        $crlf = 0;
      }
      chomp;
      my @col = split;
      @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail();
      if ($col[0] eq "<s>" || $col[0] eq "overall") {
        if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
          set_to_fail();
          print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n";
        }
      } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
        if ($col[1] <= 0.0) {
          set_to_fail();
          print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n";
        }
      } else {
        print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n";
        set_to_fail();
      }
    }
    close(SP);
  } else {
    set_to_fail();
    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
  }
}

if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n";
  set_to_fail();
}
sub check_lexicon_pair {
  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
  # We have checked individual lexicons already.
  open(L1, "<$lex1");
  open(L2, "<$lex2");
  print "Checking lexicon pair $lex1 and $lex2\n";
  my $line_num = 0;
  while (<L1>) {
    $line_num++;
    @A = split;
    $line_B = <L2>;
    if (!defined $line_B) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail();
      last;
    }
    @B = split(" ", $line_B);
    # Check if the word matches.
    if ($A[0] ne $B[0]) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail();
      last;
    }
    shift @A;
    shift @B;
    for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n++) { shift @A; }
    for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n++) { shift @B; }
    # Check if the pronunciation matches
    if (join(" ", @A) ne join(" ", @B)) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail();
      last;
    }
  }
  $line_B = <L2>;
  if (defined $line_B && $exit == 0) {
    print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
    set_to_fail();
  }
  $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n";
}

# If more than one lexicon exist, we have to check if they correspond to each
# other. It could be that the user overwrote one and we need to regenerate the
# other, but we do not know which is which.
if (-f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") {
  check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0);
}
if (-f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") {
  check_lexicon_pair("$dict/lexiconp.txt", 1, 0, "$dict/lexiconp_silprob.txt", 2, 2);
}
# Checking extra_questions.txt -------------------------------
%distinguished = ();
# Keep track of all phone-pairs including nonsilence that
# are distinguished (split apart) by extra_questions.txt,
# as $distinguished{$p1,$p2} = 1. This will be used to
# make sure that we don't have pairs of phones on the same
# line in nonsilence_phones.txt that can never be
# distinguished from each other by questions. (If any two
# phones appear on the same line in nonsilence_phones.txt,
# they share a tree root, and since the automatic
# question-building treats all phones that appear on the
# same line of nonsilence_phones.txt as being in the same
# group, we can never distinguish them without resorting to
# questions in extra_questions.txt.
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
  if (!open(EX, "<$dict/extra_questions.txt")) {
    set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
  }
  $idx = 1;
  $success = 1;
  $crlf = 1;
  print "--> reading $dict/extra_questions.txt\n";
  check_allowed_whitespace(\*EX) or set_to_fail();
  while (<EX>) {
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
      set_to_fail();
    }
    my @col = split(" ", $_);
    if (@col == 0) {
      set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
    }
    foreach (0 .. @col-1) {
      if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
        set_to_fail();
        print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n";
      }
      $idx ++;
    }
    %col_hash = ();
    foreach $p (@col) { $col_hash{$p} = 1; }
    foreach $p1 (@col) {
      # Update %distinguished hash.
      foreach $p2 (keys %nonsilence) {
        if (!defined $col_hash{$p2}) {
          # for each p1 in this question and p2 not
          # in this question (and in nonsilence
          # phones)... mark p1,p2 as being split apart
          $distinguished{$p1,$p2} = 1;
          $distinguished{$p2,$p1} = 1;
        }
      }
    }
  }
  close(EX);
  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else {
  print "--> $dict/extra_questions.txt is empty (this is OK)\n";
}
if (-f "$dict/nonterminals.txt") {
  open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
  my %nonterminals = ();
  my $line_number = 1;
  while (<NT>) {
    chop;
    my @line = split(" ", $_);
    if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
      print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n";
      exit 1;
    }
    $nonterminals{$line[0]} = 1;
    $line_number++;
  }
  print "--> $dict/nonterminals.txt is OK\n";
}
# check nonsilence_phones.txt again for phone-pairs that are never
# distnguishable. (note: this situation is normal and expected for silence
# phones, so we don't check it.)
if (!open(NS, "<$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n";
  exit 1;
}

$num_warn_nosplit = 0;
$num_warn_nosplit_limit = 10;
while (<NS>) {
  my @col = split(" ", $_);
  foreach $p1 (@col) {
    foreach $p2 (@col) {
      if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
        set_to_fail();
        if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
          print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
        }
        if ($num_warn_nosplit == $num_warn_nosplit_limit) {
          print "... Not warning any more times about this issue.\n";
        }
        if ($num_warn_nosplit == 0) {
          print "(note: we started checking for this only recently. You can still build a system but\n";
          print "phones $p1 and $p2 will be acoustically indistinguishable).\n";
        }
        $num_warn_nosplit++;
      }
    }
  }
}
if ($exit == 1) {
  print "--> ERROR validating dictionary directory $dict (see detailed error ";
  print "messages above)\n\n";
  exit 1;
} else {
  print "--> SUCCESS [validating dictionary directory $dict]\n\n";
}

exit 0;
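A hedged usage sketch for this validator; data/local/dict is the example path from the script's own usage message and is not necessarily produced by this recipe.

# Validate a pronunciation-dictionary directory (silence_phones.txt,
# nonsilence_phones.txt, optional_silence.txt, lexicon.txt, ...).
tools/validate_dict_dir.pl data/local/dict || exit 1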
examples/aishell/s0/tools/validate_text.pl
0 → 100755
View file @
a7785cc6
#!/usr/bin/env perl
#
#===============================================================================
# Copyright 2017 Johns Hopkins University (author: Yenda Trmal <jtrmal@gmail.com>)
# Johns Hopkins University (author: Daniel Povey)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
# validation script for data/<dataset>/text
# to be called (preferably) from utils/validate_data_dir.sh
use strict;
use warnings;
use utf8;
use Fcntl qw< SEEK_SET >;

# this function reads the opened file (supplied as a first
# parameter) into an array of lines. For each
# line, it tests whether it's a valid utf-8 compatible
# line. If all lines are valid utf-8, it returns the lines
# decoded as utf-8, otherwise it assumes the file's encoding
# is one of those 1-byte encodings, such as ISO-8859-x
# or Windows CP-X.
# Please recall we do not really care about
# the actually encoding, we just need to
# make sure the length of the (decoded) string
# is correct (to make the output formatting looking right).
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;
  my $raw_text;
  my $lineno = 0;
  my $file = shift;

  while (<$file>) {
    $raw_text = $_;
    last unless $raw_text;
    if ($is_utf_compatible) {
      my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) };
      $is_utf_compatible = $is_utf_compatible && defined($decoded_text);
      push @unicode_lines, $decoded_text;
    } else {
      #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n";
      ;
    }
    push @raw_lines, $raw_text;
    $lineno += 1;
  }

  if (!$is_utf_compatible) {
    return (0, @raw_lines);
  } else {
    return (1, @unicode_lines);
  }
}

# check if the given unicode string contain unicode whitespaces
# other than the usual four: TAB, LF, CR and SPACE
sub validate_utf8_whitespaces {
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) {
    my $current_line = $unicode_lines->[$i];
    if ((substr $current_line, -1) ne "\n") {
      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
      return 1;
    }
    my @A = split(" ", $current_line);
    my $utt_id = $A[0];
    # we replace TAB, LF, CR, and SPACE
    # this is to simplify the test
    if ($current_line =~ /\x{000d}/) {
      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
      return 1;
    }
    $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
    if ($current_line =~ /\s/) {
      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
      return 1;
    }
  }
  return 0;
}

# checks if the text in the file (supplied as the argument) is utf-8 compatible
# if yes, checks if it contains only allowed whitespaces. If no, then does not
# do anything. The function seeks to the original position in the file after
# reading the text.
sub check_allowed_whitespace {
  my $file = shift;
  my $filename = shift;
  my $pos = tell($file);
  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);
  seek($file, $pos, SEEK_SET);
  if ($is_utf) {
    my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
    if ($has_invalid_whitespaces) {
      print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n";
      return 0;
    }
  }
  return 1;
}

if (@ARGV != 1) {
  die "Usage: validate_text.pl <text-file>\n" .
      "e.g.: validate_text.pl data/train/text\n";
}

my $text = shift @ARGV;

if (-z "$text") {
  print STDERR "$0: ERROR: file '$text' is empty or does not exist\n";
  exit 1;
}

if (!open(FILE, "<$text")) {
  print STDERR "$0: ERROR: failed to open $text\n";
  exit 1;
}

check_allowed_whitespace(\*FILE, $text) or exit 1;
close(FILE);
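A minimal usage sketch; data/train/text is the example path from the script's own usage string. The script exits non-zero if the transcript file is empty or contains disallowed whitespace.

tools/validate_text.pl data/train/text || exit 1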
examples/aishell/s0/tools/wav2dur.py
0 → 100755
View file @
a7785cc6
#!/usr/bin/env python3
# encoding: utf-8
import sys

import torchaudio

torchaudio.set_audio_backend("sox_io")

scp = sys.argv[1]
dur_scp = sys.argv[2]

with open(scp, 'r') as f, open(dur_scp, 'w') as fout:
    cnt = 0
    total_duration = 0
    for l in f:
        items = l.strip().split()
        wav_id = items[0]
        fname = items[1]
        cnt += 1
        waveform, rate = torchaudio.load(fname)
        frames = len(waveform[0])
        duration = frames / float(rate)
        total_duration += duration
        fout.write('{} {}\n'.format(wav_id, duration))

print('process {} utts'.format(cnt))
print('total {} s'.format(total_duration))
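A usage sketch, assuming a standard wav.scp of "<utt-id> <wav-path>" lines; the input/output paths below are illustrative.

# Writes one "<utt-id> <duration-in-seconds>" line per wav.scp entry.
python3 tools/wav2dur.py data/train/wav.scp data/train/wav2dur.txt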
examples/aishell/s0/tools/wav_to_duration.sh
0 → 100755
View file @
a7785cc6
#!/bin/bash
# split the wav scp, calculate duration and merge
nj=4

. tools/parse_options.sh || exit 1;

inscp=$1
outscp=$2
data=$(dirname ${inscp})

if [ $# -eq 3 ]; then
  logdir=$3
else
  logdir=${data}/log
fi
mkdir -p ${logdir}

rm -f $logdir/wav_*.slice
rm -f $logdir/wav_*.shape
split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_

for slice in `ls $logdir/wav_*.slice`; do
{
  name=`basename -s .slice $slice`
  tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log
} &
done
wait
cat $logdir/wav_*.shape > $outscp
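A usage sketch; the --nj option is handled by the sourced tools/parse_options.sh (it overrides the default nj=4), and the output path is illustrative.

# Split wav.scp into 8 slices, compute durations in parallel, then merge.
tools/wav_to_duration.sh --nj 8 data/train/wav.scp data/train/wav.dur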