Commit a7785cc6 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

delete soft link

parent 9a2a05ca
#!/bin/bash
# 2020 @kamo-naoyuki
# This file was copied from Kaldi and
# I deleted parts related to wav duration
# because we shouldn't use kaldi's command here
# and we don't need the files actually.
# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
# 2014 Tom Ko
# 2018 Emotech LTD (author: Pawel Swietojanski)
# Apache 2.0
# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
# wav.scp
# spk2utt
# utt2spk
# text
#
# It generates the files which are used for perturbing the speed of the original data.
# Force the C locale for every tool invoked below.
export LC_ALL=C
# Abort on a failing command, an unset variable, or a failing pipeline stage.
set -euo pipefail
# Exactly three positional arguments are required; otherwise print usage and stop.
if [[ $# != 3 ]]; then
echo "Usage: perturb_data_dir_speed.sh <warping-factor> <srcdir> <destdir>"
echo "e.g.:"
echo " $0 0.9 data/train_si284 data/train_si284p"
exit 1
fi
factor=$1
srcdir=$2
destdir=$3
# Every speaker-id, utterance-id and recording-id gets the prefix "sp<factor>-".
label="sp"
spk_prefix="${label}${factor}-"
utt_prefix="${label}${factor}-"
#check is sox on the path
! command -v sox &>/dev/null && echo "sox: command not found" && exit 1;
if [[ ! -f ${srcdir}/utt2spk ]]; then
echo "$0: no such file ${srcdir}/utt2spk"
exit 1;
fi
# Writing into the source directory would overwrite the files being read.
if [[ ${destdir} == "${srcdir}" ]]; then
echo "$0: this script requires <srcdir> and <destdir> to be different."
exit 1
fi
mkdir -p "${destdir}"
# Build temporary "<old-id> <prefixed-id>" maps from the first column of
# utt2spk, spk2utt and wav.scp. They are removed again near the end.
<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map"
<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map"
<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map"
# utt2uniq maps each prefixed utterance-id back to an un-prefixed id: the
# original utterance-id when the source has no utt2uniq, otherwise the second
# column of the source utt2uniq.
if [[ ! -f ${srcdir}/utt2uniq ]]; then
<"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq"
else
<"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq"
fi
# Rename column 1 (utterance) and column 2 (speaker) of utt2spk, then derive
# spk2utt from the result.
<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \
utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk
utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt
if [[ -f ${srcdir}/segments ]]; then
# With a segments file: rename utterance (col 1) and recording (col 2), and
# divide the start/end columns by the factor. A segment is written only when
# its scaled end exceeds its scaled start by more than 0.01.
utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \
utils/apply_map.pl -f 2 "${destdir}"/reco_map | \
awk -v factor="${factor}" \
'{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \
>"${destdir}"/segments
# wav.scp is renamed through reco_map here. The sed rewrites a trailing "|"
# (with any trailing spaces) as " |" so awk sees the pipe as its own last
# field. In the awk program, "$_" is "$" applied to the unset variable "_",
# i.e. $0, which starts with a space after $1 has been blanked.
utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \
# Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename"
awk -v factor="${factor}" \
'{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" }
else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \
> "${destdir}"/wav.scp
if [[ -f ${srcdir}/reco2file_and_channel ]]; then
utils/apply_map.pl -f 1 "${destdir}"/reco_map \
<"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel
fi
else # no segments->wav indexed by utterance.
if [[ -f ${srcdir}/wav.scp ]]; then
# Same rewrite as in the segments branch, but the first column is renamed
# through utt_map instead of reco_map.
utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \
# Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename"
awk -v factor="${factor}" \
'{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" }
else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \
> "${destdir}"/wav.scp
fi
fi
# Optional per-utterance / per-speaker tables: only the first column is renamed.
if [[ -f ${srcdir}/text ]]; then
utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text
fi
if [[ -f ${srcdir}/spk2gender ]]; then
utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender
fi
if [[ -f ${srcdir}/utt2lang ]]; then
utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang
fi
# Drop the temporary maps; rm's own error output is discarded.
rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null
echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}"
# Final check of the generated directory; its exit status is the script's.
utils/validate_data_dir.sh --no-feats --no-text "${destdir}"
#!/bin/bash
# koried, 10/29/2012
# Reduce a data set based on a list of turn-ids
# Argument handling for the data-set reduction script.
# Positional arguments: <srcdir> <turnlist> <destdir>.
help_message="usage: $0 srcdir turnlist destdir"

# "${1:-}" keeps the test well-formed when no argument is given; an unquoted
# $1 makes `[` fail with "unary operator expected" in that case.
if [ "${1:-}" == "--help" ]; then
  echo "${help_message}"
  exit 0
fi

if [ $# != 3 ]; then
  echo "${help_message}"
  exit 1
fi

srcdir=$1
reclist=$2
destdir=$3

# utt2spk is the one file every later step reads, so fail early without it.
if [ ! -f "${srcdir}/utt2spk" ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1
fi
#######################################
# Filter the optional files of ${srcdir} into ${destdir}.
# Assumes ${destdir}/utt2spk and ${destdir}/spk2utt already exist.
# Globals:   srcdir (read), destdir (read)
# Arguments: none
# Outputs:   writes the filtered files into ${destdir} and prints a one-line
#            utterance-count summary to stdout
#######################################
do_filtering() {
  local name srcutts destutts

  # Tables filtered against the reduced utt2spk.
  for name in feats.scp wav.scp text utt2num_frames; do
    if [ -f "${srcdir}/${name}" ]; then
      utils/filter_scp.pl "${destdir}/utt2spk" <"${srcdir}/${name}" >"${destdir}/${name}"
    fi
  done

  # Tables filtered against the reduced spk2utt.
  for name in spk2gender cmvn.scp; do
    if [ -f "${srcdir}/${name}" ]; then
      utils/filter_scp.pl "${destdir}/spk2utt" <"${srcdir}/${name}" >"${destdir}/${name}"
    fi
  done

  if [ -f "${srcdir}/segments" ]; then
    utils/filter_scp.pl "${destdir}/utt2spk" <"${srcdir}/segments" >"${destdir}/segments"
    # Second column of the filtered segments = the recordings still in use.
    awk '{print $2;}' "${destdir}/segments" | sort | uniq >"${destdir}/reco"
    # With a segments file, wav.scp is re-filtered by recording, replacing the
    # utt2spk-based wav.scp written above (presumably because wav.scp is then
    # keyed by recording-id -- confirm against the data layout). Filtering stm
    # this way also drops its comment lines.
    for name in wav.scp reco2file_and_channel stm; do
      if [ -f "${srcdir}/${name}" ]; then
        utils/filter_scp.pl "${destdir}/reco" <"${srcdir}/${name}" >"${destdir}/${name}"
      fi
    done
    rm "${destdir}/reco"
  fi

  srcutts=$(wc -l <"${srcdir}/utt2spk")
  destutts=$(wc -l <"${destdir}/utt2spk")
  echo "Reduced #utt from $srcutts to $destutts"
}
# Driver: build the reduced utt2spk/spk2utt pair, then filter everything else.
# All paths are quoted so directories containing whitespace work.
mkdir -p "${destdir}"
# filter the utt2spk based on the set of recordings
utils/filter_scp.pl "${reclist}" <"${srcdir}/utt2spk" >"${destdir}/utt2spk"
utils/utt2spk_to_spk2utt.pl <"${destdir}/utt2spk" >"${destdir}/spk2utt"
do_filtering
#!/usr/bin/env python3
# encoding: utf-8
import argparse
def _leading_dim(shape_field):
    """Return the first dimension of a ``name:d0,d1,...`` field as a float."""
    return float(shape_field.split(':')[1].split(',')[0])


def main(argv=None):
    """Copy the lines of a format.data file whose lengths are within bounds.

    Each non-empty input line is tab separated; field 2 carries the feature
    shape and field 6 the token shape, both written as ``name:len,...``.
    A line is kept only when the input length, the output length and the
    output/input ratio all lie strictly inside their configured bounds.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description='remove too long or too short data in format.data')
    parser.add_argument('--data_file',
                        type=str,
                        help='input format data')
    parser.add_argument('--output_data_file',
                        type=str,
                        help='output format data')
    parser.add_argument('--min_input_len', type=float,
                        default=0,
                        help='minimum input seq length, in seconds for raw '
                             'wav, in frame numbers for feature data')
    parser.add_argument('--max_input_len', type=float,
                        default=20,
                        help='maximum input seq length, in seconds for raw '
                             'wav, in frame numbers for feature data')
    parser.add_argument('--min_output_len', type=float,
                        default=0,
                        help='minimum output seq length, in modeling units')
    parser.add_argument('--max_output_len', type=float,
                        default=500,
                        help='maximum output seq length, in modeling units')
    parser.add_argument('--min_output_input_ratio', type=float, default=0.05,
                        help='minimum output seq length/input seq length '
                             'ratio')
    parser.add_argument('--max_output_input_ratio', type=float, default=10,
                        help='maximum output seq length/input seq length '
                             'ratio')
    args = parser.parse_args(argv)

    with open(args.data_file, 'r') as fin, \
            open(args.output_data_file, 'w') as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            items = line.split('\t')
            feat_len = _leading_dim(items[2])
            token_len = _leading_dim(items[6])
            # The ratio is undefined for a zero-length input; drop such lines
            # rather than dividing by zero.
            if feat_len == 0:
                continue
            keep = (args.min_input_len < feat_len < args.max_input_len
                    and args.min_output_len < token_len < args.max_output_len
                    and args.min_output_input_ratio
                    < token_len / feat_len
                    < args.max_output_input_ratio)
            if keep:
                fout.write('{}\n'.format(line))


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# Copyright (c) 2021 Mobvoi Inc. (Di Wu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
import argparse
def main(argv=None):
    """Write a segmented wav.scp from a segments file and an original wav.scp.

    For every segments line ``<utt> <rec> <start> <end>`` whose ``<rec>`` is a
    key of the original wav.scp, one line ``<utt> <wav>,<start>,<end>`` is
    written. Segments that refer to an unknown recording are skipped.
    NOTE(review): SOURCE lost its indentation; the write is assumed to sit
    inside the ``if <rec> in wav_dic`` branch -- confirm against the upstream
    file.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='generate segmented wav.scp')
    parser.add_argument('--segments', required=True, help='segments file')
    parser.add_argument('--input',
                        required=True,
                        help='origin wav.scp that not segmented')
    parser.add_argument('--output',
                        required=True,
                        help='output segmented wav.scp')
    args = parser.parse_args(argv)

    # recording-id -> second whitespace-separated token of its wav.scp line
    wav_dic = {}
    with open(args.input, 'r') as ori:
        for line in ori:
            item = line.split()
            if len(item) < 2:
                # blank or key-only line: nothing to map
                continue
            wav_dic[item[0]] = item[1]

    with open(args.output, 'w') as fout, open(args.segments, 'r') as seg_file:
        for line in seg_file:
            item = line.split()
            if len(item) < 4:
                # blank or truncated segment line
                continue
            if item[1] in wav_dic:
                fout.write("{} {},{},{}\n".format(item[0], wav_dic[item[1]],
                                                   item[2], item[3]))


if __name__ == '__main__':
    main()
#!/usr/bin/env bash
# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet)
# Install Miniconda into <output> (default: venv), optionally create and
# activate a named environment, install pip/setuptools (and a specific Python
# when requested), and generate ./activate_python.sh.
#
# Usage: setup_anaconda.sh [output] [conda-env-name] [python-version]
set -euo pipefail

# Give PS1 a placeholder value when it is unset or empty; presumably the conda
# shell hooks sourced below read it, which would abort under `set -u`.
if [ -z "${PS1:-}" ]; then
  PS1=__dummy__
fi

CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh

# At most three positional arguments are understood. (The previous `-gt 4`
# test let exactly four arguments through, leaving output_dir unset.)
if [ $# -gt 3 ]; then
  echo "Usage: $0 [output] [conda-env-name] [python-version]"
  exit 1
fi
# "${N-default}" substitutes only when the argument is absent, so an explicitly
# passed empty string is kept as given.
output_dir="${1-venv}"
name="${2-}"
PYTHON_VERSION="${3-}"

if [ -e activate_python.sh ]; then
  echo "Warning: activate_python.sh already exists. It will be overwritten"
fi

# Install Miniconda only when the target does not already contain conda.sh;
# reuse a previously downloaded installer when one is present.
if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then
  if [ ! -e miniconda.sh ]; then
    wget --tries=3 "${CONDA_URL}" -O miniconda.sh
  fi
  bash miniconda.sh -b -p "${output_dir}"
fi

# shellcheck disable=SC1090
source "${output_dir}/etc/profile.d/conda.sh"
conda deactivate

# If the env already exists, skip recreation
if [ -n "${name}" ] && ! conda activate "${name}"; then
  conda create -yn "${name}"
fi
# With an empty name this must be a bare `conda activate` (no argument at all),
# hence the ":+" expansion instead of plain quoting.
conda activate ${name:+"${name}"}

if [ -n "${PYTHON_VERSION}" ]; then
  conda install -y conda "python=${PYTHON_VERSION}"
else
  conda install -y conda
fi
conda install -y pip setuptools

# Absolute installation path, resolved once so the generated file does not
# depend on the directory it is later sourced from.
conda_root="$(cd "${output_dir}" && pwd)"

cat << EOF > activate_python.sh
#!/usr/bin/env bash
# THIS FILE IS GENERATED BY tools/setup_anaconda.sh
if [ -z "\${PS1:-}" ]; then
PS1=__dummy__
fi
. "${conda_root}/etc/profile.d/conda.sh" && conda deactivate && conda activate ${name}
EOF
#!/bin/bash
# convert sph scp to segmented wav scp
# convert sph scp to segmented wav scp
#
# Usage: <this-script> [--nj N] <in.scp> <segments> <out.scp> [<logdir>]
# Every recording in <in.scp> is expanded into two channel entries (-A / -B)
# decoded through sph2pipe; every segment is then cut with sox into
# <dir-of-in.scp>/wavs/<slice>/<utt>.wav and listed in <out.scp>.
nj=1
. tools/parse_options.sh || exit 1;

if [ $# -lt 3 ]; then
  echo "Usage: $0 [--nj N] <in.scp> <segments> <out.scp> [<logdir>]"
  exit 1
fi

inscp=$1
segments=$2
outscp=$3
data=$(dirname "${inscp}")
if [ $# -eq 4 ]; then
  logdir=$4
else
  logdir=${data}/log
fi
mkdir -p "${logdir}"

sph2pipe_version="v2.5"
if [ ! -d "tools/sph2pipe_${sph2pipe_version}" ]; then
  echo "Download sph2pipe_${sph2pipe_version} ......"
  wget -T 10 -t 3 -P tools "https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz" || \
    wget -T 10 -c -P tools "https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz"
  tar --no-same-owner -xzf "tools/sph2pipe_${sph2pipe_version}.tar.gz" -C tools
  # Build in a subshell so a failed cd cannot move the script elsewhere.
  (cd "tools/sph2pipe_${sph2pipe_version}/" && gcc -o sph2pipe *.c -lm)
fi

# Prefer an sph2pipe on PATH, otherwise fall back to the local build.
sph2pipe=$(command -v sph2pipe) || sph2pipe="$(pwd)/tools/sph2pipe_${sph2pipe_version}/sph2pipe"
if [ ! -x "${sph2pipe}" ]; then
  echo "Could not find the sph2pipe program at $sph2pipe"
  exit 1
fi

# The old unquoted `[ ! -x $sox ]` was always false when sox was missing
# (it degenerated to `[ ! -x ]`), so the check never fired.
sox=$(command -v sox) || sox=""
if [ -z "${sox}" ] || [ ! -x "${sox}" ]; then
  echo "Could not find the sox program in PATH"
  exit 1
fi

# One entry per channel. '#' stands in for spaces inside the command so that
# each entry stays a single whitespace-free token for tools/segment.py.
awk -v sph2pipe="${sph2pipe}" '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2);
printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' <"${inscp}" | \
  sort >"${data}/wav_ori.scp" || exit 1;

tools/segment.py --segments "${segments}" --input "${data}/wav_ori.scp" --output "${data}/wav_segments.scp"
# Make the file purely comma separated, then turn '#' back into real spaces.
sed -i 's/ /,/g' "${data}/wav_segments.scp"
sed -i 's/#/ /g' "${data}/wav_segments.scp"

rm -f "${logdir}"/wav_*.slice
rm -f "${logdir}"/*.log
split --additional-suffix .slice -d -n "l/${nj}" "${data}/wav_segments.scp" "${logdir}/wav_"

pids=()
for slice in "${logdir}"/wav_*.slice; do
  [ -e "${slice}" ] || continue
  {
    name=$(basename -s .slice "${slice}")
    mkdir -p "${data}/wavs/${name}"
    # Resolve the output directory itself; prefixing $(pwd) to ${data} gave a
    # wrong path whenever <in.scp> was given as an absolute path.
    wavdir=$(cd "${data}/wavs/${name}" && pwd) || exit 1
    # Fields per line: utt-id, decode command (ends in '|'), start, end.
    awk -F ',' -v sox="${sox}" -v data="${wavdir}" \
      -v logdir="${logdir}" -v name="${name}" '{
during=$4-$3
cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during;
system(cmd)
printf("%s %s/%s.wav\n", $1, data, $1);
}' <"${slice}" | \
      sort >"${data}/wavs_${name}.scp" || exit 1;
  } &
  pids+=("$!")
done

# Wait for each job individually so that a failed slice is noticed.
status=0
for pid in "${pids[@]}"; do
  wait "${pid}" || status=1
done
if [ "${status}" -ne 0 ]; then
  echo "$0: at least one slice failed" >&2
  exit 1
fi

cat "${data}"/wavs_*.scp >"${outscp}"
rm "${data}"/wavs_*.scp
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Invert a spk2utt table read from the files named on the command line (or
# from stdin): every input line "<spk> <utt1> <utt2> ..." becomes one output
# line "<utt> <spk>" per utterance. A line with fewer than two fields is fatal.
while (my $line = <>) {
  my @fields = split(" ", $line);
  die "Invalid line in spk2utt file: $line" unless @fields > 1;
  my ($spk, @utts) = @fields;
  print "$_ $spk\n" for @utts;
}
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# https://github.com/pytorch/fairseq/blob/master/LICENSE
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import sys
import sentencepiece as spm
def main():
    """Decode SentencePiece output back to text, one sentence per line.

    Reads whitespace-separated tokens from ``--input`` (stdin when omitted)
    and prints the decoded sentence for every line. Tokens are pieces with
    ``--input_format piece`` and integer ids with ``--input_format id``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True,
                        help="sentencepiece model to use for decoding")
    parser.add_argument("--input", default=None, help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"],
                        default="piece")
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    def tok2int(tok):
        # remap reference-side <unk> (represented as <<unk>>) to 0
        return int(tok) if tok != "<<unk>>" else 0

    if args.input_format == "piece":
        def decode(tokens):
            return "".join(sp.DecodePieces(tokens))
    elif args.input_format == "id":
        def decode(tokens):
            # The tokens arrive as strings; convert them to integer ids
            # (tok2int was defined for this but never applied before).
            return "".join(sp.DecodeIds([tok2int(t) for t in tokens]))
    else:
        raise NotImplementedError

    if args.input is None:
        h = sys.stdin
    else:
        h = open(args.input, "r", encoding="utf-8")
    try:
        for line in h:
            print(decode(line.split()))
    finally:
        # Close only what was opened here; stdin stays open.
        if h is not sys.stdin:
            h.close()


if __name__ == "__main__":
    main()
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in
# https://github.com/pytorch/fairseq/blob/master/LICENSE
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import contextlib
import sys
import sentencepiece as spm
def main():
    """Encode parallel text files with a SentencePiece model.

    The input files are read in lockstep, one line from each. A group of
    lines is written (space-joined tokens, one output per input) only when
    every line of the group is non-empty and passes the optional
    --min-len / --max-len token-count filter. Counts of empty and filtered
    lines are reported on stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True,
                        help="sentencepiece model to use for encoding")
    parser.add_argument("--inputs", nargs="+", default=['-'],
                        help="input files to filter/encode")
    parser.add_argument("--outputs", nargs="+", default=['-'],
                        help="path to save encoded outputs")
    parser.add_argument("--output_format", choices=["piece", "id"],
                        default="piece")
    parser.add_argument("--min-len", type=int, metavar="N",
                        help="filter sentence pairs with fewer than N tokens")
    parser.add_argument("--max-len", type=int, metavar="N",
                        help="filter sentence pairs with more than N tokens")
    args = parser.parse_args()

    assert len(args.inputs) == len(args.outputs), \
        "number of input and output paths should match"

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    # Pick the tokenizer once; both variants return a list of strings.
    if args.output_format == "piece":
        def encode(text):
            return sp.EncodeAsPieces(text)
    elif args.output_format == "id":
        def encode(text):
            return [str(idx) for idx in sp.EncodeAsIds(text)]
    else:
        raise NotImplementedError

    def valid(tokens):
        # Each bound applies only when the corresponding option was given.
        if args.min_len is not None and len(tokens) < args.min_len:
            return False
        if args.max_len is not None and len(tokens) > args.max_len:
            return False
        return True

    with contextlib.ExitStack() as stack:
        # "-" selects stdin / stdout; real files are closed by the ExitStack.
        inputs = []
        for in_path in args.inputs:
            if in_path == "-":
                inputs.append(sys.stdin)
            else:
                inputs.append(stack.enter_context(
                    open(in_path, "r", encoding="utf-8")))
        outputs = []
        for out_path in args.outputs:
            if out_path == "-":
                outputs.append(sys.stdout)
            else:
                outputs.append(stack.enter_context(
                    open(out_path, "w", encoding="utf-8")))

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        def encode_line(raw):
            # Returns the token list, or None when the line is to be dropped.
            text = raw.strip()
            if len(text) == 0:
                stats["num_empty"] += 1
                return None
            tokens = encode(text)
            if not valid(tokens):
                stats["num_filtered"] += 1
                return None
            return tokens

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = [encode_line(raw) for raw in lines]
            if all(enc is not None for enc in enc_lines):
                for enc, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]),
              file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]),
              file=sys.stderr)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# https://github.com/pytorch/fairseq/blob/master/LICENSE
import sys
import sentencepiece as spm
if __name__ == "__main__":
    # Forward all command-line arguments, joined by single spaces, to the
    # SentencePiece trainer as one flag string.
    trainer_flags = " ".join(sys.argv[1:])
    spm.SentencePieceTrainer.Train(trainer_flags)
#!/usr/bin/env bash
# Copyright 2010-2011 Microsoft Corporation
# 2012-2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script operates on a data directory, such as in data/train/.
# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data
# for what these directories contain.
# This script creates a subset of that data, consisting of some specified
# number of utterances. (The selected utterances are distributed evenly
# throughout the file, by the program ./subset_scp.pl).
# There are six options, none compatible with any other.
# If you give the --per-spk option, it will attempt to select the supplied
# number of utterances for each speaker (typically you would supply a much
# smaller number in this case).
# If you give the --speakers option, it selects a subset of n randomly
# selected speakers.
# If you give the --shortest option, it will give you the n shortest utterances.
# If you give the --first option, it will just give you the n first utterances.
# If you give the --last option, it will just give you the n last utterances.
# If you give the --spk-list or --utt-list option, it reads the
# speakers/utterances to keep from <speaker-list-file>/<utt-list-file>" (note,
# in this case there is no <num-utt> positional parameter; see usage message.)
# Defaults: no selection mode chosen, three positional arguments expected.
shortest=false
perspk=false
speakers=false
first_opt=
spk_list=
utt_list=
expect_args=3
# Consume at most one leading option. The two list options also consume their
# file argument and lower the expected positional count to 2, because no
# <num-utt> is given with them.
case $1 in
--first|--last) first_opt=$1; shift ;;
--per-spk) perspk=true; shift ;;
--shortest) shortest=true; shift ;;
--speakers) speakers=true; shift ;;
--spk-list) shift; spk_list=$1; shift; expect_args=2 ;;
--utt-list) shift; utt_list=$1; shift; expect_args=2 ;;
--*) echo "$0: invalid option '$1'"; exit 1
esac
# Wrong number of remaining arguments: print the full usage and stop.
if [ $# != $expect_args ]; then
echo "Usage:"
echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
echo " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>"
echo "By default, randomly selects <num-utt> utterances from the data directory."
echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
echo "With --per-spk, selects <num-utt> utterances per speaker, if available."
echo "With --first, selects the first <num-utt> utterances"
echo "With --last, selects the last <num-utt> utterances"
echo "With --shortest, selects the shortest <num-utt> utterances."
echo "With --spk-list, reads the speakers to keep from <speaker-list-file>"
echo "With --utt-list, reads the utterances to keep from <utt-list-file>"
exit 1;
fi
srcdir=$1
# With a list option there is no <num-utt>, so <destdir> is the second argument.
if [[ $spk_list || $utt_list ]]; then
numutt=
destdir=$2
else
numutt=$2
destdir=$3
fi
export LC_ALL=C
if [ ! -f $srcdir/utt2spk ]; then
echo "$0: no such file $srcdir/utt2spk"
exit 1
fi
# Only checked when <num-utt> is set: it may not exceed the line count of utt2spk.
if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
echo "$0: cannot subset to more utterances than you originally had."
exit 1
fi
if $shortest && [ ! -f $srcdir/feats.scp ]; then
echo "$0: you selected --shortest but no feats.scp exist."
exit 1
fi
mkdir -p $destdir || exit 1
# Build utt2spk and spk2utt in $destdir according to the selected mode.
if [[ $spk_list ]]; then
# Speaker list: filter spk2utt first, then derive utt2spk from it.
tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
elif [[ $utt_list ]]; then
# Utterance list: filter utt2spk first, then derive spk2utt from it.
tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
elif $speakers; then
# Pass spk2utt through shuffle_list.pl (presumably a random reordering), then
# keep whole speaker lines for as long as the running utterance count
# (NF-1 per line) is still below numutt.
tools/shuffle_list.pl < $srcdir/spk2utt |
awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' |
sort > $destdir/spk2utt
tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
elif $perspk; then
# Per speaker: "skip" is grown to the largest stride with n*skip <= number of
# utterances, and utterances are then printed at that stride while the field
# index stays <= n*skip+1. $numutt is spliced into the awk program text.
awk '{ n='$numutt'; printf("%s ",$1);
skip=1; while(n*(skip+1) <= NF-1) { skip++; }
for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); }
printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
else
if $shortest; then
# Select $numutt shortest utterances.
# path.sh is sourced here, presumably to put feat-to-len on PATH.
. ./path.sh
feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
sort -n -k2 $destdir/tmp.len |
awk '{print $1}' |
head -$numutt >$destdir/tmp.uttlist
tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
rm $destdir/tmp.uttlist $destdir/tmp.len
else
# Select $numutt random utterances.
# $first_opt is deliberately unquoted: when empty it must vanish from the
# argument list.
tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
fi
tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
fi
# Perform filtering. utt2spk and spk2utt files already exist by this point.
# Filter by utterance.
[ -f $srcdir/feats.scp ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
[ -f $srcdir/vad.scp ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp
[ -f $srcdir/utt2lang ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang
[ -f $srcdir/utt2dur ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur
[ -f $srcdir/utt2num_frames ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames
[ -f $srcdir/utt2uniq ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq
[ -f $srcdir/wav.scp ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
[ -f $srcdir/utt2warp ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp
[ -f $srcdir/text ] &&
tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
# Filter by speaker.
[ -f $srcdir/spk2warp ] &&
tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp
[ -f $srcdir/spk2gender ] &&
tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
[ -f $srcdir/cmvn.scp ] &&
tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp
# Filter by recording-id.
if [ -f $srcdir/segments ]; then
tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
# Recording-ids are in segments.
awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco
# The next line overrides the command above for wav.scp, which would be incorrect.
# NOTE(review): the override below is commented out, so with a segments file
# wav.scp stays filtered by utt2spk as done above -- confirm this is intended.
#[ -f $srcdir/wav.scp ] &&
# tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
else
# No segments; recording-ids are in wav.scp.
awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco
fi
[ -f $srcdir/reco2file_and_channel ] &&
tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
[ -f $srcdir/reco2dur ] &&
tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
# Filter the STM file for proper sclite scoring.
# Copy over the comments from STM file.
[ -f $srcdir/stm ] &&
(grep "^;;" $srcdir/stm
tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm
# The recording list was only a temporary helper.
rm $destdir/reco
# Copy frame_shift if present.
[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir
# Report the utterance counts before and after subsetting.
srcutts=$(wc -l <$srcdir/utt2spk)
destutts=$(wc -l <$destdir/utt2spk)
echo "$0: reducing #utt from $srcutts to $destutts"
exit 0
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program selects a subset of N elements in the scp.
# By default, it selects them evenly from throughout the scp, in order to avoid
# selecting too many from the same speaker. It prints them on the standard
# output.
# With the option --first, it just selects the N first utterances.
# With the option --last, it just selects the N last utterances.
# Last modified by JHU & HKUST @2013
# Option and argument handling, followed by loading the scp into @F.
# Sets the globals $quiet, $first, $last, $N, $inscp, @F and $numlines that
# the rest of the script relies on.
$quiet = 0;
$first = 0;
$last = 0;

# Options are recognised only in this fixed order: --quiet, --first, --last.
if (@ARGV > 0 && $ARGV[0] eq "--quiet") {
  shift;
  $quiet = 1;
}
if (@ARGV > 0 && $ARGV[0] eq "--first") {
  shift;
  $first = 1;
}
if (@ARGV > 0 && $ARGV[0] eq "--last") {
  shift;
  $last = 1;
}

if (@ARGV < 2) {
  # --quiet matters when N is LARGER than the number of lines (see below).
  die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
      " --quiet causes it to not die if N > num lines in scp.\n" .
      " --first and --last make it equivalent to head or tail.\n" .
      "See also: filter_scp.pl\n";
}

$N = shift @ARGV;
if ($N == 0) {
  die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
}
$inscp = shift @ARGV;

# Three-argument open, so that characters in the file name are never taken as
# an open mode; $! reports why the open failed.
open(I, "<", $inscp) || die "Opening input scp file $inscp: $!";
@F = <I>;   # every line of the scp, in file order
close(I);
$numlines = @F;

# Asking for more than is available is fatal, unless --quiet was given, in
# which case N is clamped to the number of lines.
if ($N > $numlines) {
  if ($quiet) {
    $N = $numlines;
  } else {
    die "You requested from subset_scp.pl more elements than available: $N > $numlines";
  }
}
# select_n($start, $end, $num_needed)
# Prints $num_needed lines of the global @F taken from the half-open index
# range [$start, $end). The range is halved recursively and the requested
# count is split between the two halves; a range of length one prints its
# single line when at least one line is still needed. Dies when more lines
# are requested than the range contains.
sub select_n {
  my ($lo, $hi, $want) = @_;
  my $span = $hi - $lo;
  die "select_n: code error" if $want > $span;

  if ($span == 1) {
    print $F[$lo] if $want > 0;
    return;
  }

  my $mid = $lo + int($span / 2);
  my $left_want = int($want / 2);
  select_n($lo, $mid, $left_want);
  select_n($mid, $hi, $want - $left_want);
}
# Emit the selection. --first takes precedence over --last; with neither
# option the lines are picked by select_n (only when N is positive).
if ($first) {
  # --first option: same as head.
  for (my $idx = 0; $idx < $N; $idx++) {
    print $F[$idx];
  }
} elsif ($last) {
  # --last option: same as tail.
  for (my $idx = @F - $N; $idx < @F; $idx++) {
    print $F[$idx];
  }
} elsif ($N > 0) {
  select_n(0, $numlines, $N);
}
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# sym2int.pl [--map-oov <oov-symbol>] [-f <field-range>] symtab [input] > output
#
# Replaces symbols by the integers listed in the two-column symbol table
# <symtab>. Only the fields inside <field-range> (1-based, e.g. 4-5, 4-, -5
# or 1) are converted; without -f every field is. A symbol missing from the
# table is fatal unless --map-oov is given, in which case it is replaced by
# the OOV value and a bounded number of warnings goes to stderr.
$ignore_oov = 0;

# Two passes, so that --map-oov and -f are accepted in either order.
for ($x = 0; $x < 2; $x++) {
  if (@ARGV > 0 && $ARGV[0] eq "--map-oov") {
    shift @ARGV;
    $map_oov = shift @ARGV;
    if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
      # disallow '-f', the empty string and anything ending in words.txt as the
      # OOV symbol because these are likely command-line errors.
      die "the --map-oov option requires an argument";
    }
  }
  if (@ARGV > 0 && $ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1; # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1; # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ] [-f <field-range> ]\n" .
    "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
  # Stop here; without this the script went on to open an undefined file name.
  exit(1);
}

# Three-argument open keeps characters in the file name from being read as a mode.
open(F, "<", $symtab) || die "Error opening symbol table file $symtab";
while (<F>) {
  @A = split(" ", $_);
  @A == 2 || die "bad line in symbol table file: $_";
  $sym2int{$A[0]} = $A[1] + 0;
}
close(F);

if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
  $map_oov = $sym2int{$map_oov};
}

$num_warning = 0;
$max_warning = 20;

while (<>) {
  @A = split(" ", $_);
  @B = ();
  for ($n = 0; $n < @A; $n++) {
    $a = $A[$n];
    # Convert only fields inside the requested range (everything when no -f).
    if ( (!defined $field_begin || $n >= $field_begin)
         && (!defined $field_end || $n <= $field_end)) {
      $i = $sym2int{$a};
      if (!defined ($i)) {
        if (defined $map_oov) {
          # Warn for the first $max_warning replacements only; the counter
          # itself keeps running for the summary printed at the end.
          if ($num_warning++ < $max_warning) {
            print STDERR "sym2int.pl: replacing $a with $map_oov\n";
            if ($num_warning == $max_warning) {
              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
            }
          }
          $i = $map_oov;
        } else {
          $pos = $n+1;
          die "sym2int.pl: undefined symbol $a (in position $pos)\n";
        }
      }
      $a = $i;
    }
    push @B, $a;
  }
  print join(" ", @B);
  print "\n";
}
if ($num_warning > 0) {
  print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
}
exit(0);
#!/usr/bin/env python3
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import re
import sys
# True when the running interpreter's major version is 2.
is_python2 = sys.version_info[0] == 2
def exist_or_not(i, match_pos):
    """Find the first span in ``match_pos`` that contains index ``i``.

    Args:
        i: the index to look up.
        match_pos: iterable of spans; element 0 of a span is its inclusive
            start and element 1 its exclusive end.

    Returns:
        ``(start, end)`` of the first span with ``start <= i < end``, or
        ``(None, None)`` when no span contains ``i``.
    """
    for span in match_pos:
        if span[0] <= i < span[1]:
            return span[0], span[1]
    return None, None
def seg_char(sent):
    """Split ``sent`` so that each character in U+4E00..U+9FA5 stands alone.

    Text between such characters is kept as one piece, unmodified; pieces
    that are empty or whitespace-only are dropped.

    Returns:
        The list of remaining pieces, in their original order.
    """
    pieces = re.split(r'([\u4e00-\u9fa5])', sent)
    return [piece for piece in pieces if piece.strip()]
def get_parser():
    """Build the command-line parser for the tokenizer.

    Options: --nchar/-n, --skip-ncols/-s, --space, --bpe-model/-m,
    --non-lang-syms/-l, --trans_type/-t, plus one optional positional
    ``text`` argument whose default is ``False``.
    """
    parser = argparse.ArgumentParser(
        description='convert raw text to tokenized text',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    add('--nchar', '-n',
        type=int,
        default=1,
        help='number of characters to split, i.e., '
             'aabb -> a a b b with -n 1 and aa bb with -n 2')
    add('--skip-ncols', '-s',
        type=int,
        default=0,
        help='skip first n columns')
    add('--space',
        type=str,
        default='<space>',
        help='space symbol')
    add('--bpe-model', '-m',
        type=str,
        default=None,
        help='bpe model for english part')
    add('--non-lang-syms', '-l',
        type=str,
        default=None,
        help='list of non-linguistic symobles,'
             ' e.g., <NOISE> etc.')
    add('text',
        type=str,
        nargs='?',
        default=False,
        help='input text')
    add('--trans_type', '-t',
        type=str,
        choices=["char", "phn", "cn_char_en_bpe"],
        default="char",
        help='Transcript type. char/phn. e.g., for TIMIT\n'
             'FADG0_SI1279 -\n'
             'If trans_type is char, read from\n'
             'SI1279.WRD file -> "bricks are an alternative"\n'
             'Else if trans_type is phn,\n'
             'read from SI1279.PHN file ->\n'
             '"sil b r ih sil k s aa r er n aa l\n'
             'sil t er n ih sil t ih v sil" ')
    return parser
def main():
    """Tokenize each line of the input text and print it to stdout as UTF-8.

    For every input line the first ``--skip-ncols`` columns are echoed
    unchanged (followed by one space) and the rest is tokenized according to
    ``--trans_type``:

    * ``char``: groups of ``--nchar`` characters, with entries from the
      ``--non-lang-syms`` file kept as single tokens.
    * ``phn``: split on spaces; ``sil`` is replaced by the space symbol.
    * ``cn_char_en_bpe``: characters in U+4E00..U+9FA5 stand alone, ASCII
      alphabetic words are encoded with the sentencepiece model.

    Literal spaces inside tokens are replaced by ``--space``.
    """
    parser = get_parser()
    args = parser.parse_args()

    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f:
            nls = [x.rstrip() for x in f.readlines()]
            # Skip blank lines: an empty pattern matches at every position
            # without consuming input, which previously made the match scan
            # below loop forever.
            rs = [re.compile(re.escape(x)) for x in nls if x]

    sp = None
    if args.bpe_model is not None:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.load(args.bpe_model)

    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(
            sys.stdin if is_python2 else sys.stdin.buffer)

    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer)
    n = args.nchar
    try:
        line = f.readline()
        while line:
            x = line.split()
            print(' '.join(x[:args.skip_ncols]), end=" ")
            a = ' '.join(x[args.skip_ncols:])

            # get all matched positions
            match_pos = []
            for r in rs:
                match_pos.extend(
                    [m.start(), m.end()] for m in r.finditer(a))

            if len(match_pos) > 0:
                # Walk the string, emitting each matched symbol as one unit
                # and every other character on its own.
                chars = []
                i = 0
                while i < len(a):
                    start_pos, end_pos = exist_or_not(i, match_pos)
                    if start_pos is not None:
                        chars.append(a[start_pos:end_pos])
                        i = end_pos
                    else:
                        chars.append(a[i])
                        i += 1
                a = chars

            # NOTE(review): when a non-linguistic symbol matched, ``a`` is a
            # list at this point, so the "phn" and "cn_char_en_bpe" branches
            # (which expect a str) will raise. Confirm the intended
            # combination of --non-lang-syms with those transcript types.
            if (args.trans_type == "phn"):
                a = a.split(" ")
            elif args.trans_type == "cn_char_en_bpe":
                b = seg_char(a)
                a = []
                for j in b:
                    # we use "▁" to instead of blanks among english words
                    # warning: here is "▁", not "_"
                    for l in j.strip().split("▁"):
                        # bytes.isalpha() is True only for ASCII letters.
                        if not l.encode('UTF-8').isalpha():
                            a.append(l)
                        else:
                            if sp is None:
                                raise RuntimeError(
                                    "--trans_type cn_char_en_bpe needs "
                                    "--bpe-model to encode '%s'" % l)
                            a.extend(sp.encode_as_pieces(l))
            else:
                a = [a[j:j + n] for j in range(0, len(a), n)]

            a_flat = []
            for z in a:
                a_flat.append("".join(z))

            a_chars = [z.replace(' ', args.space) for z in a_flat]
            if (args.trans_type == "phn"):
                a_chars = [z.replace("sil", args.space) for z in a_chars]
            print(' '.join(a_chars))
            line = f.readline()
    finally:
        # Only close what this function opened; leave stdin alone.
        if args.text:
            f.close()


if __name__ == '__main__':
    main()
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# converts an utt2spk file to a spk2utt file.
# Takes input from the stdin or from a file argument;
# output goes to the standard out.
# At most one positional argument (the utt2spk file) is accepted; with none,
# input is read from stdin.
die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt" if @ARGV > 1;

# Group utterance ids per speaker, remembering the order in which each speaker
# is first seen so the output keeps that order.
my (@speakers, %utts_of);
while (defined(my $line = <>)) {
  my @fields = split(" ", $line);
  die "Invalid line in utt2spk file: $line" unless @fields == 2;
  my ($utt, $spk) = @fields;
  push @speakers, $spk unless exists $utts_of{$spk};
  push @{$utts_of{$spk}}, "$utt";
}

# One output line per speaker: "<speaker> <utt1> <utt2> ...".
for my $spk (@speakers) {
  print join(' ', $spk, @{$utts_of{$spk}}), "\n";
}
#!/bin/bash
# Keep the original argument string so it can be forwarded unchanged (minus
# --no-wav) when validation is delegated to another script further down.
cmd="$*"
no_feats=false
no_wav=false
no_text=false
no_spk_sort=false

#######################################
# Consume the leading --no-* flags from the given argument list.
# Flags may appear in any order and any number of times; parsing stops at the
# first argument that is not a recognised flag.
# Globals:   no_feats, no_text, no_wav, no_spk_sort (written),
#            num_leading_flags (written: number of arguments consumed)
# Arguments: the script's command-line arguments
#######################################
parse_leading_flags() {
  num_leading_flags=0
  while [ $# -gt 0 ]; do
    case "$1" in
      --no-feats) no_feats=true ;;
      --no-text) no_text=true ;;
      --no-wav) no_wav=true ;;
      --no-spk-sort) no_spk_sort=true ;;
      *) break ;;
    esac
    shift
    num_leading_flags=$((num_leading_flags + 1))
  done
}

parse_leading_flags "$@"
# A function cannot shift the caller's positional parameters, so do it here.
shift "$num_leading_flags"
# After the option flags have been shifted away exactly one argument, the data
# directory, must remain; otherwise print usage and fail.
if [ $# -ne 1 ]; then
echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] <data-dir>"
echo "The --no-xxx options mean that the script does not require "
echo "xxx.scp to be present, but it will check it if it is present."
echo "--no-spk-sort means that the script does not require the utt2spk to be "
echo "sorted by the speaker-id in addition to being sorted by utterance-id."
echo "By default, utt2spk is expected to be sorted by both, which can be "
echo "achieved by making the speaker-id prefixes of the utterance-ids"
echo "e.g.: $0 data/train"
exit 1;
fi
data=$1
if [ ! -d $data ]; then
echo "$0: no such directory $data"
exit 1;
fi
# A directory containing images.scp is handed over to
# image/validate_data_dir.sh with the original arguments (without --no-wav);
# this script then exits with that script's status.
if [ -f $data/images.scp ]; then
cmd=${cmd/--no-wav/} # remove --no-wav if supplied
image/validate_data_dir.sh $cmd
exit $?
fi
# spk2utt and utt2spk are mandatory and must be non-empty.
for f in spk2utt utt2spk; do
  if [ ! -f "$data/$f" ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
  if [ ! -s "$data/$f" ]; then
    echo "$0: empty file $f"
    exit 1;
  fi
done

# Every utt2spk line must have exactly two fields.
# A bare `exit` here would return the status of the preceding echo (0) and
# report a malformed directory as valid, so the status is given explicitly.
if ! awk '{if (NF != 2) exit(1); }' "$data/utt2spk"; then
  echo "$0: $data/utt2spk has wrong format."
  exit 1
fi

# Warn (but do not fail) when spk2utt lists a single speaker.
ns=$(wc -l < "$data/spk2utt")
if [ "$ns" == 1 ]; then
  echo "$0: WARNING: you have only one speaker. This probably a bad idea."
  echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
  echo " for more information."
fi
# Scratch directory for the id lists compared below. Abort if it cannot be
# created: with an empty $tmpdir the later redirections would target "/utts"
# and similar paths in the filesystem root.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || {
  echo "$0: failed to create a temporary directory" >&2
  exit 1
}
# Remove the scratch directory on exit and on the listed signals.
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
# Byte-wise collation, so the sort/cmp based ordering checks below do not
# depend on the caller's locale.
export LC_ALL=C
function check_sorted_and_uniq {
! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \
echo "$0: file $1 is not in sorted order or has duplicates" && exit 1;
}
#######################################
# Print an abbreviated unified diff of two files followed by their line
# counts.
# Arguments: $1, $2 - paths of the files to compare
# Outputs:   the head and tail of `diff -U1`, separated by "...", then a
#            "[Lengths are ...]" summary line, all on stdout
#######################################
partial_diff() {
  local n1 n2
  diff -U1 "$1" "$2" | (head -n 6; echo "..."; tail -n 6)
  n1=$(wc -l < "$1")
  n2=$(wc -l < "$2")
  echo "[Lengths are $1=$n1 versus $2=$n2]"
}
# utt2spk must be sorted and unique on utterance-id.
check_sorted_and_uniq $data/utt2spk
# Unless --no-spk-sort was given, re-sorting utt2spk on the speaker column must
# leave the file unchanged.
if ! $no_spk_sort; then
! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \
echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \
echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
fi
check_sorted_and_uniq $data/spk2utt
# spk2utt, converted back with tools/spk2utt_to_utt2spk.pl, must reproduce the
# first two columns of utt2spk exactly.
! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \
<(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \
echo "$0: spk2utt and utt2spk do not seem to match" && exit 1;
# Reference utterance-id list taken from utt2spk; the checks below compare the
# other per-utterance files against it.
cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts
# text is required unless --no-text was given.
if [ ! -f $data/text ] && ! $no_text; then
echo "$0: no such file $data/text (if this is by design, specify --no-text)"
exit 1;
fi
num_utts=`cat $tmpdir/utts | wc -l`
# When text exists it is always checked, whether or not it was required.
if [ -f $data/text ]; then
tools/validate_text.pl $data/text || exit 1;
check_sorted_and_uniq $data/text
text_len=`cat $data/text | wc -l`
# These symbols must not appear as whole words anywhere in text (grep -w).
illegal_sym_list="<s> </s> #0"
for x in $illegal_sym_list; do
if grep -w "$x" $data/text > /dev/null; then
echo "$0: Error: in $data, text contains illegal symbol $x"
exit 1;
fi
done
# The utterance ids in text must match those of utt2spk exactly, in order.
awk '{print $1}' < $data/text > $tmpdir/utts.txt
if ! cmp -s $tmpdir/utts{,.txt}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.txt}
exit 1;
fi
fi
# A segments file refers to recordings listed in wav.scp, so it cannot be
# validated without wav.scp.
if [ -f "$data/segments" ] && [ ! -f "$data/wav.scp" ]; then
  echo "$0: in directory $data, segments file exists but no wav.scp"
  exit 1;
fi
# wav.scp is required unless --no-wav was given.
if [ ! -f "$data/wav.scp" ] && ! $no_wav; then
  echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
  exit 1;
fi

if [ -f "$data/wav.scp" ]; then
  check_sorted_and_uniq "$data/wav.scp"

  if grep -E -q '^\S+\s+~' "$data/wav.scp"; then
    # note: it's not a good idea to have any kind of tilde in wav.scp, even if
    # part of a command, as it would cause compatibility problems if run by
    # other users, but this used to be not checked for so we let it slide unless
    # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which
    # would definitely cause problems as the fopen system call does not do
    # tilde expansion.
    echo "$0: Please do not use tilde (~) in your wav.scp."
    exit 1;
  fi

  if [ -f "$data/segments" ]; then
    check_sorted_and_uniq "$data/segments"

    # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
    # Each segments line needs four fields with the fourth greater than the third.
    ! cat "$data/segments" | \
      awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \
      echo "$0: badly formatted segments file" && exit 1;

    segments_len=$(cat "$data/segments" | wc -l)
    # NOTE(review): the utt2spk-vs-segments comparison only runs when text
    # exists; confirm whether that condition is intentional.
    if [ -f "$data/text" ]; then
      ! cmp -s "$tmpdir/utts" <(awk '{print $1}' <"$data/segments") && \
        echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \
        echo "$0: Lengths are $segments_len vs $num_utts" && \
        exit 1
    fi

    # The set of recording-ids used by segments must equal the wav.scp keys.
    cat "$data/segments" | awk '{print $2}' | sort | uniq > "$tmpdir/recordings"
    awk '{print $1}' "$data/wav.scp" > "$tmpdir/recordings.wav"
    if ! cmp -s "$tmpdir"/recordings{,.wav}; then
      echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
      echo "$0: differ, partial diff is:"
      partial_diff "$tmpdir"/recordings{,.wav}
      exit 1;
    fi

    if [ -f "$data/reco2file_and_channel" ]; then
      # this file is needed only for ctm scoring; it's indexed by recording-id.
      check_sorted_and_uniq "$data/reco2file_and_channel"
      # Three fields per line with channel A or B; channel "1" only triggers
      # a printed warning.
      ! cat "$data/reco2file_and_channel" | \
        awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
                if ( NF == 3 && $3 == "1" ) {
                  warning_issued = 1;
                } else {
                  print "Bad line ", $0; exit 1;
                }
              }
            }
            END {
              if (warning_issued == 1) {
                print "The channel should be marked as A or B, not 1! You should change it ASAP! "
              }
            }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
      cat "$data/reco2file_and_channel" | awk '{print $1}' > "$tmpdir/recordings.r2fc"
      if ! cmp -s "$tmpdir"/recordings{,.r2fc}; then
        echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel"
        echo "$0: differ, partial diff is:"
        partial_diff "$tmpdir"/recordings{,.r2fc}
        exit 1;
      fi
    fi
  else
    # No segments file -> assume wav.scp indexed by utterance.
    cat "$data/wav.scp" | awk '{print $1}' > "$tmpdir/utts.wav"
    if ! cmp -s "$tmpdir"/utts{,.wav}; then
      echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
      echo "$0: differ, partial diff is:"
      partial_diff "$tmpdir"/utts{,.wav}
      exit 1;
    fi

    if [ -f "$data/reco2file_and_channel" ]; then
      # this file is needed only for ctm scoring; it's indexed by recording-id.
      check_sorted_and_uniq "$data/reco2file_and_channel"
      ! cat "$data/reco2file_and_channel" | \
        awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
                if ( NF == 3 && $3 == "1" ) {
                  warning_issued = 1;
                } else {
                  print "Bad line ", $0; exit 1;
                }
              }
            }
            END {
              if (warning_issued == 1) {
                print "The channel should be marked as A or B, not 1! You should change it ASAP! "
              }
            }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
      cat "$data/reco2file_and_channel" | awk '{print $1}' > "$tmpdir/utts.r2fc"
      if ! cmp -s "$tmpdir"/utts{,.r2fc}; then
        # There is no segments file in this branch: the reference list comes
        # from utt2spk, so the message names utt2spk.
        echo "$0: Error: in $data, utterance-ids extracted from utt2spk and reco2file_and_channel"
        echo "$0: differ, partial diff is:"
        partial_diff "$tmpdir"/utts{,.r2fc}
        exit 1;
      fi
    fi
  fi
fi
# feats.scp is required unless --no-feats was given.
if [ ! -f $data/feats.scp ] && ! $no_feats; then
echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
exit 1;
fi
# When present, feats.scp must be sorted and list exactly the utt2spk
# utterance ids.
if [ -f $data/feats.scp ]; then
check_sorted_and_uniq $data/feats.scp
cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats
if ! cmp -s $tmpdir/utts{,.feats}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.feats}
exit 1;
fi
fi
# Optional per-speaker files. Each one, when present, must be sorted and list
# exactly the speakers found in spk2utt.
if [ -f $data/cmvn.scp ]; then
check_sorted_and_uniq $data/cmvn.scp
cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.cmvn}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.cmvn}
exit 1;
fi
fi
# spk2gender: two fields per line, the second being "m" or "f".
if [ -f $data/spk2gender ]; then
check_sorted_and_uniq $data/spk2gender
! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \
echo "$0: Mal-formed spk2gender file" && exit 1;
cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.spk2gender}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.spk2gender}
exit 1;
fi
fi
# spk2warp: two fields per line, the second strictly between 0.5 and 1.5.
if [ -f $data/spk2warp ]; then
check_sorted_and_uniq $data/spk2warp
! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
echo "$0: Mal-formed spk2warp file" && exit 1;
cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.spk2warp}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.spk2warp}
exit 1;
fi
fi
# utt2warp: two fields per line, the second strictly between 0.5 and 1.5, and
# the same utterance ids as utt2spk.
if [ -f $data/utt2warp ]; then
check_sorted_and_uniq $data/utt2warp
! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
echo "$0: Mal-formed utt2warp file" && exit 1;
cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp
cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
if ! cmp -s $tmpdir/utts{,.utt2warp}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2warp}
exit 1;
fi
fi
# check some optionally-required things
# Only sortedness and the utterance-id column are checked for these files; the
# remaining columns are not inspected here.
for f in vad.scp utt2lang utt2uniq; do
if [ -f $data/$f ]; then
check_sorted_and_uniq $data/$f
if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
<( awk '{print $1}' $data/$f ); then
echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
exit 1;
fi
fi
done
# utt2dur: same utterance ids as utt2spk; every line has two fields with a
# value greater than zero.
if [ -f $data/utt2dur ]; then
check_sorted_and_uniq $data/utt2dur
cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur
if ! cmp -s $tmpdir/utts{,.utt2dur}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2dur}
exit 1;
fi
cat $data/utt2dur | \
awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1
fi
# utt2num_frames: same utterance ids as utt2spk; the value must additionally
# be a whole number greater than zero.
if [ -f $data/utt2num_frames ]; then
check_sorted_and_uniq $data/utt2num_frames
cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames
if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2num_frames}
exit 1
fi
awk <$data/utt2num_frames '{
if (NF != 2 || !($2 > 0) || $2 != int($2)) {
print "Bad line utt2num_frames:" NR ":" $0
exit 1 } }' || exit 1
fi
# reco2dur is keyed by recording-id. $tmpdir/recordings only exists when a
# segments file was processed earlier; without it the keys are compared with
# the utterance-id list instead.
if [ -f $data/reco2dur ]; then
check_sorted_and_uniq $data/reco2dur
cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur
if [ -f $tmpdir/recordings ]; then
if ! cmp -s $tmpdir/recordings{,.reco2dur}; then
echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.reco2dur}
exit 1;
fi
else
if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then
echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/{utts,recordings.reco2dur}
exit 1;
fi
fi
# Every line needs two fields with a duration greater than zero.
cat $data/reco2dur | \
awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1
fi
# Reaching this line means no check above exited with an error.
echo "$0: Successfully validated data-directory $data"
#!/usr/bin/env perl
# Apache 2.0.
# Copyright 2012 Guoguo Chen
# 2015 Daniel Povey
# 2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
#
# Validation script for 'dict' directories (e.g. data/local/dict)
# Reads all remaining lines from an already opened file handle (the first
# argument) and tests each line for being valid UTF-8.
#
# Returns a list whose first element is a flag:
#   (1, @decoded_lines)  if every line decoded as UTF-8;
#   (0, @raw_lines)      otherwise, i.e. the undecoded byte strings, on the
#                        assumption that the file uses some 1-byte encoding
#                        such as ISO-8859-x or Windows CP-x.
# We do not really care about the actual encoding; we only need the length of
# the (decoded) strings to be right so that output formatting looks correct.
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # The implicit defined() test of this loop form keeps a final unterminated
  # line consisting of "0", which a plain truth test would silently drop.
  while (my $raw_text = <$file>) {
    if ($is_utf_compatible) {
      # LEAVE_SRC stops decode() from modifying $raw_text in place, so the
      # copy stored in @raw_lines below stays intact.
      my $decoded_text = eval {
        decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC)
      };
      # Once one line fails to decode, later lines are not decoded any more.
      $is_utf_compatible = defined($decoded_text);
      push @unicode_lines, $decoded_text;
    }
    push @raw_lines, $raw_text;
  }

  if (!$is_utf_compatible) {
    return (0, @raw_lines);
  } else {
    return (1, @unicode_lines);
  }
}
# Checks decoded lines (passed as an array reference) for line-ending and
# whitespace problems. Returns 1 at the first line that does not end in LF,
# contains a CR, or contains any whitespace other than TAB, LF and SPACE;
# returns 0 when every line is clean. The caller's array is not modified.
sub validate_utf8_whitespaces {
  my ($lines_ref) = @_;
  use feature 'unicode_strings';
  my $line_count = scalar @{$lines_ref};
  foreach my $i (0 .. $line_count - 1) {
    # Work on a copy: the text is rewritten below.
    my $text = $lines_ref->[$i];
    unless (substr($text, -1) eq "\n") {
      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
      return 1;
    }
    # The first whitespace-separated field identifies the line in messages.
    my ($utt_id) = split(" ", $text);
    if (index($text, "\x{000d}") >= 0) {
      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
      return 1;
    }
    # Turn the permitted whitespace (TAB, LF, SPACE) into dots so that any
    # \s match that remains is a disallowed whitespace character.
    $text =~ tr/\x{0009}\x{000a}\x{0020}/./;
    if ($text =~ /\s/) {
      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
      return 1;
    }
  }
  return 0;
}
# Checks whether the text readable from the given file handle is UTF-8
# compatible and, if so, whether it contains only allowed whitespace. Text
# that is not UTF-8 compatible is not checked. The handle is rewound to the
# position it had on entry before returning.
# Returns 0 if disallowed whitespace was found, 1 otherwise.
sub check_allowed_whitespace {
  # Import the constant explicitly: this script does not load Fcntl elsewhere,
  # and an unimported SEEK_SET is only a bareword string that happens to
  # numify to 0.
  use Fcntl qw(SEEK_SET);
  my $file = shift;
  my $pos = tell($file);
  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);
  seek($file, $pos, SEEK_SET);
  if ($is_utf) {
    my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
    print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
    if ($has_invalid_whitespaces) {
      print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
      return 0;
    } else {
      print "--> text contains only allowed whitespaces\n";
    }
  } else {
    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
  }
  return 1;
}
# Exactly one argument is expected: the dictionary directory.
if(@ARGV != 1) {
die "Usage: validate_dict_dir.pl <dict-dir>\n" .
"e.g.: validate_dict_dir.pl data/local/dict\n";
}
$dict = shift @ARGV;
# Drop one trailing slash so that "$dict/<file>" paths have a single separator.
$dict =~ s:/$::;
# $exit is the overall result of the run (1 once any check failed).
$exit = 0;
$success = 1; # this is re-set each time we read a file.
# Record a failure in both the overall and the per-file flag.
sub set_to_fail { $exit = 1; $success = 0; }
# Checking silence_phones.txt -------------------------------
# Fills %silence with every phone listed in the file; the later sections look
# phones up in it.
print "Checking $dict/silence_phones.txt ...\n";
if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
# $idx is the current line number used in messages; $crlf makes sure the
# carriage-return error is printed only once per file.
$idx = 1;
%silence = ();
$crlf = 1;
print "--> reading $dict/silence_phones.txt\n";
check_allowed_whitespace(\*S) || set_to_fail();
while(<S>) {
# The substitution strips the newline; it fails only when the line has none.
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
set_to_fail();
}
if ($crlf == 1 && m/\r/) {
print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
set_to_fail();
$crlf = 0;
}
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
}
# Each phone may be listed only once in the whole file.
foreach(0 .. @col-1) {
my $p = $col[$_];
if($silence{$p}) {
set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
} else {
$silence{$p} = 1;
}
# phones that start with the pound sign/hash may be mistaken for
# disambiguation symbols; phones ending in _B, _E, _S or _I will cause
# problems with word-position-dependent systems, and <eps> is obviously
# confusable with epsilon.
if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form\n";
}
}
$idx ++;
}
close(S);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
print "\n";
# Checking optional_silence.txt -------------------------------
# The file must consist of a single line holding a single phone, and that
# phone must already be listed in silence_phones.txt.
print "Checking $dict/optional_silence.txt ...\n";
if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;}
if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;}
$idx = 1;
$success = 1;
$crlf = 1;
print "--> reading $dict/optional_silence.txt\n";
# Unlike the other phone files, a whitespace problem here ends the run at once.
check_allowed_whitespace(\*OS) or exit 1;
while(<OS>) {
chomp;
my @col = split(" ", $_);
# A second line, or more than one phone on a line, is an error.
if ($idx > 1 or @col > 1) {
set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
} elsif (!$silence{$col[0]}) {
set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
}
if ($crlf == 1 && m/\r/) {
print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
set_to_fail();
$crlf = 0;
}
$idx ++;
}
close(OS);
$success == 0 || print "--> $dict/optional_silence.txt is OK\n";
print "\n";
# Checking nonsilence_phones.txt -------------------------------
# Fills %nonsilence with every phone listed in the file and applies the same
# per-phone checks as for silence_phones.txt.
print "Checking $dict/nonsilence_phones.txt ...\n";
if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;}
$idx = 1;
%nonsilence = ();
$success = 1;
$crlf = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
check_allowed_whitespace(\*NS) or set_to_fail();
while(<NS>) {
# Report carriage returns only once per file.
if ($crlf == 1 && m/\r/) {
print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
set_to_fail();
$crlf = 0;
}
# The substitution strips the newline; it fails only when the line has none.
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
}
# Each phone may be listed only once in the whole file.
foreach(0 .. @col-1) {
my $p = $col[$_];
if($nonsilence{$p}) {
set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
} else {
$nonsilence{$p} = 1;
}
# phones that start with the pound sign/hash may be mistaken for
# disambiguation symbols; phones ending in _B, _E, _S or _I will cause
# problems with word-position-dependent systems, and <eps> is obviously
# confusable with epsilon.
if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form\n";
}
}
$idx ++;
}
close(NS);
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";
# Checking disjoint -------------------------------
# Returns the keys present in both hashes. Arguments are two hash references;
# the result is a list (in the key order of the first hash), empty when the
# hashes share no key. Works on lexical variables only, so calling it does not
# overwrite any package-level variable.
sub intersect {
  my ($left, $right) = @_;
  # Hash keys are unique already, so no extra de-duplication is needed.
  my @common = grep { exists $right->{$_} } keys %$left;
  return @common;
}
# A phone must not be listed in both silence_phones.txt and
# nonsilence_phones.txt; any phone found in both is reported.
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if(@itset == 0) {print "--> disjoint property is OK.\n";}
else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
print "\n";
# Validates one lexicon file.
#   $lex               path of the lexicon
#   $num_prob_cols     number of probability columns that follow the word;
#                      each must satisfy 0 < p <= 1
#   $num_skipped_cols  number of further columns to skip before the phones
# Checks: carriage returns, repeated lines, a missing final newline, empty
# lines, forbidden words, probability ranges, empty pronunciations and phones
# that are in neither %silence nor %nonsilence. Failures are recorded through
# set_to_fail(); the per-file state ($idx, $success, $crlf) is reset first.
sub check_lexicon {
  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
  print "Checking $lex\n";
  $idx = 1; $success = 1; $crlf = 1;
  # Written as a block on purpose: `print "msg" && set_to_fail()` would print
  # the value of the && expression (0) instead of the message.
  if (!open(L, "<$lex")) {
    print "--> ERROR: fail to open $lex\n";
    set_to_fail();
    print "\n";
    return;
  }
  my %seen_line = ();
  print "--> reading $lex\n";
  check_allowed_whitespace(\*L) or set_to_fail();
  while (<L>) {
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    if (defined $seen_line{$_}) {
      print "--> ERROR: line '$_' of $lex is repeated\n";
      set_to_fail();
    }
    $seen_line{$_} = 1;
    # The substitution strips the newline; it fails only when there is none.
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
      set_to_fail();
    }
    my @col = split(" ", $_);
    my $word = shift @col;
    if (!defined $word) {
      # Nothing else can be checked on an empty line.
      print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
      $idx ++;
      next;
    }
    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
      set_to_fail();
    }
    for (my $n = 0; $n < $num_prob_cols; $n++) {
      my $prob = shift @col;
      if (!($prob > 0.0 && $prob <= 1.0)) {
        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
        set_to_fail();
      }
    }
    for (my $n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
    if (@col == 0) {
      print "--> ERROR: lexicon.txt contains word $word with empty ";
      print "pronunciation.\n";
      set_to_fail();
    }
    # Every remaining column is a phone and must be a known phone.
    foreach my $phone (@col) {
      if (!$silence{$phone} and !$nonsilence{$phone}) {
        print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt ";
        print "(line $idx)\n";
        set_to_fail();
      }
    }
    $idx ++;
  }
  close(L);
  $success == 0 || print "--> $lex is OK\n";
  print "\n";
}
# Validate whichever lexicon variants are present. The numbers passed to
# check_lexicon are the count of probability columns and the count of further
# columns to skip.
if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
if (-f "$dict/lexiconp_silprob.txt") {
  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
  # exist.
  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
  if (-f "$dict/silprob.txt") {
    # Written as blocks on purpose: `print "msg" && set_to_fail()` (and the
    # same with die) outputs the value of the && expression, not the message.
    if (!open(SP, "<$dict/silprob.txt")) {
      print "--> ERROR: fail to open $dict/silprob.txt\n";
      set_to_fail();
    } else {
      $crlf = 1;
      while (<SP>) {
        if ($crlf == 1 && m/\r/) {
          print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
          set_to_fail();
          $crlf = 0;
        }
        chomp; my @col = split;
        if (@col != 2) {
          set_to_fail();
          die "--> ERROR: bad line \"$_\"\n";
        }
        if ($col[0] eq "<s>" || $col[0] eq "overall") {
          # These two entries are probabilities in (0, 1].
          if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
            set_to_fail();
            print "--> ERROR: bad probability in $dict/silprob.txt \"$_\"\n";
          }
        } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
          # These two entries are correction terms and must be positive.
          if ($col[1] <= 0.0) {
            set_to_fail();
            print "--> ERROR: bad correction term in $dict/silprob.txt \"$_\"\n";
          }
        } else {
          print "--> ERROR: unexpected line in $dict/silprob.txt \"$_\"\n";
          set_to_fail();
        }
      }
      close(SP);
    }
  } else {
    set_to_fail();
    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
  }
}
# At least one of the two plain lexicon forms has to be present.
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
  set_to_fail();
}
# Checks that two lexicon files describe the same entries line by line: same
# word and, after dropping each file's probability and skipped columns, the
# same pronunciation. For each file the arguments are its path, its number of
# probability columns and its number of skipped columns.
# A mismatch is recorded through set_to_fail(). The outcome of this comparison
# is tracked in a local flag so that failures of earlier, unrelated checks do
# not influence what is reported here.
sub check_lexicon_pair {
  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
  # We have checked individual lexicons already.
  print "Checking lexicon pair $lex1 and $lex2\n";
  if (!open(L1, "<$lex1")) {
    print "--> ERROR: fail to open $lex1\n";
    set_to_fail();
    return;
  }
  if (!open(L2, "<$lex2")) {
    print "--> ERROR: fail to open $lex2\n";
    set_to_fail();
    close(L1);
    return;
  }
  my $pair_ok = 1;
  my $line_num = 0;
  while (<L1>) {
    $line_num++;
    my @A = split;
    my $line_B = <L2>;
    if (!defined $line_B) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail(); $pair_ok = 0; last;
    }
    my @B = split(" ", $line_B);
    # Check if the word matches.
    if ($A[0] ne $B[0]) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); $pair_ok = 0; last;
    }
    # Drop the word, then each file's own probability/skipped columns.
    shift @A; shift @B;
    for (my $n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; }
    for (my $n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; }
    # Check if the pronunciation matches
    if (join(" ", @A) ne join(" ", @B)) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); $pair_ok = 0; last;
    }
  }
  # If everything matched so far, the second file must be exhausted as well.
  if ($pair_ok) {
    my $extra_line = <L2>;
    if (defined $extra_line) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail(); $pair_ok = 0;
    }
  }
  close(L1);
  close(L2);
  $pair_ok && print "--> lexicon pair $lex1 and $lex2 match\n\n";
}
# If more than one lexicon exist, we have to check if they correspond to each
# other. It could be that the user overwrote one and we need to regenerate the
# other, but we do not know which is which.
# The numeric arguments are, per file, the number of probability columns and
# the number of further columns to skip (the same values used when each file
# was checked individually).
if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") {
check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0);
}
if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") {
check_lexicon_pair("$dict/lexiconp.txt", 1, 0,
"$dict/lexiconp_silprob.txt", 2, 2);
}
# Checking extra_questions.txt -------------------------------
# %distinguished keeps track of all phone-pairs including nonsilence that are
# distinguished (split apart) by extra_questions.txt, as
# $distinguished{$p1,$p2} = 1. This will be used to make sure that we don't
# have pairs of phones on the same line in nonsilence_phones.txt that can
# never be distinguished from each other by questions. (If any two phones
# appear on the same line in nonsilence_phones.txt, they share a tree root,
# and since the automatic question-building treats all phones that appear on
# the same line of nonsilence_phones.txt as being in the same group, we can
# never distinguish them without resorting to questions in
# extra_questions.txt.)
%distinguished = ();
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
  $idx = 1;
  $success = 1;
  $crlf = 1;
  if (!open(EX, "<$dict/extra_questions.txt")) {
    set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
  } else {
    print "--> reading $dict/extra_questions.txt\n";
    check_allowed_whitespace(\*EX) or set_to_fail();
    while(<EX>) {
      if ($crlf == 1 && m/\r/) {
        print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
        set_to_fail();
        $crlf = 0;
      }
      # The substitution strips the newline; it fails only when there is none.
      if (! s/\n$//) {
        print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
        set_to_fail();
      }
      my @col = split(" ", $_);
      if (@col == 0) {
        set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
      }
      # Every phone in a question must be a known phone. "block" is the
      # 1-based position of the phone within the line.
      foreach (0 .. @col-1) {
        if(!$silence{$col[$_]} and !$nonsilence{$col[$_]}) {
          set_to_fail(); print "--> ERROR: phone \"$col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n";
        }
      }
      my %col_hash = ();
      foreach my $p (@col) { $col_hash{$p} = 1; }
      foreach my $p1 (@col) {
        # Update %distinguished hash.
        foreach my $p2 (keys %nonsilence) {
          if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not
                                         # in this question (and in nonsilence
                                         # phones)... mark p1,p2 as being split apart
            $distinguished{$p1,$p2} = 1;
            $distinguished{$p2,$p1} = 1;
          }
        }
      }
      # Advance once per line (not once per phone) so that the reported line
      # numbers are those of the file.
      $idx ++;
    }
    close(EX);
  }
  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
# Optional nonterminals.txt: every line must hold exactly one symbol, start
# with "#nonterm:", and not repeat an earlier symbol. The first bad line ends
# the run with status 1.
if (-f "$dict/nonterminals.txt") {
  open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
  my %nonterminals = ();
  my $line_number = 1;
  while (<NT>) {
    # chomp removes only a trailing newline; chop would also eat the last
    # real character of a final line that has no newline.
    chomp;
    my @line = split(" ", $_);
    if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
      print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1;
    }
    $nonterminals{$line[0]} = 1;
    $line_number++;
  }
  close(NT);
  print "--> $dict/nonterminals.txt is OK\n";
}
# check nonsilence_phones.txt again for phone-pairs that are never
# distinguishable. (note: this situation is normal and expected for silence
# phones, so we don't check it.)
if(!open(NS, "<$dict/nonsilence_phones.txt")) {
print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
}
# Every offending pair marks the run as failed, but the detailed message is
# printed only while the counter has not passed the limit.
$num_warn_nosplit = 0;
$num_warn_nosplit_limit = 10;
while(<NS>) {
my @col = split(" ", $_);
# Compare every ordered pair of distinct phones that share this line.
foreach $p1 (@col) {
foreach $p2 (@col) {
if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
set_to_fail();
if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
}
if ($num_warn_nosplit == $num_warn_nosplit_limit) {
print "... Not warning any more times about this issue.\n";
}
# The explanatory note accompanies only the first reported pair.
if ($num_warn_nosplit == 0) {
print " (note: we started checking for this only recently. You can still build a system but\n";
print " phones $p1 and $p2 will be acoustically indistinguishable).\n";
}
$num_warn_nosplit++;
}
}
}
}
# Final verdict: $exit is 1 if any check above called set_to_fail().
if ($exit == 1) {
print "--> ERROR validating dictionary directory $dict (see detailed error ";
print "messages above)\n\n";
exit 1;
} else {
print "--> SUCCESS [validating dictionary directory $dict]\n\n";
}
exit 0;
#!/usr/bin/env perl
#
#===============================================================================
# Copyright 2017 Johns Hopkins University (author: Yenda Trmal <jtrmal@gmail.com>)
# Johns Hopkins University (author: Daniel Povey)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
# validation script for data/<dataset>/text
# to be called (preferably) from utils/validate_data_dir.sh
use strict;
use warnings;
use utf8;
use Fcntl qw< SEEK_SET >;
# Reads the already-opened file (supplied as the first parameter) from its
# current position to EOF and returns a list ($is_utf8, @lines).
#
# Every line is tested for being valid UTF-8.  If all lines are valid,
# $is_utf8 is 1 and @lines holds the lines decoded into Perl character
# strings.  Otherwise $is_utf8 is 0 and @lines holds the lines exactly as
# read; the file is then assumed to use one of the 1-byte encodings, such
# as ISO-8859-x or Windows CP-x.
#
# We do not really care about the actual encoding; we just need the length
# of the (decoded) string to be correct so that output formatting looks right.
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # "while (my $x = <$fh>)" carries an implicit defined() test, so a final
  # line consisting of just "0" (no trailing newline) is not mistaken for EOF.
  while (my $raw_text = <$file>) {
    push @raw_lines, $raw_text;
    # Once one line failed to decode there is no point in decoding the rest.
    next unless $is_utf_compatible;

    # LEAVE_SRC is required: with a CHECK value set and LEAVE_SRC absent,
    # decode() overwrites its input in place, which would corrupt the raw
    # copy of the line.
    my $decoded_text = eval {
      decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC)
    };
    if (defined $decoded_text) {
      push @unicode_lines, $decoded_text;
    } else {
      $is_utf_compatible = 0;
    }
  }

  return $is_utf_compatible ? (1, @unicode_lines) : (0, @raw_lines);
}
# Checks whether the given lines (a reference to an array of decoded
# character strings) contain Unicode whitespace other than the allowed
# TAB, LF and SPACE.  A CR character is rejected explicitly, and every
# line must end with LF.
#
# Returns 1 as soon as an offending line is found (after printing a
# diagnostic to STDERR), 0 if all lines are fine.
sub validate_utf8_whitespaces {
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) {
    my $current_line = $unicode_lines->[$i];
    # $i is the zero-based index of the line within the array.
    if ((substr $current_line, -1) ne "\n"){
      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
      return 1;
    }
    my @A = split(" ", $current_line);
    # A line made only of whitespace (including Unicode whitespace or CR)
    # yields no fields; fall back to a positional label so the diagnostics
    # below never interpolate an undefined value.
    my $utt_id = $A[0] // "<line nr. $i>";
    if ($current_line =~ /\x{000d}/) {
      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
      return 1;
    }
    # Replace the allowed whitespace (TAB, LF, SPACE) with a placeholder so
    # that any whitespace still matched by \s afterwards is a disallowed one.
    $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
    if ($current_line =~/\s/) {
      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
      return 1;
    }
  }
  return 0;
}
# Verifies the whitespace used in an opened text file.
#   arg 1: filehandle positioned where reading should start
#   arg 2: file name, used only in the diagnostic message
# The content is read via get_utf8_or_bytestream(); when it is UTF-8
# compatible it is passed to validate_utf8_whitespaces().  Content that is
# not UTF-8 compatible is not checked at all.  The filehandle is moved back
# to its initial position before the result is evaluated.
# Returns 1 when the file is acceptable (or not UTF-8), 0 otherwise.
sub check_allowed_whitespace {
  my ($file, $filename) = @_;

  # Remember where we are, slurp the content, then rewind.
  my $start_offset = tell($file);
  my ($utf8_ok, @content) = get_utf8_or_bytestream($file);
  seek($file, $start_offset, SEEK_SET);

  # Nothing to validate for byte-stream (non UTF-8) content.
  return 1 unless $utf8_ok;
  return 1 unless validate_utf8_whitespaces(\@content);

  print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n";
  return 0;
}
# Entry point: validate_text.pl <text-file>
# Exits with status 1 if the file is missing, empty, unreadable, or fails
# the whitespace check; otherwise falls through to a normal (0) exit.
if(@ARGV != 1) {
  die "Usage: validate_text.pl <text-file>\n" .
      "e.g.: validate_text.pl data/train/text\n";
}
my $text = shift @ARGV;
# -z alone is only true for an existing zero-size file, so existence has to
# be tested separately for the message below to be accurate.
if (! -e $text || -z $text) {
  print STDERR "$0: ERROR: file '$text' is empty or does not exist\n";
  exit 1;
}
# Three-argument open with a lexical handle: the file name is taken
# literally, so leading/trailing whitespace or characters such as '|' or '>'
# in the name are not interpreted as part of the open mode.
my $text_fh;
if(!open($text_fh, '<', $text)) {
  print STDERR "$0: ERROR: failed to open $text\n";
  exit 1;
}
check_allowed_whitespace($text_fh, $text) or exit 1;
close($text_fh);
#!/usr/bin/env python3
# encoding: utf-8
"""Write the duration in seconds of every audio file listed in an scp file.

Usage: wav2dur.py <wav.scp> <dur.scp>

Each non-blank input line is "<wav_id> <path> ..."; only the first two
fields are used.  For each entry a line "<wav_id> <duration>" is written
to the output file.  A summary (number of utterances and total duration)
is printed to stdout.
"""
import sys


def parse_scp_line(line):
    """Split one scp line into ``(wav_id, path)``.

    Returns ``None`` for a blank line.  Raises ``ValueError`` when the line
    has an id but no path, instead of failing later with an opaque
    ``IndexError``.
    """
    items = line.strip().split()
    if not items:
        return None
    if len(items) < 2:
        raise ValueError(
            'malformed scp line (expected "<id> <path>"): {!r}'.format(line))
    return items[0], items[1]


def main(argv=None):
    """Run the conversion; returns the process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        print('Usage: {} <wav.scp> <dur.scp>'.format(argv[0] if argv else 'wav2dur.py'),
              file=sys.stderr)
        return 1
    scp = argv[1]
    dur_scp = argv[2]

    # Imported here so that importing this module (e.g. to reuse
    # parse_scp_line) does not require torchaudio.
    import torchaudio
    # NOTE(review): newer torchaudio releases are reported to deprecate or
    # drop set_audio_backend -- confirm against the installed version.  The
    # guard keeps the original behaviour whenever the function exists.
    if hasattr(torchaudio, 'set_audio_backend'):
        torchaudio.set_audio_backend("sox_io")

    cnt = 0
    total_duration = 0
    with open(scp, 'r') as f, open(dur_scp, 'w') as fout:
        for l in f:
            parsed = parse_scp_line(l)
            if parsed is None:
                # Blank lines carry no entry; skip them.
                continue
            wav_id, fname = parsed
            cnt += 1
            waveform, rate = torchaudio.load(fname)
            # Number of samples in the first channel.
            frames = len(waveform[0])
            duration = frames / float(rate)
            total_duration += duration
            fout.write('{} {}\n'.format(wav_id, duration))
    print('process {} utts'.format(cnt))
    print('total {} s'.format(total_duration))
    return 0


if __name__ == "__main__":
    sys.exit(main())
#!/bin/bash
# Split a wav scp into ${nj} slices, compute durations for each slice in
# parallel with tools/wav2dur.py, and merge the per-slice results.
#
# Usage: <this-script> <in-wav-scp> <out-dur-scp> [<log-dir>]
#   <log-dir> defaults to "<dirname of in-wav-scp>/log"; it receives the
#   slices, the per-slice outputs (*.shape) and the per-slice logs (*.log).
# NOTE(review): nj is presumably overridable through tools/parse_options.sh
# (not visible here) -- confirm.
nj=4
. tools/parse_options.sh || exit 1;

if [[ $# -lt 2 ]]; then
  echo "Usage: $0 <in-wav-scp> <out-dur-scp> [<log-dir>]" >&2
  exit 1
fi

inscp=$1
outscp=$2
data=$(dirname -- "${inscp}")
if [[ $# -eq 3 ]]; then
  logdir=$3
else
  logdir=${data}/log
fi
mkdir -p -- "${logdir}" || exit 1

# Remove leftovers of a previous run so that they are not merged below.
rm -f -- "${logdir}"/wav_*.slice
rm -f -- "${logdir}"/wav_*.shape

split --additional-suffix .slice -d -n "l/${nj}" -- "${inscp}" "${logdir}/wav_" \
  || exit 1

# Launch one background job per slice and remember its PID so that each
# job's exit status can be checked individually.
pids=()
for slice in "${logdir}"/wav_*.slice; do
  # An unmatched glob stays as the literal pattern; skip it.
  [[ -e "${slice}" ]] || continue
  name=$(basename -s .slice -- "${slice}")
  tools/wav2dur.py "${slice}" "${logdir}/${name}.shape" \
    1>"${logdir}/${name}.log" &
  pids+=("$!")
done

# A bare "wait" always returns 0; wait on each PID to detect failures.
failed=0
for pid in "${pids[@]}"; do
  wait "${pid}" || failed=1
done
if (( failed )); then
  echo "$0: at least one wav2dur.py job failed, see ${logdir}/wav_*.log" >&2
  exit 1
fi

cat -- "${logdir}"/wav_*.shape > "${outscp}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment