Commit a7785cc6 authored by Sugon_ldc

delete soft link

parent 9a2a05ca
../../../tools/
\ No newline at end of file
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
stage=0 # start from 0 if you need to start from data preparation
stop_stage=0
nj=16
feat_dir=raw_wav
dict=data/dict/lang_char.txt
dir=exp/
config=$dir/train.yaml
checkpoint=
checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt
config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml
set=
ali_format=$feat_dir/$set/format.data
ali_format=format.data
ali_result=$dir/ali
. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
nj=32
# Prepare required data for ctc alignment
echo "Prepare data, prepare required format"
for x in $set; do
tools/format_data.sh --nj ${nj} \
--feat-type wav --feat $feat_dir/$x/wav.scp \
$feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Run CTC alignment; specify the model to use via --checkpoint
python wenet/bin/alignment_deprecated.py --gpu -1 \
--config $config \
--input_file $ali_format \
--checkpoint $checkpoint \
--batch_size 1 \
--dict $dict \
--result_file $ali_result
fi
#!/usr/bin/env python3
# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Analyze Dataset, Duration/TextLength/Speed etc.
Usage:
. ./path.sh && python3 tools/analyze_dataset.py \
--data_type "shard" \
--data_list data/test/data.list \
--output_dir exp/analyze_test \
--num_thread 32
"""
import os
import json
import math
import time
import numpy
import logging
import librosa
import tarfile
import argparse
import torchaudio
import multiprocessing
from wenet.utils.file_utils import read_lists
from wenet.dataset.processor import AUDIO_FORMAT_SETS
def get_args():
parser = argparse.ArgumentParser(description='Analyze dataset')
parser.add_argument('--data_type',
default='wav_scp',
choices=['wav_scp', 'raw', 'shard'],
help='dataset type')
parser.add_argument('--output_dir', type=str,
default="exp", help='write info to output dir')
parser.add_argument('--data_list', default=None,
help='used in raw/shard mode')
parser.add_argument('--wav_scp', default=None,
help='used in wav_scp mode')
parser.add_argument('--text', default=None,
help='used in wav_scp mode')
parser.add_argument('--num_thread', type=int,
default=4, help='number of threads')
args = parser.parse_args()
print(args)
return args
def analyze(datas, output_file, thread_id):
with open(output_file, "w", encoding='utf8') as f:
for i, data in enumerate(datas):
if type(data['wav']) is numpy.ndarray:
y, sample_rate = data['wav'], data['sample_rate']
data['wav'] = "None" # NOTE(xcsong): Do not save wav.
elif type(data['wav']) is str:
y, sample_rate = librosa.load(data['wav'], sr=16000)
data['dur'] = len(y) / sample_rate
data['txt_length'] = len(data['txt'])
data['speed'] = data['txt_length'] / data['dur']
# Trim the beginning and ending silence
_, index = librosa.effects.trim(y, top_db=30)
data['leading_sil'] = librosa.get_duration(
y=y[:index[0]], sr=sample_rate) * 1000 if index[0] > 0 else 0
data['trailing_sil'] = librosa.get_duration(
y=y[index[1]:], sr=sample_rate) * 1000 if index[1] < len(y) else 0
data_str = json.dumps(data, ensure_ascii=False)
f.write("{}\n".format(data_str))
if thread_id == 0 and i % 100 == 0:
logging.info("\tThread-{}: processed {}/{}".format(
thread_id, i, len(datas)))
def read_tar(file):
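# A shard tar is assumed to group one utterance per key: entries named
# "<key>.txt", "<key>.wav"/"<key>.flac"/... are stored consecutively, so a
# change of prefix marks the end of one utterance and the start of the next.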
try:
with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream:
prev_prefix = None
data = {}
valid = True
for tarinfo in stream:
name = tarinfo.name
pos = name.rfind('.')
assert pos > 0
prefix, postfix = name[:pos], name[pos + 1:]
if prev_prefix is not None and prefix != prev_prefix:
data['key'] = prev_prefix
if valid:
yield data
data = {}
valid = True
with stream.extractfile(tarinfo) as file_obj:
try:
if postfix == 'txt':
data['txt'] = file_obj.read().decode(
'utf8').strip()
elif postfix in AUDIO_FORMAT_SETS:
waveform, sample_rate = torchaudio.load(
file_obj)
# single channel
data['wav'] = waveform.numpy()[0, :]
data['sample_rate'] = sample_rate
else:
data[postfix] = file_obj.read()
except Exception as ex:
valid = False
logging.warning(
'error: {} when parse {}'.format(ex, name))
prev_prefix = prefix
# The last data in tar
if prev_prefix is not None:
data['key'] = prev_prefix
yield data
except Exception as ex:
logging.warning(
'tar_file error: {} when processing {}'.format(ex, file))
def main():
start_time = time.time()
args = get_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
os.makedirs(args.output_dir, exist_ok=True)
os.makedirs(args.output_dir + "/partition", exist_ok=True)
datas = [[] for i in range(args.num_thread)]
logging.info("Stage-1: Loading data.list OR wav.scp...")
if args.data_type == "shard":
assert args.data_list is not None
lists = read_lists(args.data_list)
# partition
total = 0
for line in lists:
for data in read_tar(line):
datas[total % args.num_thread].append(data)
total = total + 1
elif args.data_type == "raw":
assert args.data_list is not None
lists = read_lists(args.data_list)
# partition
for i, line in enumerate(lists):
data = json.loads(line)
datas[i % args.num_thread].append(data)
elif args.data_type == "wav_scp":
assert args.wav_scp is not None
assert args.text is not None
wavs, texts = {}, {}
# wavs
for line in read_lists(args.wav_scp):
line = line.strip().split()
wavs[line[0]] = line[1]
# texts
for line in read_lists(args.text):
line = line.strip().split(maxsplit=1)
texts[line[0]] = line[1]
wavs = dict(sorted(wavs.items()))
texts = dict(sorted(texts.items()))
# partition
for i, (key1, key2) in enumerate(zip(wavs, texts)):
assert key1 == key2
datas[i % args.num_thread].append(
{'key': key1, "wav": wavs[key1], "txt": texts[key1]}
)
logging.info("Stage-2: Start Analyze")
# threads
pool = multiprocessing.Pool(processes=args.num_thread)
for i in range(args.num_thread):
output_file = os.path.join(
args.output_dir, "partition", "part-{}".format(i))
pool.apply_async(analyze, (datas[i], output_file, i))
pool.close()
pool.join()
logging.info("Stage-3: Sort and Write Result")
datas = []
for i in range(args.num_thread):
output_file = os.path.join(
args.output_dir, "partition", "part-{}".format(i))
with open(output_file, "r", encoding='utf8') as f:
for line in f.readlines():
data = json.loads(line)
datas.append(data)
total_dur = sum([x['dur'] for x in datas])
total_len = sum([x['txt_length'] for x in datas])
total_leading_sil = sum([x['leading_sil'] for x in datas])
total_trailing_sil = sum([x['trailing_sil'] for x in datas])
num_datas = len(datas)
names = ['key', 'dur', 'txt_length', 'speed',
'leading_sil', 'trailing_sil']
units = ['', 's', '', 'char/s', 'ms', 'ms']
avgs = [0, total_dur / num_datas, total_len / num_datas,
total_len / total_dur, total_leading_sil / num_datas,
total_trailing_sil / num_datas]
stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]),
sum([(x['txt_length'] - avgs[2])**2 for x in datas]),
sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]),
sum([(x['leading_sil'] - avgs[4])**2 for x in datas]),
sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])]
stds = [math.sqrt(x / num_datas) for x in stds]
parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min']
index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75),
int(num_datas * 0.50), int(num_datas * 0.25), 0]
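# NOTE: `index` holds the positions of max/P99/P75/P50/P25/min in the list
# after it is re-sorted by each metric below, so percentiles are read off
# by direct indexing.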
with open(args.output_dir + "/analyze_result_brief",
"w", encoding='utf8') as f:
for i, (name, unit, avg, std) in enumerate(
zip(names, units, avgs, stds)):
if name == 'key':
continue
f.write("==================\n")
datas.sort(key=lambda x: x[name])
for p, j in zip(parts, index):
f.write("{} {}: {:.3f} {} (wav_id: {})\n".format(
p, name, datas[j][name], unit, datas[j]['key']))
f.write("avg {}: {:.3f} {}\n".format(
name, avg, unit))
f.write("std {}: {:.3f}\n".format(
name, std))
os.system("cat {}".format(args.output_dir + "/analyze_result_brief"))
datas.sort(key=lambda x: x['dur'])
with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f:
for data in datas:
f.write("{}\n".format(json.dumps(data, ensure_ascii=False)))
end_time = time.time()
logging.info("Time Cost: {:.3f}s".format(end_time - start_time))
if __name__ == '__main__':
main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import sys
import json
def kaldi2json(kaldi_cmvn_file):
means = []
variance = []
with open(kaldi_cmvn_file, 'r') as fid:
# kaldi binary file start with '\0B'
if fid.read(2) == '\0B':
logging.error('kaldi cmvn binary file is not supported, please '
'recompute it by: compute-cmvn-stats --binary=false '
' scp:feats.scp global_cmvn')
sys.exit(1)
fid.seek(0)
arr = fid.read().split()
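# Assumed Kaldi text-format CMVN stats: a 2 x (feat_dim + 1) matrix written as
#   [ sum_1 ... sum_D frame_count
#     sumsq_1 ... sumsq_D 0 ]
# so the flattened token list is '[', D sums, count, D squared sums, '0', ']'.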
assert (arr[0] == '[')
assert (arr[-2] == '0')
assert (arr[-1] == ']')
feat_dim = int((len(arr) - 2 - 2) / 2)
for i in range(1, feat_dim + 1):
means.append(float(arr[i]))
count = float(arr[feat_dim + 1])
for i in range(feat_dim + 2, 2 * feat_dim + 2):
variance.append(float(arr[i]))
cmvn_info = {'mean_stat' : means,
'var_stat' : variance,
'frame_num' : count}
return cmvn_info
if __name__ == '__main__':
with open(sys.argv[2], 'w') as fout:
cmvn = kaldi2json(sys.argv[1])
fout.write(json.dumps(cmvn))
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# 2014 David Snyder
# This script combines the data from multiple source directories into
# a single destination directory.
# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
# about what these directories contain.
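# Example invocation (hypothetical directory names):
#   tools/combine_data.sh data/train_combined data/train_set1 data/train_set2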
# Begin configuration section.
extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
skip_fix=false # skip the fix_data_dir.sh in the end
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi
if [ $# -lt 2 ]; then
echo "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..."
echo "Note, files that don't appear in all source dirs will not be combined,"
echo "with the exception of utt2uniq and segments, which are created where necessary."
exit 1
fi
dest=$1;
shift;
first_src=$1;
rm -r $dest 2>/dev/null
mkdir -p $dest;
export LC_ALL=C
for dir in $*; do
if [ ! -f $dir/utt2spk ]; then
echo "$0: no such file $dir/utt2spk"
exit 1;
fi
done
# Check that frame_shift are compatible, where present together with features.
dir_with_frame_shift=
for dir in $*; do
if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then
if [[ $dir_with_frame_shift ]] &&
! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then
echo "$0: error: different frame_shift in directories $dir and " \
"$dir_with_frame_shift. Cannot combine features."
exit 1;
fi
dir_with_frame_shift=$dir
fi
done
# W.r.t. the utt2uniq file the script behaves differently from other files:
# it is not compulsory for it to exist in the src directories, but if it exists
# in even one it should exist in all. We will create the files where necessary.
has_utt2uniq=false
for in_dir in $*; do
if [ -f $in_dir/utt2uniq ]; then
has_utt2uniq=true
break
fi
done
if $has_utt2uniq; then
# we are going to create an utt2uniq file in the destdir
for in_dir in $*; do
if [ ! -f $in_dir/utt2uniq ]; then
# we assume that utt2uniq is a one to one mapping
cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}'
else
cat $in_dir/utt2uniq
fi
done | sort -k1 > $dest/utt2uniq
echo "$0: combined utt2uniq"
else
echo "$0 [info]: not combining utt2uniq as it does not exist"
fi
# some of the old scripts might provide utt2uniq as an extra file, so just remove it
extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g")
# segments are treated similarly to utt2uniq. If it exists in some, but not all
# src directories, then we generate segments where necessary.
has_segments=false
for in_dir in $*; do
if [ -f $in_dir/segments ]; then
has_segments=true
break
fi
done
if $has_segments; then
for in_dir in $*; do
if [ ! -f $in_dir/segments ]; then
echo "$0 [info]: will generate missing segments for $in_dir" 1>&2
utils/data/get_segments_for_data.sh $in_dir
else
cat $in_dir/segments
fi
done | sort -k1 > $dest/segments
echo "$0: combined segments"
else
echo "$0 [info]: not combining segments as it does not exist"
fi
for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
exists_somewhere=false
absent_somewhere=false
for d in $*; do
if [ -f $d/$file ]; then
exists_somewhere=true
else
absent_somewhere=true
fi
done
if ! $absent_somewhere; then
set -o pipefail
( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1;
set +o pipefail
echo "$0: combined $file"
else
if ! $exists_somewhere; then
echo "$0 [info]: not combining $file as it does not exist"
else
echo "$0 [info]: **not combining $file as it does not exist everywhere**"
fi
fi
done
tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt
if [[ $dir_with_frame_shift ]]; then
cp $dir_with_frame_shift/frame_shift $dest
fi
if ! $skip_fix ; then
tools/fix_data_dir.sh $dest || exit 1;
fi
exit 0
This diff is collapsed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re, sys, unicodedata
import codecs
remove_tag = True
spacelist= [' ', '\t', '\r', '\n']
puncts = ['!', ',', '?',
'、', '。', '!', ',', ';', '?',
':', '「', '」', '︰', '『', '』', '《', '》']
def characterize(string) :
res = []
i = 0
while i < len(string):
char = string[i]
if char in puncts:
i += 1
continue
cat1 = unicodedata.category(char)
#https://unicodebook.readthedocs.io/unicode.html#unicode-categories
if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
i += 1
continue
if cat1 == 'Lo': # letter-other
res.append(char)
i += 1
else:
# some input looks like: <unk><noise>, we want to separate it into two words.
sep = ' '
if char == '<': sep = '>'
j = i+1
while j < len(string):
c = string[j]
if ord(c) >= 128 or (c in spacelist) or (c==sep):
break
j += 1
if j < len(string) and string[j] == '>':
j += 1
res.append(string[i:j])
i = j
return res
def stripoff_tags(x):
if not x: return ''
chars = []
i = 0; T=len(x)
while i < T:
if x[i] == '<':
while i < T and x[i] != '>':
i += 1
i += 1
else:
chars.append(x[i])
i += 1
return ''.join(chars)
def normalize(sentence, ignore_words, cs, split=None):
""" sentence, ignore_words are both in unicode
"""
new_sentence = []
for token in sentence:
x = token
if not cs:
x = x.upper()
if x in ignore_words:
continue
if remove_tag:
x = stripoff_tags(x)
if not x:
continue
if split and x in split:
new_sentence += split[x]
else:
new_sentence.append(x)
return new_sentence
class Calculator :
def __init__(self) :
self.data = {}
self.space = []
self.cost = {}
self.cost['cor'] = 0
self.cost['sub'] = 1
self.cost['del'] = 1
self.cost['ins'] = 1
def calculate(self, lab, rec) :
# Initialization
lab.insert(0, '')
rec.insert(0, '')
while len(self.space) < len(lab) :
self.space.append([])
for row in self.space :
for element in row :
element['dist'] = 0
element['error'] = 'non'
while len(row) < len(rec) :
row.append({'dist' : 0, 'error' : 'non'})
for i in range(len(lab)) :
self.space[i][0]['dist'] = i
self.space[i][0]['error'] = 'del'
for j in range(len(rec)) :
self.space[0][j]['dist'] = j
self.space[0][j]['error'] = 'ins'
self.space[0][0]['error'] = 'non'
for token in lab :
if token not in self.data and len(token) > 0 :
self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0}
for token in rec :
if token not in self.data and len(token) > 0 :
self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0}
# Computing edit distance
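# DP recurrence: dist[i][j] = min(dist[i-1][j] + del_cost,
#                                 dist[i][j-1] + ins_cost,
#                                 dist[i-1][j-1] + (cor_cost if lab[i] == rec[j] else sub_cost))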
for i, lab_token in enumerate(lab) :
for j, rec_token in enumerate(rec) :
if i == 0 or j == 0 :
continue
min_dist = sys.maxsize
min_error = 'none'
dist = self.space[i-1][j]['dist'] + self.cost['del']
error = 'del'
if dist < min_dist :
min_dist = dist
min_error = error
dist = self.space[i][j-1]['dist'] + self.cost['ins']
error = 'ins'
if dist < min_dist :
min_dist = dist
min_error = error
if lab_token == rec_token :
dist = self.space[i-1][j-1]['dist'] + self.cost['cor']
error = 'cor'
else :
dist = self.space[i-1][j-1]['dist'] + self.cost['sub']
error = 'sub'
if dist < min_dist :
min_dist = dist
min_error = error
self.space[i][j]['dist'] = min_dist
self.space[i][j]['error'] = min_error
# Tracing back
result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
i = len(lab) - 1
j = len(rec) - 1
while True :
if self.space[i][j]['error'] == 'cor' : # correct
if len(lab[i]) > 0 :
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
result['all'] = result['all'] + 1
result['cor'] = result['cor'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'sub' : # substitution
if len(lab[i]) > 0 :
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
result['all'] = result['all'] + 1
result['sub'] = result['sub'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'del' : # deletion
if len(lab[i]) > 0 :
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
result['all'] = result['all'] + 1
result['del'] = result['del'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, "")
i = i - 1
elif self.space[i][j]['error'] == 'ins' : # insertion
if len(rec[j]) > 0 :
self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
result['ins'] = result['ins'] + 1
result['lab'].insert(0, "")
result['rec'].insert(0, rec[j])
j = j - 1
elif self.space[i][j]['error'] == 'non' : # starting point
break
else : # shouldn't reach here
print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error']))
return result
def overall(self) :
result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
for token in self.data :
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def cluster(self, data) :
result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
for token in data :
if token in self.data :
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def keys(self) :
return list(self.data.keys())
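# Display width of a token: East-Asian wide/fullwidth/ambiguous characters
# (categories A/F/W) count as two columns so the lab/rec alignment lines up.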
def width(string):
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
def default_cluster(word) :
unicode_names = [ unicodedata.name(char) for char in word ]
for i in reversed(range(len(unicode_names))) :
if unicode_names[i].startswith('DIGIT') : # 1
unicode_names[i] = 'Number' # 'DIGIT'
elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) :
# 明 / 郎
unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
unicode_names[i].startswith('LATIN SMALL LETTER')) :
# A / a
unicode_names[i] = 'English' # 'LATIN LETTER'
elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め
unicode_names[i] = 'Japanese' # 'GANA LETTER'
elif (unicode_names[i].startswith('AMPERSAND') or
unicode_names[i].startswith('APOSTROPHE') or
unicode_names[i].startswith('COMMERCIAL AT') or
unicode_names[i].startswith('DEGREE CELSIUS') or
unicode_names[i].startswith('EQUALS SIGN') or
unicode_names[i].startswith('FULL STOP') or
unicode_names[i].startswith('HYPHEN-MINUS') or
unicode_names[i].startswith('LOW LINE') or
unicode_names[i].startswith('NUMBER SIGN') or
unicode_names[i].startswith('PLUS SIGN') or
unicode_names[i].startswith('SEMICOLON')) :
# & / ' / @ / ℃ / = / . / - / _ / # / + / ;
del unicode_names[i]
else :
return 'Other'
if len(unicode_names) == 0 :
return 'Other'
if len(unicode_names) == 1 :
return unicode_names[0]
for i in range(len(unicode_names)-1) :
if unicode_names[i] != unicode_names[i+1] :
return 'Other'
return unicode_names[0]
def usage() :
print("compute-wer.py : compute word error rate (WER) and align recognition results and references.")
print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer")
if __name__ == '__main__':
if len(sys.argv) == 1 :
usage()
sys.exit(0)
calculator = Calculator()
cluster_file = ''
ignore_words = set()
tochar = False
verbose= 1
padding_symbol= ' '
case_sensitive = False
max_words_per_line = sys.maxsize
split = None
while len(sys.argv) > 3:
a = '--maxw='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):]
del sys.argv[1]
max_words_per_line = int(b)
continue
a = '--rt='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
remove_tag = (b == 'true') or (b != '0')
continue
a = '--cs='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
case_sensitive = (b == 'true') or (b != '0')
continue
a = '--cluster='
if sys.argv[1].startswith(a):
cluster_file = sys.argv[1][len(a):]
del sys.argv[1]
continue
a = '--splitfile='
if sys.argv[1].startswith(a):
split_file = sys.argv[1][len(a):]
del sys.argv[1]
split = dict()
with codecs.open(split_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
words = line.strip().split()
if len(words) >= 2:
split[words[0]] = words[1:]
continue
a = '--ig='
if sys.argv[1].startswith(a):
ignore_file = sys.argv[1][len(a):]
del sys.argv[1]
with codecs.open(ignore_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
line = line.strip()
if len(line) > 0:
ignore_words.add(line)
continue
a = '--char='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
tochar = (b == 'true') or (b != '0')
continue
a = '--v='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
verbose=0
try:
verbose=int(b)
except:
if b == 'true' or b != '0':
verbose = 1
continue
a = '--padding-symbol='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
if b == 'space':
padding_symbol= ' '
elif b == 'underline':
padding_symbol= '_'
continue
if True or sys.argv[1].startswith('-'):
#ignore invalid switch
del sys.argv[1]
continue
if not case_sensitive:
ig=set([w.upper() for w in ignore_words])
ignore_words = ig
default_clusters = {}
default_words = {}
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
rec_set = {}
if split and not case_sensitive:
newsplit = dict()
for w in split:
words = split[w]
for i in range(len(words)):
words[i] = words[i].upper()
newsplit[w.upper()] = words
split = newsplit
with codecs.open(hyp_file, 'r', 'utf-8') as fh:
for line in fh:
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array)==0: continue
fid = array[0]
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
# compute error rate on the interaction of reference file and hyp file
for line in open(ref_file, 'r', encoding='utf-8') :
if tochar:
array = characterize(line)
else:
array = line.rstrip('\n').split()
if len(array)==0: continue
fid = array[0]
if fid not in rec_set:
continue
lab = normalize(array[1:], ignore_words, case_sensitive, split)
rec = rec_set[fid]
if verbose:
print('\nutt: %s' % fid)
for word in rec + lab :
if word not in default_words :
default_cluster_name = default_cluster(word)
if default_cluster_name not in default_clusters :
default_clusters[default_cluster_name] = {}
if word not in default_clusters[default_cluster_name] :
default_clusters[default_cluster_name][word] = 1
default_words[word] = default_cluster_name
result = calculator.calculate(lab, rec)
if verbose:
if result['all'] != 0 :
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
else :
wer = 0.0
print('WER: %4.2f %%' % wer, end = ' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
space = {}
space['lab'] = []
space['rec'] = []
for idx in range(len(result['lab'])) :
len_lab = width(result['lab'][idx])
len_rec = width(result['rec'][idx])
length = max(len_lab, len_rec)
space['lab'].append(length-len_lab)
space['rec'].append(length-len_rec)
upper_lab = len(result['lab'])
upper_rec = len(result['rec'])
lab1, rec1 = 0, 0
while lab1 < upper_lab or rec1 < upper_rec:
if verbose > 1:
print('lab(%s):' % fid.encode('utf-8'), end = ' ')
else:
print('lab:', end = ' ')
lab2 = min(upper_lab, lab1 + max_words_per_line)
for idx in range(lab1, lab2):
token = result['lab'][idx]
print('{token}'.format(token = token), end = '')
for n in range(space['lab'][idx]) :
print(padding_symbol, end = '')
print(' ',end='')
print()
if verbose > 1:
print('rec(%s):' % fid.encode('utf-8'), end = ' ')
else:
print('rec:', end = ' ')
rec2 = min(upper_rec, rec1 + max_words_per_line)
for idx in range(rec1, rec2):
token = result['rec'][idx]
print('{token}'.format(token = token), end = '')
for n in range(space['rec'][idx]) :
print(padding_symbol, end = '')
print(' ',end='')
print('\n', end='\n')
lab1 = lab2
rec1 = rec2
if verbose:
print('===========================================================================')
print()
result = calculator.overall()
if result['all'] != 0 :
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
else :
wer = 0.0
print('Overall -> %4.2f %%' % wer, end = ' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
if not verbose:
print()
if verbose:
for cluster_id in default_clusters :
result = calculator.cluster([ k for k in default_clusters[cluster_id] ])
if result['all'] != 0 :
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
else :
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
if len(cluster_file) > 0 : # compute separated WERs for word clusters
cluster_id = ''
cluster = []
for line in open(cluster_file, 'r', encoding='utf-8') :
for token in line.rstrip('\n').split() :
# end of cluster reached, like </Keyword>
if token[0:2] == '</' and token[len(token)-1] == '>' and \
token.lstrip('</').rstrip('>') == cluster_id :
result = calculator.cluster(cluster)
if result['all'] != 0 :
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
else :
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
cluster_id = ''
cluster = []
# begin of cluster reached, like <Keyword>
elif token[0] == '<' and token[len(token)-1] == '>' and \
cluster_id == '' :
cluster_id = token.lstrip('<').rstrip('>')
cluster = []
# general terms, like WEATHER / CAR / ...
else :
cluster.append(token)
print()
print('===========================================================================')
#!/usr/bin/env python3
# encoding: utf-8
import sys
import argparse
import json
import codecs
import yaml
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
from torch.utils.data import Dataset, DataLoader
torchaudio.set_audio_backend("sox_io")
class CollateFunc(object):
''' Collate function for AudioDataset
'''
def __init__(self, feat_dim, resample_rate):
self.feat_dim = feat_dim
self.resample_rate = resample_rate
pass
def __call__(self, batch):
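# Accumulate sufficient statistics for global CMVN over this batch:
# per-dim feature sums (mean_stat), per-dim squared-feature sums (var_stat)
# and the total frame count (number).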
mean_stat = torch.zeros(self.feat_dim)
var_stat = torch.zeros(self.feat_dim)
number = 0
for item in batch:
value = item[1].strip().split(",")
assert len(value) == 3 or len(value) == 1
wav_path = value[0]
sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate
resample_rate = sample_rate
# len(value) == 3 means segmented wav.scp,
# len(value) == 1 means original wav.scp
if len(value) == 3:
start_frame = int(float(value[1]) * sample_rate)
end_frame = int(float(value[2]) * sample_rate)
waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
filepath=wav_path,
num_frames=end_frame - start_frame,
frame_offset=start_frame)
else:
waveform, sample_rate = torchaudio.load(item[1])
waveform = waveform * (1 << 15)
if self.resample_rate != 0 and self.resample_rate != sample_rate:
resample_rate = self.resample_rate
waveform = torchaudio.transforms.Resample(
orig_freq=sample_rate, new_freq=resample_rate)(waveform)
mat = kaldi.fbank(waveform,
num_mel_bins=self.feat_dim,
dither=0.0,
energy_floor=0.0,
sample_frequency=resample_rate)
mean_stat += torch.sum(mat, axis=0)
var_stat += torch.sum(torch.square(mat), axis=0)
number += mat.shape[0]
return number, mean_stat, var_stat
class AudioDataset(Dataset):
def __init__(self, data_file):
self.items = []
with codecs.open(data_file, 'r', encoding='utf-8') as f:
for line in f:
arr = line.strip().split()
self.items.append((arr[0], arr[1]))
def __len__(self):
return len(self.items)
def __getitem__(self, idx):
return self.items[idx]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='extract CMVN stats')
parser.add_argument('--num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
parser.add_argument('--train_config',
default='',
help='training yaml conf')
parser.add_argument('--in_scp', default=None, help='wav scp file')
parser.add_argument('--out_cmvn',
default='global_cmvn',
help='global cmvn file')
doc = "Print log after every log_interval audios are processed."
parser.add_argument("--log_interval", type=int, default=1000, help=doc)
args = parser.parse_args()
with open(args.train_config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins']
resample_rate = 0
if 'resample_conf' in configs['dataset_conf']:
resample_rate = configs['dataset_conf']['resample_conf']['resample_rate']
print('using resample and new sample rate is {}'.format(resample_rate))
collate_func = CollateFunc(feat_dim, resample_rate)
dataset = AudioDataset(args.in_scp)
batch_size = 20
data_loader = DataLoader(dataset,
batch_size=batch_size,
shuffle=True,
sampler=None,
num_workers=args.num_workers,
collate_fn=collate_func)
with torch.no_grad():
all_number = 0
all_mean_stat = torch.zeros(feat_dim)
all_var_stat = torch.zeros(feat_dim)
wav_number = 0
for i, batch in enumerate(data_loader):
number, mean_stat, var_stat = batch
all_mean_stat += mean_stat
all_var_stat += var_stat
all_number += number
wav_number += batch_size
if wav_number % args.log_interval == 0:
print(f'processed {wav_number} wavs, {all_number} frames',
file=sys.stderr,
flush=True)
cmvn_info = {
'mean_stat': list(all_mean_stat.tolist()),
'var_stat': list(all_var_stat.tolist()),
'frame_num': all_number
}
with open(args.out_cmvn, 'w') as fout:
fout.write(json.dumps(cmvn_info))
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import torchaudio
import torchaudio.compliance.kaldi as kaldi
import wenet.dataset.kaldi_io as kaldi_io
# The "sox" backends are deprecated and will be removed in 0.9.0 release.
# So here we use sox_io backend
torchaudio.set_audio_backend("sox_io")
def parse_opts():
parser = argparse.ArgumentParser(description='training your network')
parser.add_argument('--num_mel_bins',
default=80,
type=int,
help='Number of triangular mel-frequency bins')
parser.add_argument('--frame_length',
type=int,
default=25,
help='Frame length in milliseconds')
parser.add_argument('--frame_shift',
type=int,
default=10,
help='Frame shift in milliseconds')
parser.add_argument('--dither',
type=float,
default=0.0,
help='Dithering constant (0.0 means no dither)')
parser.add_argument('--segments', default=None, help='segments file')
parser.add_argument('wav_scp', help='wav scp file')
parser.add_argument('out_ark', help='output ark file')
parser.add_argument('out_scp', help='output scp file')
args = parser.parse_args()
return args
# wav format: <key> <wav_path>
def load_wav_scp(wav_scp_file):
wav_list = []
with open(wav_scp_file, 'r', encoding='utf8') as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 2
wav_list.append((arr[0], arr[1]))
return wav_list
# wav format: <key> <wav_path>
def load_wav_scp_dict(wav_scp_file):
wav_dict = {}
with open(wav_scp_file, 'r', encoding='utf8') as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 2
wav_dict[arr[0]] = arr[1]
return wav_dict
# Segments format: <key> <wav_key> <start> <end>
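# e.g. (hypothetical line): utt_0001 rec_0001 0.00 3.25   (times in seconds)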
def load_wav_segments(wav_scp_file, segments_file):
wav_dict = load_wav_scp_dict(wav_scp_file)
audio_list = []
with open(segments_file, 'r', encoding='utf8') as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 4
key = arr[0]
wav_file = wav_dict[arr[1]]
start = float(arr[2])
end = float(arr[3])
audio_list.append((key, wav_file, start, end))
return audio_list
if __name__ == '__main__':
args = parse_opts()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
if args.segments is None:
audio_list = load_wav_scp(args.wav_scp)
else:
audio_list = load_wav_segments(args.wav_scp, args.segments)
count = 0
with open(args.out_ark, 'wb') as ark_fout, \
open(args.out_scp, 'w', encoding='utf8') as scp_fout:
for item in audio_list:
if len(item) == 2:
key, wav_path = item
waveform, sample_rate = torchaudio.load_wav(wav_path)
else:
assert len(item) == 4
key, wav_path, start, end = item
sample_rate = torchaudio.info(wav_path).sample_rate
frame_offset = int(start * sample_rate)
num_frames = int((end - start) * sample_rate)
waveform, sample_rate = torchaudio.load_wav(
wav_path, frame_offset, num_frames)
mat = kaldi.fbank(waveform,
num_mel_bins=args.num_mel_bins,
frame_length=args.frame_length,
frame_shift=args.frame_shift,
dither=args.dither,
energy_floor=0.0,
sample_frequency=sample_rate)
mat = mat.detach().numpy()
kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout)
count += 1
if count % 10000 == 0:
logging.info('Progress {}/{}'.format(count, len(audio_list)))
#!/bin/bash
# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
# feats.scp
# wav.scp
# vad.scp
# spk2utt
# utt2spk
# text
#
# It copies to another directory, possibly adding a specified prefix or a suffix
# to the utterance and/or speaker names. Note, the recording-ids stay the same.
#
# begin configuration section
spk_prefix=
utt_prefix=
spk_suffix=
utt_suffix=
validate_opts= # should rarely be needed.
# end configuration section
. utils/parse_options.sh
if [ $# != 2 ]; then
echo "Usage: "
echo " $0 [options] <srcdir> <destdir>"
echo "e.g.:"
echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
echo "Options"
echo " --spk-prefix=<prefix> # Prefix for speaker ids, default empty"
echo " --utt-prefix=<prefix> # Prefix for utterance ids, default empty"
echo " --spk-suffix=<suffix> # Suffix for speaker ids, default empty"
echo " --utt-suffix=<suffix> # Suffix for utterance ids, default empty"
exit 1;
fi
export LC_ALL=C
srcdir=$1
destdir=$2
if [ ! -f $srcdir/utt2spk ]; then
echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
exit 1;
fi
if [ "$destdir" == "$srcdir" ]; then
echo "$0: this script requires <srcdir> and <destdir> to be different."
exit 1
fi
set -e;
mkdir -p $destdir
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map
if [ ! -f $srcdir/utt2uniq ]; then
if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
fi
else
cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
fi
cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
if [ -f $srcdir/feats.scp ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
fi
if [ -f $srcdir/vad.scp ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
fi
if [ -f $srcdir/segments ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
cp $srcdir/wav.scp $destdir
else # no segments->wav indexed by utt.
if [ -f $srcdir/wav.scp ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp
fi
fi
if [ -f $srcdir/reco2file_and_channel ]; then
cp $srcdir/reco2file_and_channel $destdir/
fi
if [ -f $srcdir/text ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
fi
if [ -f $srcdir/utt2dur ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur
fi
if [ -f $srcdir/utt2num_frames ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames
fi
if [ -f $srcdir/reco2dur ]; then
if [ -f $srcdir/segments ]; then
cp $srcdir/reco2dur $destdir/reco2dur
else
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur
fi
fi
if [ -f $srcdir/spk2gender ]; then
utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
fi
if [ -f $srcdir/cmvn.scp ]; then
utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp
fi
for f in frame_shift stm glm ctm; do
if [ -f $srcdir/$f ]; then
cp $srcdir/$f $destdir
fi
done
rm $destdir/spk_map $destdir/utt_map
echo "$0: copied data from $srcdir to $destdir"
for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do
if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then
echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to"
echo " ... $destdir/.backup/$f"
mkdir -p $destdir/.backup
mv $destdir/$f $destdir/.backup/
fi
done
[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats"
[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text"
echo $validate_opts
echo $destdir
utils/validate_data_dir.sh $validate_opts $destdir
#!/usr/bin/env bash
# Script taken from kaldi repo:
# https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/utils/data/remove_dup_utts.sh
# Remove excess utterances once they appear more than a specified
# number of times with the same transcription, in a data set.
# E.g. useful for removing excess "uh-huh" from training.
if [ $# != 3 ]; then
echo "Usage: remove_dup_utts.sh max-count <src-data-dir> <dest-data-dir>"
echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup"
echo "This script is used to filter out utterances that come from over-represented"
echo "transcriptions (such as 'uh-huh'), by limiting the number of repetitions of"
echo "any given word-sequence to a specified value. It's often used to get"
echo "subsets for early stages of training."
exit 1;
fi
maxcount=$1
srcdir=$2
destdir=$3
mkdir -p $destdir
[ ! -f $srcdir/text ] && echo "$0: Invalid input directory $srcdir" && exit 1;
! mkdir -p $destdir && echo "$0: could not create directory $destdir" && exit 1;
! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1;
cp $srcdir/* $destdir
cat $srcdir/text | \
perl -e '
$maxcount = shift @ARGV;
@all = ();
$p1 = 103349; $p2 = 71147; $k = 0;
sub random { # our own random number generator: predictable.
$k = ($k + $p1) % $p2;
return ($k / $p2);
}
while(<>) {
push @all, $_;
@A = split(" ", $_);
shift @A;
$text = join(" ", @A);
$count{$text} ++;
}
foreach $line (@all) {
@A = split(" ", $line);
shift @A;
$text = join(" ", @A);
$n = $count{$text};
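# Keep a line of an over-represented transcript with probability maxcount/n,
# so about maxcount copies survive (using the deterministic "random" above).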
if ($n < $maxcount || random() < ($maxcount / $n)) {
print $line;
}
}' $maxcount >$destdir/text
echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`"
# Not doing these steps as this script doesn't exist
# + the calling script already validates data
#echo "Using fix_data_dir.sh to reconcile the other files."
#utils/fix_data_dir.sh $destdir
#rm -r $destdir/.backup
exit 0
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# See ../../COPYING for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can. If you use the utt2spk
# option it will make sure these chunks coincide with speaker boundaries. In
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]
use warnings;
$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$one_based = 0;
for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
if ($ARGV[0] eq "-j") {
shift @ARGV;
$num_jobs = shift @ARGV;
$job_id = shift @ARGV;
}
if ($ARGV[0] =~ /--utt2spk=(.+)/) {
$utt2spk_file=$1;
shift;
}
if ($ARGV[0] eq '--one-based') {
$one_based = 1;
shift @ARGV;
}
}
if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
$job_id - $one_based >= $num_jobs)) {
die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
($one_based ? " --one-based" : "") . "'\n"
}
$one_based
and $job_id--;
if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
die
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
}
$error = 0;
$inscp = shift @ARGV;
if ($num_jobs == 0) { # without -j option
@OUTPUTS = @ARGV;
} else {
for ($j = 0; $j < $num_jobs; $j++) {
if ($j == $job_id) {
if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
else { push @OUTPUTS, "-"; }
} else {
push @OUTPUTS, "/dev/null";
}
}
}
if ($utt2spk_file ne "") { # We have the --utt2spk option...
open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
while(<$u_fh>) {
@A = split;
@A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
($u,$s) = @A;
$utt2spk{$u} = $s;
}
close $u_fh;
open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
@spkrs = ();
while(<$i_fh>) {
@A = split;
if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
$u = $A[0];
$s = $utt2spk{$u};
defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
if(!defined $spk_count{$s}) {
push @spkrs, $s;
$spk_count{$s} = 0;
$spk_data{$s} = []; # ref to new empty array.
}
$spk_count{$s}++;
push @{$spk_data{$s}}, $_;
}
# Now split as equally as possible ..
# First allocate spks to files by allocating an approximately
# equal number of speakers.
$numspks = @spkrs; # number of speakers.
$numscps = @OUTPUTS; # number of output files.
if ($numspks < $numscps) {
die "$0: Refusing to split data because number of speakers $numspks " .
"is less than the number of output .scp files $numscps\n";
}
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scparray[$scpidx] = []; # [] is array reference.
}
for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
$scpidx = int(($spkidx*$numscps) / $numspks);
$spk = $spkrs[$spkidx];
push @{$scparray[$scpidx]}, $spk;
$scpcount[$scpidx] += $spk_count{$spk};
}
# Now will try to reassign beginning + ending speakers
# to different scp's and see if it gets more balanced.
# Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
# We can show that if considering changing just 2 scp's, we minimize
# this by minimizing the squared difference in sizes. This is
# equivalent to minimizing the absolute difference in sizes. This
# shows this method is bound to converge.
$changed = 1;
while($changed) {
$changed = 0;
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
# First try to reassign ending spk of this scp.
if($scpidx < $numscps-1) {
$sz = @{$scparray[$scpidx]};
if($sz > 0) {
$spk = $scparray[$scpidx]->[$sz-1];
$count = $spk_count{$spk};
$nutt1 = $scpcount[$scpidx];
$nutt2 = $scpcount[$scpidx+1];
if( abs( ($nutt2+$count) - ($nutt1-$count))
< abs($nutt2 - $nutt1)) { # Would decrease
# size-diff by reassigning spk...
$scpcount[$scpidx+1] += $count;
$scpcount[$scpidx] -= $count;
pop @{$scparray[$scpidx]};
unshift @{$scparray[$scpidx+1]}, $spk;
$changed = 1;
}
}
}
if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
$spk = $scparray[$scpidx]->[0];
$count = $spk_count{$spk};
$nutt1 = $scpcount[$scpidx-1];
$nutt2 = $scpcount[$scpidx];
if( abs( ($nutt2-$count) - ($nutt1+$count))
< abs($nutt2 - $nutt1)) { # Would decrease
# size-diff by reassigning spk...
$scpcount[$scpidx-1] += $count;
$scpcount[$scpidx] -= $count;
shift @{$scparray[$scpidx]};
push @{$scparray[$scpidx-1]}, $spk;
$changed = 1;
}
}
}
}
# Now print out the files...
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scpfile = $OUTPUTS[$scpidx];
($scpfile ne '-' ? open($f_fh, '>', $scpfile)
: open($f_fh, '>&', \*STDOUT)) ||
die "$0: Could not open scp file $scpfile for writing: $!\n";
$count = 0;
if(@{$scparray[$scpidx]} == 0) {
print STDERR "$0: error: split_scp.pl producing empty .scp file " .
"$scpfile (too many splits and too few speakers?)\n";
$error = 1;
} else {
foreach $spk ( @{$scparray[$scpidx]} ) {
print $f_fh @{$spk_data{$spk}};
$count += $spk_count{$spk};
}
$count == $scpcount[$scpidx] || die "Count mismatch [code error]";
}
close($f_fh);
}
} else {
# This block is the "normal" case where there is no --utt2spk
# option and we just break into equal size chunks.
open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
$numscps = @OUTPUTS; # size of array.
@F = ();
while(<$i_fh>) {
push @F, $_;
}
$numlines = @F;
if($numlines == 0) {
print STDERR "$0: error: empty input scp file $inscp\n";
$error = 1;
}
$linesperscp = int( $numlines / $numscps); # the "whole part"..
$linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
$remainder = $numlines - ($linesperscp * $numscps);
($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
# [just doing int() rounds down].
$n = 0;
for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
$scpfile = $OUTPUTS[$scpidx];
($scpfile ne '-' ? open($o_fh, '>', $scpfile)
: open($o_fh, '>&', \*STDOUT)) ||
die "$0: Could not open scp file $scpfile for writing: $!\n";
for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
print $o_fh $F[$n++];
}
close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
}
$n == $numlines || die "$n != $numlines [code error]";
}
exit ($error);
#!/usr/bin/env bash
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Author: binbinzhang@mobvoi.com (Binbin Zhang)
export GLOG_logtostderr=1
export GLOG_v=2
set -e
nj=1
chunk_size=-1
ctc_weight=0.0
reverse_weight=0.0
rescoring_weight=1.0
# For CTC WFST based decoding
fst_path=
dict_path=
acoustic_scale=1.0
beam=15.0
lattice_beam=12.0
min_active=200
max_active=7000
blank_skip_thresh=1.0
length_penalty=0.0
. tools/parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "Usage: $0 [options] <wav.scp> <label_file> <model_file> <unit_file> <output_dir>"
exit 1;
fi
if ! which decoder_main > /dev/null; then
echo "decoder_main is not built, please go to runtime/libtorch to build it."
exit 1;
fi
scp=$1
label_file=$2
model_file=$3
unit_file=$4
dir=$5
mkdir -p $dir/split${nj}
# Step 1. Split wav.scp
split_scps=""
for n in $(seq ${nj}); do
split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp"
done
tools/data/split_scp.pl ${scp} ${split_scps}
# Step 2. Parallel decoding
wfst_decode_opts=
if [ ! -z $fst_path ]; then
wfst_decode_opts="--fst_path $fst_path"
wfst_decode_opts="$wfst_decode_opts --beam $beam"
wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path"
wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam"
wfst_decode_opts="$wfst_decode_opts --max_active $max_active"
wfst_decode_opts="$wfst_decode_opts --min_active $min_active"
wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale"
wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh"
wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty"
echo $wfst_decode_opts > $dir/config
fi
for n in $(seq ${nj}); do
{
decoder_main \
--rescoring_weight $rescoring_weight \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--chunk_size $chunk_size \
--wav_scp ${dir}/split${nj}/wav.${n}.scp \
--model_path $model_file \
--unit_path $unit_file \
$wfst_decode_opts \
--result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log
} &
done
wait
# Step 3. Merge files
for n in $(seq ${nj}); do
cat ${dir}/split${nj}/${n}.text
done > ${dir}/text
tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf
# Step 4. Compute WER
python3 tools/compute-wer.py --char=1 --v=1 \
$label_file $dir/text > $dir/wer
#!/bin/bash
# Begin configuration section.
nj=4
cmd=run.pl
verbose=0
filetype=""
preprocess_conf=""
# End configuration section.
help_message=$(cat << EOF
Usage: $0 [options] <input-scp> <output-scp> [<log-dir>]
e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log
Options:
--nj <nj> # number of parallel jobs
--cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
--filetype <mat|hdf5|sound.hdf5> # Specify the format of feats file
--preprocess-conf <json> # Apply preprocess to feats when creating shape.scp
--verbose <num> # Default: 0
EOF
)
echo "$0 $*" 1>&2 # Print the command line for logging
. parse_options.sh || exit 1;
if [ $# -lt 2 ] || [ $# -gt 3 ]; then
echo "${help_message}" 1>&2
exit 1;
fi
set -euo pipefail
scp=$1
outscp=$2
data=$(dirname ${scp})
if [ $# -eq 3 ]; then
logdir=$3
else
logdir=${data}/log
fi
mkdir -p ${logdir}
split_scps=""
for n in $(seq ${nj}); do
split_scps="${split_scps} ${logdir}/feats.${n}.scp"
done
utils/split_scp.pl ${scp} ${split_scps}
if [ -n "${preprocess_conf}" ]; then
preprocess_opt="--preprocess-conf ${preprocess_conf}"
else
preprocess_opt=""
fi
if [ -n "${filetype}" ]; then
filetype_opt="--filetype ${filetype}"
else
filetype_opt=""
fi
${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \
feat-to-len --verbose=${verbose} \
scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp
feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -)
# concatenate the .scp files together.
for n in $(seq ${nj}); do
sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp
done > ${outscp}
rm -f ${logdir}/feats.*.scp 2>/dev/null
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 1;
$shifted = 0;
do {
$shifted=0;
if ($ARGV[0] eq "--exclude") {
$exclude = 1;
shift @ARGV;
$shifted=1;
}
if ($ARGV[0] eq "-f") {
$field = $ARGV[1];
shift @ARGV; shift @ARGV;
$shifted=1
}
} while ($shifted);
if(@ARGV < 1 || @ARGV > 2) {
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
"only the lines that were *not* in id_list.\n" .
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
"-f option, add 1 to the argument.\n" .
"See also: utils/filter_scp.pl .\n";
}
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
while(<>) {
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
# $1 is what we filter on.
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
print $_;
}
}
} else {
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
print $_;
}
}
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
#!/bin/bash
# This script makes sure that only the segments present in
# all of "feats.scp", "wav.scp" [if present], segments [if present]
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup
cmd="$@"
utt_extra_files=
spk_extra_files=
. tools/parse_options.sh
if [ $# != 1 ]; then
echo "Usage: utils/data/fix_data_dir.sh <data-dir>"
echo "e.g.: utils/data/fix_data_dir.sh data/train"
echo "This script helps ensure that the various files in a data directory"
echo "are correctly sorted and filtered, for example removing utterances"
echo "that have no features (if feats.scp is present)"
exit 1
fi
data=$1
if [ -f $data/images.scp ]; then
image/fix_data_dir.sh $cmd
exit $?
fi
[ ! -d $data ] && echo "$0: no such directory $data" && exit 1;
[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1;
mkdir -p $data/.backup
set -e -o pipefail -u
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
export LC_ALL=C
function check_sorted {
file=$1
sort -k1,1 -u <$file >$file.tmp
if ! cmp -s $file $file.tmp; then
echo "$0: file $1 is not in sorted order or not unique, sorting it"
mv $file.tmp $file
else
rm $file.tmp
fi
}
for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
check_sorted $data/$x
fi
done
function filter_file {
filter=$1
file_to_filter=$2
cp $file_to_filter ${file_to_filter}.tmp
tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter
if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then
length1=$(cat ${file_to_filter}.tmp | wc -l)
length2=$(cat ${file_to_filter} | wc -l)
if [ $length1 -ne $length2 ]; then
echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter."
fi
fi
rm $file_to_filter.tmp
}
function filter_recordings {
# We call this once before the stage when we filter on utterance-id, and once
# after.
if [ -f $data/segments ]; then
# We have a segments file -> we need to filter this and the file wav.scp, and
# reco2file_and_utt, if it exists, to make sure they have the same list of
# recording-ids.
if [ ! -f $data/wav.scp ]; then
echo "$0: $data/segments exists but not $data/wav.scp"
exit 1;
fi
awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings
n1=$(cat $tmpdir/recordings | wc -l)
[ ! -s $tmpdir/recordings ] && \
echo "Empty list of recordings (bad file $data/segments)?" && exit 1;
tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp
mv $tmpdir/recordings.tmp $tmpdir/recordings
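# Note: the awk command below temporarily swaps the first two fields so that the
# recording-id comes first, letting filter_scp.pl filter segments on it; the
# second, identical swap after filtering restores the original field order.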
cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments
filter_file $tmpdir/recordings $data/segments
cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments
rm $data/segments.tmp
filter_file $tmpdir/recordings $data/wav.scp
[ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel
[ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur
true
fi
}
function filter_speakers {
# throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
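# First narrow the speaker list itself: here the per-speaker file acts as the
# filter (note the argument order), so speakers missing from cmvn.scp or
# spk2gender are dropped from $tmpdir/speakers. The second loop below then
# filters those files down to the surviving speakers.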
for s in cmvn.scp spk2gender; do
f=$data/$s
if [ -f $f ]; then
filter_file $f $tmpdir/speakers
fi
done
filter_file $tmpdir/speakers $data/spk2utt
tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk
for s in cmvn.scp spk2gender $spk_extra_files; do
f=$data/$s
if [ -f $f ]; then
filter_file $tmpdir/speakers $f
fi
done
}
function filter_utts {
cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
echo "$(cat $tmpdir/utts | wc -l)"
! cat $data/utt2spk | sort | cmp - $data/utt2spk && \
echo "utt2spk is not in sorted order (fix this yourself)" && exit 1;
! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \
echo "utt2spk is not in sorted order when sorted first on speaker-id " && \
echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
! cat $data/spk2utt | sort | cmp - $data/spk2utt && \
echo "spk2utt is not in sorted order (fix this yourself)" && exit 1;
if [ -f $data/utt2uniq ]; then
! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \
echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1;
fi
maybe_wav=
maybe_reco2dur=
[ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
[ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts
maybe_utt2dur=
if [ -f $data/utt2dur ]; then
cat $data/utt2dur | \
awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1
maybe_utt2dur=utt2dur.ok
fi
maybe_utt2num_frames=
if [ -f $data/utt2num_frames ]; then
cat $data/utt2num_frames | \
awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1
maybe_utt2num_frames=utt2num_frames.ok
fi
for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do
if [ -f $data/$x ]; then
tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)"
mv $tmpdir/utts.tmp $tmpdir/utts
# echo "$tmpdir/utts"
fi
done
rm $data/utt2dur.ok 2>/dev/null || true
rm $data/utt2num_frames.ok 2>/dev/null || true
[ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \
rm $tmpdir/utts && exit 1;
if [ -f $data/utt2spk ]; then
new_nutts=$(cat $tmpdir/utts | wc -l)
old_nutts=$(cat $data/utt2spk | wc -l)
if [ $new_nutts -ne $old_nutts ]; then
echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts"
else
echo "fix_data_dir.sh: kept all $old_nutts utterances."
fi
fi
for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then
tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x
fi
fi
done
}
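# Recordings and speakers are filtered both before and after the utterance-level
# pass: filter_utts can drop utterances, which may leave speakers or recordings
# with no remaining utterances, so filter_speakers and filter_recordings are run
# again afterwards to clean those up.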
filter_recordings
filter_speakers
filter_utts
filter_speakers
filter_recordings
tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt
echo "fix_data_dir.sh: old files are kept in $data/.backup"
#!/usr/bin/env python3
# encoding: utf-8
import sys
from flake8.main import git
if __name__ == '__main__':
sys.exit(
git.hook(
strict=True,
lazy=git.config_for('lazy'),
)
)
#!/bin/bash
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Mobvoi Corporation (Author: Di Wu)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh
nj=1
cmd=run.pl
nlsyms=""
lang=""
feat=""
feat_type="kaldi"
oov="<unk>"
bpecode=""
allow_one_column=false
raw=""
verbose=0
trans_type=char
filetype=""
preprocess_conf=""
category=""
out="" # If omitted, write in stdout
help_message=$(cat << EOF
Usage: $0 <data-dir> <dict>
e.g. $0 data/train data/lang_1char/train_units.txt
Options:
--nj <nj> # number of parallel jobs
--cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
--feat <feat-scp> # feat.scp or feat1.scp,feat2.scp,...
--feat-type <feat-type> # kaldi or wav
--oov <oov-word> # Default: <unk>
--out <outputfile>                               # If omitted, write to stdout
--filetype <mat|hdf5|sound.hdf5> # Specify the format of feats file
--preprocess-conf <json> # Apply preprocess to feats when creating shape.scp
--verbose <num> # Default: 0
EOF
)
. tools/parse_options.sh
if [ $# != 2 ]; then
echo "${help_message}" 1>&2
exit 1;
fi
set -euo pipefail
dir=$1
dic=$2
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
#trap 'rm -rf ${tmpdir}' EXIT
# 1. Create scp files for inputs
# These are not necessary in decoding mode, so they are made optional.
input=
if [ -n "${feat}" ]; then
_feat_scps=$(echo "${feat}" | tr ',' ' ' )
read -r -a feat_scps <<< $_feat_scps
num_feats=${#feat_scps[@]}
for (( i=1; i<=num_feats; i++ )); do
feat=${feat_scps[$((i-1))]}
mkdir -p ${tmpdir}/input_${i}
input+="input_${i} "
cat ${feat} > ${tmpdir}/input_${i}/feat.scp
# Dump in the "legacy" style JSON format
if [ -n "${filetype}" ]; then
awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \
> ${tmpdir}/input_${i}/filetype.scp
fi
if [ ${feat_type} == "kaldi" ]; then
tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \
--filetype "${filetype}" \
--preprocess-conf "${preprocess_conf}" \
--verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp
elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then
if [ -f $dir/segments ]; then
# used for segmented wav.scp
awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur
fi
if [ ! -f $dir/utt2dur ]; then
tools/wav_to_duration.sh --nj ${nj} \
${feat} ${tmpdir}/input_${i}/shape.scp
else
# use the existing utt2dur as shape.scp directly
cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp
fi
fi
done
fi
# 2. Create scp files for outputs
mkdir -p ${tmpdir}/output
if [ -n "${bpecode}" ]; then
if [ "${trans_type}" == "cn_char_en_bpe" ]; then
tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
else
paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \
| tools/spm_encode --model=${bpecode} --output_format=piece) \
> ${tmpdir}/output/token.scp
fi
elif [ -n "${nlsyms}" ]; then
tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
elif [ -n "${raw}" ]; then
cat $dir/text > ${tmpdir}/output/token.scp
else
tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
fi
< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
odim=$(cat ${dic} | wc -l)
< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp
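# e.g. (hypothetical ids) a tokenid.scp line "utt1 23 7 105" with odim=4233
# yields the shape.scp line "utt1 3,4233": NF-1 is the token count, odim the
# dictionary size.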
cat ${dir}/text > ${tmpdir}/output/text.scp
# 3. Create scp files for the others
mkdir -p ${tmpdir}/other
if [ -n "${lang}" ]; then
awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp
fi
if [ -n "${category}" ]; then
awk -v category=${category} '{print $1 " " category}' ${dir}/text \
> ${tmpdir}/other/category.scp
fi
#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp
# 4. Merge the scp files into one file
opts=""
for intype in ${input} output other; do
if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then
continue
fi
if [ ${intype} != other ]; then
opts+="--${intype%_*}-scps "
else
opts+="--scps "
fi
for x in "${tmpdir}/${intype}"/*.scp; do
k=$(basename ${x} .scp)
if [ ${k} = shape ]; then
opts+="shape:${x}:shape "
else
opts+="${k}:${x} "
fi
done
done
if ${allow_one_column}; then
opts+="--allow-one-column true "
else
opts+="--allow-one-column false "
fi
if [ -n "${out}" ]; then
opts+="-O ${out}"
fi
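# At this point ${opts} might look roughly like (hypothetical paths, wav input, no --out):
#   --input-scps feat:${tmpdir}/input_1/feat.scp shape:${tmpdir}/input_1/shape.scp:shape
#   --output-scps shape:${tmpdir}/output/shape.scp:shape text:${tmpdir}/output/text.scp
#                 token:${tmpdir}/output/token.scp tokenid:${tmpdir}/output/tokenid.scp
#   --allow-one-column false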
tools/merge_scp2txt.py --verbose ${verbose} ${opts}
#rm -fr ${tmpdir}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# 2013-2016 Johns Hopkins University (author: Daniel Povey)
# 2015 Hainan Xu
# 2015 Guoguo Chen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# is reserved for use in the grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.
$pron_probs = 0;
$sil_probs = 0;
$first_allowed_disambig = 1;
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
if ($ARGV[0] eq "--pron-probs") {
$pron_probs = 1;
shift @ARGV;
}
if ($ARGV[0] eq "--sil-probs") {
$sil_probs = 1;
shift @ARGV;
}
if ($ARGV[0] eq "--first-allowed-disambig") {
$first_allowed_disambig = 0 + $ARGV[1];
if ($first_allowed_disambig < 1) {
die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
}
shift @ARGV;
shift @ARGV;
}
}
if (@ARGV != 2) {
die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
"This script adds disambiguation symbols to a lexicon in order to\n" .
"make decoding graphs determinizable; it adds pseudo-phone\n" .
"disambiguation symbols #1, #2 and so on at the ends of phones\n" .
"to ensure that all pronunciations are different, and that none\n" .
"is a prefix of another.\n" .
"It prints to the standard output the number of the largest-numbered" .
"disambiguation symbol that was used.\n" .
"\n" .
"Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" .
" --sil-probs [should be with --pron-probs option]\n" .
" Expect 3 extra fields after the pron-probs, for aspects of\n" .
" the silence probability model\n" .
" --first-allowed-disambig <n> The number of the first disambiguation symbol\n" .
" that this script is allowed to add. By default this is\n" .
" #1, but you can set this to a larger value using this option.\n" .
"e.g.:\n" .
" add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
" add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
" add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}
$lexfn = shift @ARGV;
$lexoutfn = shift @ARGV;
open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
# (1) Read in the lexicon.
@L = ( );
while(<L>) {
@A = split(" ", $_);
push @L, join(" ", @A);
}
# (2) Work out the count of each phone-sequence in the
# lexicon.
foreach $l (@L) {
@A = split(" ", $l);
shift @A; # Remove word.
if ($pron_probs) {
$p = shift @A;
if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
}
if ($sil_probs) {
$silp = shift @A;
if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
$correction = shift @A;
if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
$correction = shift @A;
if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
}
if (!(@A)) {
die "Bad lexicon line $1, no phone in phone list";
}
$count{join(" ",@A)}++;
}
# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).
foreach $l (@L) {
@A = split(" ", $l);
shift @A; # Remove word.
if ($pron_probs) { shift @A; } # remove pron-prob.
if ($sil_probs) {
shift @A; # Remove silprob
shift @A; # Remove silprob
}
while(@A > 0) {
pop @A; # Remove last phone
$issubseq{join(" ",@A)} = 1;
}
}
# (4) For each entry in the lexicon:
# if the phone sequence is unique and is not a
# prefix of another word, no disambig symbol.
# Else output #1, or #2, #3, ... if the same phone-seq
# has already been assigned a disambig symbol.
open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;
foreach $l (@L) {
@A = split(" ", $l);
$word = shift @A;
if ($pron_probs) {
$pron_prob = shift @A;
}
if ($sil_probs) {
$sil_word_prob = shift @A;
$word_sil_correction = shift @A;
$prev_nonsil_correction = shift @A;
}
$phnseq = join(" ", @A);
if (!defined $issubseq{$phnseq}
&& $count{$phnseq} == 1) {
; # Do nothing.
} else {
if ($phnseq eq "") { # need disambig symbols for the empty string
# that are not used anywhere else.
$max_disambig++;
$reserved_for_the_empty_string{$max_disambig} = 1;
$phnseq = "#$max_disambig";
} else {
$cur_disambig = $last_used_disambig_symbol_of{$phnseq};
if (!defined $cur_disambig) {
$cur_disambig = $first_allowed_disambig;
} else {
$cur_disambig++; # Get a number that has not been used yet for
# this phone sequence.
}
while (defined $reserved_for_the_empty_string{$cur_disambig}) {
$cur_disambig++;
}
if ($cur_disambig > $max_disambig) {
$max_disambig = $cur_disambig;
}
$last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
$phnseq = $phnseq . " #" . $cur_disambig;
}
}
if ($pron_probs) {
if ($sil_probs) {
print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
} else {
print O "$word\t$pron_prob\t$phnseq\n";
}
} else {
print O "$word\t$phnseq\n";
}
}
print $max_disambig . "\n";
#!/bin/bash
# Copyright 2015 Yajie Miao (Carnegie Mellon University)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
# phoneme and character-based lexicons.
set -eo pipefail
. tools/parse_options.sh
if [ $# -ne 3 ]; then
echo "usage: tools/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
echo "<dict-src-dir> should contain the following files:"
echo "lexicon.txt units.txt"
echo "options: "
exit 1;
fi
srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir
[ -f path.sh ] && . ./path.sh
export LC_ALL=C
cp $srcdir/units.txt $dir
# Add probabilities to lexicon entries. There is in fact no point in doing this here, since all the entries have probability 1.0.
# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;
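# e.g. a hypothetical entry "HELLO h e l l o" becomes "HELLO 1.0<TAB>h e l l o".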
# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
ndisambig=$((ndisambig+1));
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list
# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>,
# the actual model unit, and the disambiguation symbols.
cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
(echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt
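# e.g. with a hypothetical units.txt containing "<blank>", "a" and "b", and ndisambig=2,
# tokens.txt would be:
#   <eps> 0
#   <blank> 1
#   a 2
#   b 3
#   #0 4
#   #1 5
#   #2 6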
# ctc_token_fst_corrected is too big and too slow for character-based Chinese modeling,
# so we use ctc_token_fst_compact here
tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \
fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;
# Encode the words with indices. Will be used in lexicon and language model FST compiling.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
BEGIN {
print "<eps> 0";
}
{
printf("%s %d\n", $1, NR);
}
END {
printf("#0 %d\n", NR+1);
printf("<s> %d\n", NR+2);
printf("</s> %d\n", NR+3);
}' > $dir/words.txt || exit 1;
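# e.g. for a hypothetical lexicon whose only words are "a" and "b", words.txt would be:
#   <eps> 0
#   a 1
#   b 2
#   #0 3
#   <s> 4
#   </s> 5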
# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
echo "Lexicon and token FSTs compiling succeeded"