import unittest
import random
import torch
import numpy as np
from torchaudio.functional import rnnt_loss
CPU_DEVICE = torch.device("cpu")
class _NumpyTransducer(torch.autograd.Function):
@staticmethod
def forward(
ctx,
log_probs,
logit_lengths,
target_lengths,
targets,
blank=-1,
):
device = log_probs.device
log_probs = log_probs.cpu().data.numpy()
logit_lengths = logit_lengths.cpu().data.numpy()
target_lengths = target_lengths.cpu().data.numpy()
targets = targets.cpu().data.numpy()
gradients, costs, _, _ = __class__.compute(
log_probs=log_probs,
logit_lengths=logit_lengths,
target_lengths=target_lengths,
targets=targets,
blank=blank,
)
costs = torch.FloatTensor(costs).to(device=device)
gradients = torch.FloatTensor(gradients).to(device=device)
ctx.grads = torch.autograd.Variable(gradients)
return costs
@staticmethod
def backward(ctx, grad_output):
grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads)
return ctx.grads.mul(grad_output), None, None, None, None, None, None, None, None
@staticmethod
def compute_alpha_one_sequence(log_probs, targets, blank=-1):
max_T, max_U, D = log_probs.shape
alpha = np.zeros((max_T, max_U), dtype=np.float32)
for t in range(1, max_T):
alpha[t, 0] = alpha[t - 1, 0] + log_probs[t - 1, 0, blank]
for u in range(1, max_U):
alpha[0, u] = alpha[0, u - 1] + log_probs[0, u - 1, targets[u - 1]]
for t in range(1, max_T):
for u in range(1, max_U):
skip = alpha[t - 1, u] + log_probs[t - 1, u, blank]
emit = alpha[t, u - 1] + log_probs[t, u - 1, targets[u - 1]]
alpha[t, u] = np.logaddexp(skip, emit)
cost = -(alpha[-1, -1] + log_probs[-1, -1, blank])
return alpha, cost
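    # The loops above implement the RNN-T forward variable in log space
    # (Graves 2012, https://arxiv.org/abs/1211.3711):
    #   alpha(t, u) = logaddexp(alpha(t-1, u) + log_probs[t-1, u, blank],
    #                           alpha(t, u-1) + log_probs[t, u-1, targets[u-1]])
    # with alpha(0, 0) = 0. The total log-likelihood is
    # alpha(T-1, U-1) + log_probs[T-1, U-1, blank], and cost is its negation.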
@staticmethod
def compute_beta_one_sequence(log_probs, targets, blank=-1):
max_T, max_U, D = log_probs.shape
beta = np.zeros((max_T, max_U), dtype=np.float32)
beta[-1, -1] = log_probs[-1, -1, blank]
for t in reversed(range(max_T - 1)):
beta[t, -1] = beta[t + 1, -1] + log_probs[t, -1, blank]
for u in reversed(range(max_U - 1)):
beta[-1, u] = beta[-1, u + 1] + log_probs[-1, u, targets[u]]
for t in reversed(range(max_T - 1)):
for u in reversed(range(max_U - 1)):
skip = beta[t + 1, u] + log_probs[t, u, blank]
emit = beta[t, u + 1] + log_probs[t, u, targets[u]]
beta[t, u] = np.logaddexp(skip, emit)
cost = -beta[0, 0]
return beta, cost
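    # Symmetric backward recursion:
    #   beta(t, u) = logaddexp(beta(t+1, u) + log_probs[t, u, blank],
    #                          beta(t, u+1) + log_probs[t, u, targets[u]])
    # anchored at beta(T-1, U-1) = log_probs[T-1, U-1, blank]. By construction
    # beta(0, 0) equals the total log-likelihood, so the alpha and beta costs
    # must agree (checked with assert_almost_equal in compute below).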
@staticmethod
def compute_gradients_one_sequence(
log_probs, alpha, beta, targets, blank=-1
):
max_T, max_U, D = log_probs.shape
gradients = np.full(log_probs.shape, float("-inf"))
cost = -beta[0, 0]
gradients[-1, -1, blank] = alpha[-1, -1]
gradients[:-1, :, blank] = alpha[:-1, :] + beta[1:, :]
for u, l in enumerate(targets):
gradients[:, u, l] = alpha[:, u] + beta[:, u + 1]
gradients = -(np.exp(gradients + log_probs + cost))
return gradients
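    # For the negative log-likelihood, d(cost)/d(log_probs[t, u, k]) is
    #   -exp(alpha(t, u) + beta(t', u') + log_probs[t, u, k] - log Z)
    # where (t', u') is the node reached by emitting k (blank advances t, the
    # matching target label advances u) and log Z = beta(0, 0). Adding `cost`
    # (which is -beta(0, 0)) inside the exp supplies the -log Z term; entries
    # left at -inf correspond to impossible emissions and exponentiate to 0.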
@staticmethod
def compute(
log_probs,
logit_lengths,
target_lengths,
targets,
blank=-1,
):
gradients = np.zeros_like(log_probs)
B_tgt, max_T, max_U, D = log_probs.shape
B_src = logit_lengths.shape[0]
H = int(B_tgt / B_src)
alphas = np.zeros((B_tgt, max_T, max_U))
betas = np.zeros((B_tgt, max_T, max_U))
betas.fill(float("-inf"))
alphas.fill(float("-inf"))
costs = np.zeros(B_tgt)
for b_tgt in range(B_tgt):
b_src = int(b_tgt / H)
T = int(logit_lengths[b_src])
# NOTE: see https://arxiv.org/pdf/1211.3711.pdf Section 2.1
U = int(target_lengths[b_tgt]) + 1
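            # The lattice has U = target length + 1 rows: row u means that
            # u labels have been emitted so far.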
seq_log_probs = log_probs[b_tgt, :T, :U, :]
seq_targets = targets[b_tgt, : int(target_lengths[b_tgt])]
alpha, alpha_cost = __class__.compute_alpha_one_sequence(
log_probs=seq_log_probs, targets=seq_targets, blank=blank
)
beta, beta_cost = __class__.compute_beta_one_sequence(
log_probs=seq_log_probs, targets=seq_targets, blank=blank
)
seq_gradients = __class__.compute_gradients_one_sequence(
log_probs=seq_log_probs,
alpha=alpha,
beta=beta,
targets=seq_targets,
blank=blank,
)
np.testing.assert_almost_equal(alpha_cost, beta_cost, decimal=2)
gradients[b_tgt, :T, :U, :] = seq_gradients
costs[b_tgt] = beta_cost
alphas[b_tgt, :T, :U] = alpha
betas[b_tgt, :T, :U] = beta
return gradients, costs, alphas, betas
class NumpyTransducerLoss(torch.nn.Module):
def __init__(self, blank=-1):
super().__init__()
self.blank = blank
def forward(
self,
logits,
logit_lengths,
target_lengths,
targets,
):
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
return _NumpyTransducer.apply(
log_probs,
logit_lengths,
target_lengths,
targets,
self.blank,
)
def compute_with_numpy_transducer(data):
costs = NumpyTransducerLoss(
blank=data["blank"],
)(
logits=data["logits"],
logit_lengths=data["logit_lengths"],
target_lengths=data["target_lengths"],
targets=data["targets"],
)
loss = torch.sum(costs)
loss.backward()
costs = costs.cpu()
gradients = data["logits"].saved_grad.cpu()
return costs, gradients
def compute_with_pytorch_transducer(data):
costs = rnnt_loss(
logits=data["logits"],
logit_lengths=data["logit_lengths"],
target_lengths=data["target_lengths"],
targets=data["targets"],
blank=data["blank"],
reduction="none",
)
loss = torch.sum(costs)
loss.backward()
costs = costs.cpu()
gradients = data["logits"].saved_grad.cpu()
return costs, gradients
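# A minimal consistency check (a sketch, not part of the original suite): on
# the same data dict, the NumPy reference and torchaudio's rnnt_loss should
# produce matching costs and input gradients. The helper name and tolerances
# below are illustrative assumptions.
def _example_check_numpy_vs_pytorch(data):
    ref_costs, ref_gradients = compute_with_numpy_transducer(data)
    costs, gradients = compute_with_pytorch_transducer(data)
    np.testing.assert_allclose(costs.detach().numpy(), ref_costs.detach().numpy(), atol=1e-6, rtol=1e-2)
    np.testing.assert_allclose(gradients.detach().numpy(), ref_gradients.detach().numpy(), atol=1e-6, rtol=1e-2)
# e.g. _example_check_numpy_vs_pytorch(get_B1_T10_U3_D4_data(random=True))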
def get_basic_data(device):
    # Example provided in commit 6f73a2513dc784c59eec153a45f40bc528355b18
    # of https://github.com/HawkAaron/warp-transducer
logits = torch.tensor(
[
[
[
[0.1, 0.6, 0.1, 0.1, 0.1],
[0.1, 0.1, 0.6, 0.1, 0.1],
[0.1, 0.1, 0.2, 0.8, 0.1],
],
[
[0.1, 0.6, 0.1, 0.1, 0.1],
[0.1, 0.1, 0.2, 0.1, 0.1],
[0.7, 0.1, 0.2, 0.1, 0.1],
],
]
],
dtype=torch.float32,
device=device,
)
targets = torch.tensor([[1, 2]], dtype=torch.int, device=device)
logit_lengths = torch.tensor([2], dtype=torch.int, device=device)
target_lengths = torch.tensor([2], dtype=torch.int, device=device)
logits.requires_grad_(True)
return logits, targets, logit_lengths, target_lengths
def get_B1_T10_U3_D4_data(
random=False,
dtype=torch.float32,
device=CPU_DEVICE,
):
B, T, U, D = 2, 10, 3, 4
logits = torch.rand(B, T, U, D, dtype=dtype, device=device)
if not random:
logits.fill_(0.1)
logits.requires_grad_(True)
def grad_hook(grad):
logits.saved_grad = grad.clone()
logits.register_hook(grad_hook)
data = {}
data["logits"] = logits
data["logit_lengths"] = torch.tensor([10, 10], dtype=torch.int32, device=device)
data["target_lengths"] = torch.tensor([2, 2], dtype=torch.int32, device=device)
data["targets"] = torch.tensor([[1, 2], [1, 2]], dtype=torch.int32, device=device)
data["blank"] = 0
return data
def get_B1_T2_U3_D5_data(dtype=torch.float32, device=CPU_DEVICE):
logits = torch.tensor(
[
0.1,
0.6,
0.1,
0.1,
0.1,
0.1,
0.1,
0.6,
0.1,
0.1,
0.1,
0.1,
0.2,
0.8,
0.1,
0.1,
0.6,
0.1,
0.1,
0.1,
0.1,
0.1,
0.2,
0.1,
0.1,
0.7,
0.1,
0.2,
0.1,
0.1,
],
dtype=dtype,
device=device,
).reshape(1, 2, 3, 5)
logits.requires_grad_(True)
def grad_hook(grad):
logits.saved_grad = grad.clone()
logits.register_hook(grad_hook)
targets = torch.tensor([[1, 2]], dtype=torch.int32, device=device)
logit_lengths = torch.tensor([2], dtype=torch.int32, device=device)
target_lengths = torch.tensor([2], dtype=torch.int32, device=device)
blank = -1
ref_costs = torch.tensor([5.09566688538], dtype=dtype)
ref_gradients = torch.tensor(
[
0.17703132,
-0.39992708,
0.17703132,
0.17703132,
-0.13116692,
0.12247062,
0.12247062,
-0.181684,
0.12247062,
-0.1857276,
0.06269141,
0.06269141,
0.06928471,
0.12624498,
-0.32091248,
0.05456069,
-0.2182428,
0.05456069,
0.05456069,
0.05456069,
0.12073967,
0.12073967,
-0.48295838,
0.12073967,
0.12073967,
0.30741188,
0.16871123,
0.18645471,
0.16871123,
-0.83128875,
],
dtype=dtype,
).reshape(1, 2, 3, 5)
data = {
"logits": logits,
"targets": targets,
"logit_lengths": logit_lengths,
"target_lengths": target_lengths,
"blank": blank,
}
return data, ref_costs, ref_gradients
def get_B2_T4_U3_D3_data(dtype=torch.float32, device=CPU_DEVICE):
# Test from D21322854
logits = torch.tensor(
[
0.065357,
0.787530,
0.081592,
0.529716,
0.750675,
0.754135,
0.609764,
0.868140,
0.622532,
0.668522,
0.858039,
0.164539,
0.989780,
0.944298,
0.603168,
0.946783,
0.666203,
0.286882,
0.094184,
0.366674,
0.736168,
0.166680,
0.714154,
0.399400,
0.535982,
0.291821,
0.612642,
0.324241,
0.800764,
0.524106,
0.779195,
0.183314,
0.113745,
0.240222,
0.339470,
0.134160,
0.505562,
0.051597,
0.640290,
0.430733,
0.829473,
0.177467,
0.320700,
0.042883,
0.302803,
0.675178,
0.569537,
0.558474,
0.083132,
0.060165,
0.107958,
0.748615,
0.943918,
0.486356,
0.418199,
0.652408,
0.024243,
0.134582,
0.366342,
0.295830,
0.923670,
0.689929,
0.741898,
0.250005,
0.603430,
0.987289,
0.592606,
0.884672,
0.543450,
0.660770,
0.377128,
0.358021,
],
dtype=dtype,
device=device,
).reshape(2, 4, 3, 3)
logits.requires_grad_(True)
def grad_hook(grad):
logits.saved_grad = grad.clone()
logits.register_hook(grad_hook)
targets = torch.tensor([[1, 2], [1, 1]], dtype=torch.int32, device=device)
logit_lengths = torch.tensor([4, 4], dtype=torch.int32, device=device)
target_lengths = torch.tensor([2, 2], dtype=torch.int32, device=device)
blank = 0
ref_costs = torch.tensor([4.2806528590890736, 3.9384369822503591], dtype=dtype)
ref_gradients = torch.tensor(
[
-0.186844,
-0.062555,
0.249399,
-0.203377,
0.202399,
0.000977,
-0.141016,
0.079123,
0.061893,
-0.011552,
-0.081280,
0.092832,
-0.154257,
0.229433,
-0.075176,
-0.246593,
0.146405,
0.100188,
-0.012918,
-0.061593,
0.074512,
-0.055986,
0.219831,
-0.163845,
-0.497627,
0.209240,
0.288387,
0.013605,
-0.030220,
0.016615,
0.113925,
0.062781,
-0.176706,
-0.667078,
0.367659,
0.299419,
-0.356344,
-0.055347,
0.411691,
-0.096922,
0.029459,
0.067463,
-0.063518,
0.027654,
0.035863,
-0.154499,
-0.073942,
0.228441,
-0.166790,
-0.000088,
0.166878,
-0.172370,
0.105565,
0.066804,
0.023875,
-0.118256,
0.094381,
-0.104707,
-0.108934,
0.213642,
-0.369844,
0.180118,
0.189726,
0.025714,
-0.079462,
0.053748,
0.122328,
-0.238789,
0.116460,
-0.598687,
0.302203,
0.296484,
],
dtype=dtype,
).reshape(2, 4, 3, 3)
data = {
"logits": logits,
"targets": targets,
"logit_lengths": logit_lengths,
"target_lengths": target_lengths,
"blank": blank,
}
return data, ref_costs, ref_gradients
def get_random_data(
max_B=8,
max_T=128,
max_U=32,
max_D=40,
blank=-1,
dtype=torch.float32,
device=CPU_DEVICE,
seed=None,
):
if seed is not None:
torch.manual_seed(seed=seed)
if blank != -1:
raise ValueError("blank != -1 is not supported yet.")
random.seed(0)
B = random.randint(1, max_B - 1)
T = random.randint(5, max_T - 1)
U = random.randint(5, max_U - 1)
D = random.randint(2, max_D - 1)
logit_lengths = torch.randint(low=5, high=T + 1, size=(B,), dtype=torch.int32, device=device)
target_lengths = torch.randint(low=5, high=U + 1, size=(B,), dtype=torch.int32, device=device)
max_src_length = torch.max(logit_lengths)
max_tgt_length = torch.max(target_lengths)
targets = torch.randint(
low=0, high=D - 1, size=(B, max_tgt_length), dtype=torch.int32, device=device
)
logits = torch.rand(
size=(B, max_src_length, max_tgt_length + 1, D),
dtype=dtype,
device=device,
).requires_grad_(True)
def grad_hook(grad):
logits.saved_grad = grad.clone()
logits.register_hook(grad_hook)
return {
"logits": logits,
"targets": targets,
"logit_lengths": logit_lengths,
"target_lengths": target_lengths,
"blank": blank,
}
def skipIfNoRNNT(test_item):
try:
torch.ops.torchaudio.rnnt_loss
return test_item
    except RuntimeError:
        return unittest.skip("torchaudio C++ extension is not compiled with RNN transducer loss")(test_item)
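# Typical usage (illustrative): decorate a test so it is skipped when the
# extension was built without RNN-T support, e.g.
#
#   @skipIfNoRNNT
#   class TestRNNTLoss(unittest.TestCase):
#       ...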
import sys
import subprocess
import warnings
def get_encoding(dtype):
encodings = {
'float32': 'floating-point',
'int32': 'signed-integer',
'int16': 'signed-integer',
'uint8': 'unsigned-integer',
}
return encodings[dtype]
def get_bit_depth(dtype):
bit_depths = {
'float32': 32,
'int32': 32,
'int16': 16,
'uint8': 8,
}
return bit_depths[dtype]
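# These two helpers map a torch dtype name to the matching sox flags, e.g.
# for 'int16': --encoding signed-integer --bits 16.
#
#   assert get_encoding('int16') == 'signed-integer'
#   assert get_bit_depth('int16') == 16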
def gen_audio_file(
path, sample_rate, num_channels,
*, encoding=None, bit_depth=None, compression=None, attenuation=None, duration=1, comment_file=None,
):
"""Generate synthetic audio file with `sox` command."""
if path.endswith('.wav'):
warnings.warn('Use get_wav_data and save_wav to generate wav file for accurate result.')
    command = [
        'sox',
        '-V3',  # verbose
        '--no-dither',  # disable automatic dithering
        '-R',
        # -R is supposed to make the run repeatable, but the implementation
        # looks suspicious and does not set the seed to a fixed value.
        # https://fossies.org/dox/sox-14.4.2/sox_8c_source.html
        # (search for "sox_globals.repeatable")
    ]
if bit_depth is not None:
command += ['--bits', str(bit_depth)]
command += [
'--rate', str(sample_rate),
'--null', # no input
'--channels', str(num_channels),
]
if compression is not None:
command += ['--compression', str(compression)]
if bit_depth is not None:
command += ['--bits', str(bit_depth)]
if encoding is not None:
command += ['--encoding', str(encoding)]
if comment_file is not None:
command += ['--comment-file', str(comment_file)]
    command += [
        str(path),
        'synth', str(duration),  # synthesize for the given duration [sec]
        'sawtooth', '1',
        # A sawtooth covers both ends of the value range, which is a good
        # property for tests (similar to linspace(-1., 1.)); it also introduces
        # a bigger boundary effect than a sine when converted to mp3.
    ]
if attenuation is not None:
command += ['vol', f'-{attenuation}dB']
print(' '.join(command), file=sys.stderr)
subprocess.run(command, check=True)
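# Example invocation (hypothetical path/values): a 2-second, 2-channel,
# 16-bit signed 8 kHz file
#
#   gen_audio_file('/tmp/test.flac', sample_rate=8000, num_channels=2,
#                  encoding='signed-integer', bit_depth=16, duration=2)
#
# shells out to roughly:
#   sox -V3 --no-dither -R --bits 16 --rate 8000 --null --channels 2 \
#       --bits 16 --encoding signed-integer /tmp/test.flac synth 2 sawtooth 1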
def convert_audio_file(
src_path, dst_path,
*, encoding=None, bit_depth=None, compression=None):
"""Convert audio file with `sox` command."""
command = ['sox', '-V3', '--no-dither', '-R', str(src_path)]
if encoding is not None:
command += ['--encoding', str(encoding)]
if bit_depth is not None:
command += ['--bits', str(bit_depth)]
if compression is not None:
command += ['--compression', str(compression)]
command += [dst_path]
print(' '.join(command), file=sys.stderr)
subprocess.run(command, check=True)
def _flatten(effects):
if not effects:
return effects
if isinstance(effects[0], str):
return effects
return [item for sublist in effects for item in sublist]
def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None):
"""Run sox effects"""
    effect = _flatten(effect)
command = ['sox', '-V', '--no-dither', input_file]
if output_bitdepth:
command += ['--bits', str(output_bitdepth)]
command += [output_file] + effect
if output_sample_rate:
command += ['rate', str(output_sample_rate)]
print(' '.join(command))
subprocess.run(command, check=True)
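# Example (hypothetical paths): apply a gain effect and resample on output;
# the nested effect list is flattened by _flatten above:
#
#   run_sox_effect('in.wav', 'out.wav', [['gain', '-3'], ['dither']],
#                  output_sample_rate=8000, output_bitdepth=16)
#
# runs roughly: sox -V --no-dither in.wav --bits 16 out.wav gain -3 dither rate 8000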
from typing import Optional
import torch
import scipy.io.wavfile
def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
if tensor.dtype == torch.float32:
pass
elif tensor.dtype == torch.int32:
tensor = tensor.to(torch.float32)
tensor[tensor > 0] /= 2147483647.
tensor[tensor < 0] /= 2147483648.
elif tensor.dtype == torch.int16:
tensor = tensor.to(torch.float32)
tensor[tensor > 0] /= 32767.
tensor[tensor < 0] /= 32768.
elif tensor.dtype == torch.uint8:
tensor = tensor.to(torch.float32) - 128
tensor[tensor > 0] /= 127.
tensor[tensor < 0] /= 128.
return tensor
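# Worked example of the asymmetric scaling: int16 spans [-32768, 32767], so
# positive samples are divided by 32767 and negative ones by 32768, mapping
# both extremes exactly to +/-1.0:
#
#   normalize_wav(torch.tensor([-32768, 0, 32767], dtype=torch.int16))
#   # -> tensor([-1., 0., 1.])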
def get_wav_data(
dtype: str,
num_channels: int,
*,
num_frames: Optional[int] = None,
normalize: bool = True,
channels_first: bool = True,
):
"""Generate linear signal of the given dtype and num_channels
Data range is
[-1.0, 1.0] for float32,
[-2147483648, 2147483647] for int32
[-32768, 32767] for int16
[0, 255] for uint8
num_frames allow to change the linear interpolation parameter.
Default values are 256 for uint8, else 1 << 16.
1 << 16 as default is so that int16 value range is completely covered.
"""
dtype_ = getattr(torch, dtype)
if num_frames is None:
if dtype == 'uint8':
num_frames = 256
else:
num_frames = 1 << 16
if dtype == 'uint8':
base = torch.linspace(0, 255, num_frames, dtype=dtype_)
elif dtype == 'int8':
base = torch.linspace(-128, 127, num_frames, dtype=dtype_)
elif dtype == 'float32':
base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
elif dtype == 'float64':
base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
elif dtype == 'int32':
base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_)
elif dtype == 'int16':
base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_)
else:
raise NotImplementedError(f'Unsupported dtype {dtype}')
data = base.repeat([num_channels, 1])
if not channels_first:
data = data.transpose(1, 0)
if normalize:
data = normalize_wav(data)
return data
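# Example: a 2-channel int16 ramp with 5 frames, unnormalized and
# channels-first, is a (2, 5) tensor whose rows sweep the int16 range:
#
#   get_wav_data('int16', 2, num_frames=5, normalize=False)
#   # -> values close to [-32768, -16384, 0, 16383, 32767] in each row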
def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor:
"""Load wav file without torchaudio"""
sample_rate, data = scipy.io.wavfile.read(path)
data = torch.from_numpy(data.copy())
if data.ndim == 1:
data = data.unsqueeze(1)
if normalize:
data = normalize_wav(data)
if channels_first:
data = data.transpose(1, 0)
return data, sample_rate
def save_wav(path, data, sample_rate, channels_first=True):
"""Save wav file without torchaudio"""
if channels_first:
data = data.transpose(1, 0)
scipy.io.wavfile.write(path, sample_rate, data.numpy())
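# Round-trip sketch (hypothetical path): save_wav and load_wav are inverses
# for unnormalized integer data:
#
#   data = get_wav_data('int16', num_channels=2, normalize=False)
#   save_wav('/tmp/test.wav', data, 8000)
#   loaded, sample_rate = load_wav('/tmp/test.wav', normalize=False)
#   # loaded matches data and sample_rate == 8000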
import torch
import torchaudio.compliance.kaldi as kaldi
from torchaudio_unittest import common_utils
def extract_window(window, wave, f, frame_length, frame_shift, snip_edges):
    # A Python port of ExtractWindow from Kaldi's feature-window.cc
def first_sample_of_frame(frame, window_size, window_shift, snip_edges):
if snip_edges:
return frame * window_shift
else:
midpoint_of_frame = frame * window_shift + window_shift // 2
beginning_of_frame = midpoint_of_frame - window_size // 2
return beginning_of_frame
sample_offset = 0
num_samples = sample_offset + wave.size(0)
start_sample = first_sample_of_frame(f, frame_length, frame_shift, snip_edges)
end_sample = start_sample + frame_length
if snip_edges:
assert(start_sample >= sample_offset and end_sample <= num_samples)
else:
assert(sample_offset == 0 or start_sample >= sample_offset)
wave_start = start_sample - sample_offset
wave_end = wave_start + frame_length
if wave_start >= 0 and wave_end <= wave.size(0):
window[f, :] = wave[wave_start:(wave_start + frame_length)]
else:
wave_dim = wave.size(0)
for s in range(frame_length):
s_in_wave = s + wave_start
while s_in_wave < 0 or s_in_wave >= wave_dim:
if s_in_wave < 0:
s_in_wave = - s_in_wave - 1
else:
s_in_wave = 2 * wave_dim - 1 - s_in_wave
window[f, s] = wave[s_in_wave]
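# Out-of-range samples are filled by repeatedly reflecting the index at the
# waveform boundaries (Kaldi-style edge handling); e.g. for wave_dim = 4:
# index -1 -> 0, -2 -> 1, 4 -> 3, 5 -> 2.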
class Test_Kaldi(common_utils.TempDirMixin, common_utils.TorchaudioTestCase):
def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
waveform = torch.arange(num_samples).float()
output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)
# from NumFrames in feature-window.cc
n = window_size
if snip_edges:
m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
else:
m = (num_samples + (window_shift // 2)) // window_shift
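        # e.g. num_samples=10, window_size=3, window_shift=2 gives
        # m = 1 + (10 - 3) // 2 = 4 with snip_edges, else m = (10 + 1) // 2 = 5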
self.assertTrue(output.dim() == 2)
self.assertTrue(output.shape[0] == m and output.shape[1] == n)
window = torch.empty((m, window_size))
for r in range(m):
extract_window(window, waveform, r, window_size, window_shift, snip_edges)
self.assertEqual(window, output)
def test_get_strided(self):
# generate any combination where 0 < window_size <= num_samples and
# 0 < window_shift.
for num_samples in range(1, 20):
for window_size in range(1, num_samples + 1):
for window_shift in range(1, 2 * num_samples + 1):
for snip_edges in range(0, 2):
self._test_get_strided_helper(num_samples, window_size, window_shift, snip_edges)
def test_mfcc_empty(self):
# Passing in an empty tensor should result in an error
self.assertRaises(AssertionError, kaldi.mfcc, torch.empty(0))
import os
from pathlib import Path
from torchaudio.datasets import cmuarctic
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
save_wav,
normalize_wav,
)
def get_mock_dataset(root_dir):
"""
    root_dir: directory where the mocked dataset is stored
"""
mocked_data = []
sample_rate = 16000
transcript = "This is a test transcript."
base_dir = os.path.join(root_dir, "ARCTIC", "cmu_us_aew_arctic")
txt_dir = os.path.join(base_dir, "etc")
os.makedirs(txt_dir, exist_ok=True)
txt_file = os.path.join(txt_dir, "txt.done.data")
audio_dir = os.path.join(base_dir, "wav")
os.makedirs(audio_dir, exist_ok=True)
seed = 42
with open(txt_file, "w") as txt:
for c in ["a", "b"]:
for i in range(5):
utterance_id = f"arctic_{c}{i:04d}"
path = os.path.join(audio_dir, f"{utterance_id}.wav")
data = get_whitenoise(
sample_rate=sample_rate,
duration=3,
n_channels=1,
dtype="int16",
seed=seed,
)
save_wav(path, data, sample_rate)
sample = (
normalize_wav(data),
sample_rate,
transcript,
utterance_id.split("_")[1],
)
mocked_data.append(sample)
txt.write(f'( {utterance_id} "{transcript}" )\n')
seed += 1
return mocked_data
class TestCMUARCTIC(TempDirMixin, TorchaudioTestCase):
backend = "default"
root_dir = None
samples = []
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
cls.samples = get_mock_dataset(cls.root_dir)
def _test_cmuarctic(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, transcript, utterance_id) in enumerate(dataset):
expected_sample = self.samples[i]
assert sample_rate == expected_sample[1]
assert transcript == expected_sample[2]
assert utterance_id == expected_sample[3]
self.assertEqual(expected_sample[0], waveform, atol=5e-5, rtol=1e-8)
n_ite += 1
assert n_ite == len(self.samples)
def test_cmuarctic_str(self):
dataset = cmuarctic.CMUARCTIC(self.root_dir)
self._test_cmuarctic(dataset)
def test_cmuarctic_path(self):
dataset = cmuarctic.CMUARCTIC(Path(self.root_dir))
self._test_cmuarctic(dataset)
import os
from pathlib import Path
from torchaudio.datasets import CMUDict
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
)
def get_mock_dataset(root_dir, return_punc=False):
"""
    root_dir: directory where the mocked dataset is stored
"""
header = [
";;; # CMUdict -- Major Version: 0.07",
";;; ",
";;; # $HeadURL$",
]
puncs = [
"!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
"\"CLOSE-QUOTE K L OW1 Z K W OW1 T",
"#HASH-MARK HH AE1 M AA2 R K",
"%PERCENT P ER0 S EH1 N T",
"&AMPERSAND AE1 M P ER0 S AE2 N D",
"'END-INNER-QUOTE EH1 N D IH1 N ER0 K W OW1 T",
"(BEGIN-PARENS B IH0 G IH1 N P ER0 EH1 N Z",
")CLOSE-PAREN K L OW1 Z P ER0 EH1 N",
"+PLUS P L UH1 S",
",COMMA K AA1 M AH0",
"--DASH D AE1 SH",
"!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
"/SLASH S L AE1 SH",
":COLON K OW1 L AH0 N",
";SEMI-COLON S EH1 M IY0 K OW1 L AH0 N",
"?QUESTION-MARK K W EH1 S CH AH0 N M AA1 R K",
"{BRACE B R EY1 S",
"}CLOSE-BRACE K L OW1 Z B R EY1 S",
"...ELLIPSIS IH2 L IH1 P S IH0 S",
]
punc_outputs = [
"!",
"\"",
"#",
"%",
"&",
"'",
"(",
")",
"+",
",",
"--",
"!",
"/",
":",
";",
"?",
"{",
"}",
"...",
]
words = [
"3-D TH R IY1 D IY2",
"'BOUT B AW1 T",
"'CAUSE K AH0 Z",
"'TWAS T W AH1 Z",
"A AH0",
"B B IY1",
"C S IY1",
"D D IY1",
"E IY1",
"F EH1 F",
"G JH IY1",
"H EY1 CH",
"I AY1",
"J JH EY1",
"K K EY1",
"L EH1 L",
"M EH1 M",
"N EH1 N",
"O OW1",
"P P IY1",
"Q K Y UW1",
"R AA1 R",
"S EH1 S",
"T T IY1",
"U Y UW1",
"V V IY1",
"X EH1 K S",
"Y W AY1",
"Z Z IY1",
]
mocked_symbols = [
"AA1",
"AA2",
"AE1",
"AE2",
"AH0",
"AH1",
"AY1",
"B",
"CH",
"D",
"EH1",
"EH2",
"ER0",
"EY1",
"F",
"G",
"HH",
"IH0",
"IH1",
"IY0",
"IY1",
"IY2",
"JH",
"K",
"L",
"M",
"N",
"OW1",
"OY2",
"P",
"R",
"S",
"SH",
"T",
"TH",
"UH1",
"UW0",
"UW1",
"V",
"W",
"Y",
"Z",
]
dict_file = os.path.join(root_dir, "cmudict-0.7b")
symbol_file = os.path.join(root_dir, "cmudict-0.7b.symbols")
with open(dict_file, "w") as fileobj:
for section in [header, puncs, words]:
for line in section:
fileobj.write(line)
fileobj.write("\n")
with open(symbol_file, "w") as txt:
txt.write("\n".join(mocked_symbols))
mocked_data = []
if return_punc:
for i, ent in enumerate(puncs):
_, phones = ent.split(" ")
mocked_data.append((punc_outputs[i], phones.split(" ")))
for ent in words:
word, phones = ent.split(" ")
mocked_data.append((word, phones.split(" ")))
return mocked_data
class TestCMUDict(TempDirMixin, TorchaudioTestCase):
root_dir = None
root_punc_dir = None
samples = []
punc_samples = []
@classmethod
def setUpClass(cls):
cls.root_dir = os.path.join(cls.get_base_temp_dir(), "normal")
os.mkdir(cls.root_dir)
cls.samples = get_mock_dataset(cls.root_dir)
cls.root_punc_dir = os.path.join(cls.get_base_temp_dir(), "punc")
os.mkdir(cls.root_punc_dir)
cls.punc_samples = get_mock_dataset(cls.root_punc_dir, return_punc=True)
def _test_cmudict(self, dataset):
"""Test if the dataset is reading the mocked data correctly."""
n_item = 0
for i, (word, phones) in enumerate(dataset):
expected_word, expected_phones = self.samples[i]
assert word == expected_word
assert phones == expected_phones
n_item += 1
assert n_item == len(self.samples)
def _test_punc_cmudict(self, dataset):
"""Test if the dataset is reading the mocked data with punctuations correctly."""
n_item = 0
for i, (word, phones) in enumerate(dataset):
expected_word, expected_phones = self.punc_samples[i]
assert word == expected_word
assert phones == expected_phones
n_item += 1
assert n_item == len(self.punc_samples)
def test_cmuarctic_path_with_punctuation(self):
dataset = CMUDict(Path(self.root_punc_dir), exclude_punctuations=False)
self._test_punc_cmudict(dataset)
def test_cmuarctic_str_with_punctuation(self):
dataset = CMUDict(self.root_punc_dir, exclude_punctuations=False)
self._test_punc_cmudict(dataset)
def test_cmuarctic_path(self):
dataset = CMUDict(Path(self.root_punc_dir), exclude_punctuations=True)
self._test_cmudict(dataset)
def test_cmuarctic_str(self):
dataset = CMUDict(self.root_punc_dir, exclude_punctuations=True)
self._test_cmudict(dataset)
import csv
import os
from pathlib import Path
from typing import Dict, List, Tuple
from torch import Tensor
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
save_wav,
normalize_wav,
)
from torchaudio.datasets import COMMONVOICE
_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
_SAMPLE_RATE = 48000
_HEADERS = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
_EN_TRAIN_CSV_CONTENTS = [
["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
"common_voice_en_18885784.wav",
"He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.", "2", "0", "", "",
""],
["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
"common_voice_en_556542.wav", "Once more into the breach", "2", "0", "thirties", "male", "us"],
["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
"common_voice_en_18607573.wav",
"Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
]
_FR_TRAIN_CSV_CONTENTS = [
[
"a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
"18343441c601cae0597a4b0d3144",
"89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
"16cebac98ee5349e3e8262cb9329",
"Or sur ce point nous n’avons aucune réponse de votre part.", "2", "0", "twenties", "male", "france"],
[
"a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
"343441c601cae0597a4b0d3144",
"87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
"cbd395acbdfcfa9d76a6e199bbd",
"Monsieur de La Verpillière, laissez parler le ministre", "2", "0", "twenties", "male", "france"],
]
def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """
    Prepares the mocked dataset.
    """
mocked_data = []
    # Note: the audio extension is changed to wav for the sake of the test.
    # Note: the first entry is missing values for `age`, `gender` and `accent`, as in the original data.
    # A different tsv filename does not imply a different subset; the dataset is tested as a whole here.
tsv_filename = os.path.join(root_dir, "train.tsv")
audio_base_path = os.path.join(root_dir, "clips")
os.makedirs(audio_base_path, exist_ok=True)
with open(tsv_filename, "w", newline='') as tsv:
writer = csv.writer(tsv, delimiter='\t')
writer.writerow(_HEADERS)
for i, content in enumerate(train_csv_contents):
content[2] = str(content[2].encode("utf-8"))
writer.writerow(content)
if not content[1].endswith(ext_audio):
audio_path = os.path.join(audio_base_path, content[1] + ext_audio)
else:
audio_path = os.path.join(audio_base_path, content[1])
data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
save_wav(audio_path, data, _SAMPLE_RATE)
# Append data entry
mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
return mocked_data
def get_mock_dataset_en(root_dir, ext_audio) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """
    Prepares the English mocked dataset.
    """
return get_mock_dataset(root_dir, _EN_TRAIN_CSV_CONTENTS, ext_audio)
def get_mock_dataset_fr(root_dir, ext_audio) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """
    Prepares the French mocked dataset.
    """
return get_mock_dataset(root_dir, _FR_TRAIN_CSV_CONTENTS, ext_audio)
class BaseTestCommonVoice(TempDirMixin):
root_dir = None
data = None
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.root_dir = cls.get_base_temp_dir()
COMMONVOICE._ext_audio = ".wav"
@classmethod
def tearDownClass(cls):
super().tearDownClass()
COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO
def _test_commonvoice(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
expected_dictionary = self.data[i][2]
expected_data = self.data[i][0]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
assert sample_rate == _SAMPLE_RATE
assert dictionary == expected_dictionary
n_ite += 1
assert n_ite == len(self.data)
class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
backend = 'default'
root_dir = None
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.data = get_mock_dataset_en(cls.root_dir, COMMONVOICE._ext_audio)
def test_commonvoice_str(self):
dataset = COMMONVOICE(self.root_dir)
self._test_commonvoice(dataset)
def test_commonvoice_path(self):
dataset = COMMONVOICE(Path(self.root_dir))
self._test_commonvoice(dataset)
class TestCommonVoiceFR(BaseTestCommonVoice, TorchaudioTestCase):
backend = 'default'
root_dir = None
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.data = get_mock_dataset_fr(cls.root_dir, COMMONVOICE._ext_audio)
def test_commonvoice_str(self):
dataset = COMMONVOICE(self.root_dir)
self._test_commonvoice(dataset)
from torchaudio.datasets.vctk import VCTK
from torchaudio_unittest.common_utils import (
TorchaudioTestCase,
get_asset_path,
)
class TestDatasets(TorchaudioTestCase):
backend = 'default'
path = get_asset_path()
def test_vctk(self):
data = VCTK(self.path)
data[0]
import os
from pathlib import Path
from torchaudio.datasets import gtzan
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
save_wav,
normalize_wav,
)
def get_mock_dataset(root_dir):
"""
    root_dir: directory where the mocked dataset is stored
"""
mocked_samples = []
mocked_training = []
mocked_validation = []
mocked_testing = []
sample_rate = 22050
seed = 0
for genre in gtzan.gtzan_genres:
base_dir = os.path.join(root_dir, 'genres', genre)
os.makedirs(base_dir, exist_ok=True)
for i in range(100):
filename = f'{genre}.{i:05d}'
path = os.path.join(base_dir, f'{filename}.wav')
data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='int16', seed=seed)
save_wav(path, data, sample_rate)
sample = (normalize_wav(data), sample_rate, genre)
mocked_samples.append(sample)
if filename in gtzan.filtered_test:
mocked_testing.append(sample)
if filename in gtzan.filtered_train:
mocked_training.append(sample)
if filename in gtzan.filtered_valid:
mocked_validation.append(sample)
seed += 1
return (mocked_samples, mocked_training, mocked_validation, mocked_testing)
class TestGTZAN(TempDirMixin, TorchaudioTestCase):
backend = 'default'
root_dir = None
samples = []
training = []
validation = []
testing = []
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
mocked_data = get_mock_dataset(cls.root_dir)
cls.samples = mocked_data[0]
cls.training = mocked_data[1]
cls.validation = mocked_data[2]
cls.testing = mocked_data[3]
def test_no_subset(self):
dataset = gtzan.GTZAN(self.root_dir)
n_ite = 0
for i, (waveform, sample_rate, label) in enumerate(dataset):
self.assertEqual(waveform, self.samples[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.samples[i][1]
assert label == self.samples[i][2]
n_ite += 1
assert n_ite == len(self.samples)
def _test_training(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, label) in enumerate(dataset):
self.assertEqual(waveform, self.training[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.training[i][1]
assert label == self.training[i][2]
n_ite += 1
assert n_ite == len(self.training)
def _test_validation(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, label) in enumerate(dataset):
self.assertEqual(waveform, self.validation[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.validation[i][1]
assert label == self.validation[i][2]
n_ite += 1
assert n_ite == len(self.validation)
def _test_testing(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, label) in enumerate(dataset):
self.assertEqual(waveform, self.testing[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.testing[i][1]
assert label == self.testing[i][2]
n_ite += 1
assert n_ite == len(self.testing)
def test_training_str(self):
train_dataset = gtzan.GTZAN(self.root_dir, subset='training')
self._test_training(train_dataset)
def test_validation_str(self):
val_dataset = gtzan.GTZAN(self.root_dir, subset='validation')
self._test_validation(val_dataset)
def test_testing_str(self):
test_dataset = gtzan.GTZAN(self.root_dir, subset='testing')
self._test_testing(test_dataset)
def test_training_path(self):
root_dir = Path(self.root_dir)
train_dataset = gtzan.GTZAN(root_dir, subset='training')
self._test_training(train_dataset)
def test_validation_path(self):
root_dir = Path(self.root_dir)
val_dataset = gtzan.GTZAN(root_dir, subset='validation')
self._test_validation(val_dataset)
def test_testing_path(self):
root_dir = Path(self.root_dir)
test_dataset = gtzan.GTZAN(root_dir, subset='testing')
self._test_testing(test_dataset)
import os
from pathlib import Path
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
save_wav,
normalize_wav,
)
from torchaudio.datasets import librispeech
# Used to generate a unique transcript for each dummy audio file
_NUMBERS = [
'ZERO',
'ONE',
'TWO',
'THREE',
'FOUR',
'FIVE',
'SIX',
'SEVEN',
'EIGHT',
'NINE'
]
def get_mock_dataset(root_dir):
"""
    root_dir: directory where the mocked dataset is stored
"""
mocked_data = []
dataset_dir = os.path.join(
root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL
)
os.makedirs(dataset_dir, exist_ok=True)
sample_rate = 16000 # 16kHz
seed = 0
for speaker_id in range(5):
speaker_path = os.path.join(dataset_dir, str(speaker_id))
os.makedirs(speaker_path, exist_ok=True)
for chapter_id in range(3):
chapter_path = os.path.join(speaker_path, str(chapter_id))
os.makedirs(chapter_path, exist_ok=True)
trans_content = []
for utterance_id in range(10):
filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav'
path = os.path.join(chapter_path, filename)
transcript = ' '.join(
[_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]
)
trans_content.append(
f'{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}'
)
data = get_whitenoise(
sample_rate=sample_rate,
duration=0.01,
n_channels=1,
dtype='float32',
seed=seed
)
save_wav(path, data, sample_rate)
sample = (
normalize_wav(data),
sample_rate,
transcript,
speaker_id,
chapter_id,
utterance_id
)
mocked_data.append(sample)
seed += 1
trans_filename = f'{speaker_id}-{chapter_id}.trans.txt'
trans_path = os.path.join(chapter_path, trans_filename)
with open(trans_path, 'w') as f:
f.write('\n'.join(trans_content))
return mocked_data
class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
backend = 'default'
root_dir = None
samples = []
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
cls.samples = get_mock_dataset(cls.root_dir)
@classmethod
def tearDownClass(cls):
        # Restore the default audio extension in case a test failed before resetting it
librispeech.LIBRISPEECH._ext_audio = '.flac'
def _test_librispeech(self, dataset):
num_samples = 0
for i, (
data, sample_rate, transcript, speaker_id, chapter_id, utterance_id
) in enumerate(dataset):
self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.samples[i][1]
assert transcript == self.samples[i][2]
assert speaker_id == self.samples[i][3]
assert chapter_id == self.samples[i][4]
assert utterance_id == self.samples[i][5]
num_samples += 1
assert num_samples == len(self.samples)
librispeech.LIBRISPEECH._ext_audio = '.flac'
def test_librispeech_str(self):
librispeech.LIBRISPEECH._ext_audio = '.wav'
dataset = librispeech.LIBRISPEECH(self.root_dir)
self._test_librispeech(dataset)
def test_librispeech_path(self):
librispeech.LIBRISPEECH._ext_audio = '.wav'
dataset = librispeech.LIBRISPEECH(Path(self.root_dir))
self._test_librispeech(dataset)
import os
from pathlib import Path
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
save_wav,
normalize_wav,
)
from torchaudio.datasets.libritts import LIBRITTS
_UTTERANCE_IDS = [
[19, 198, '000000', '000000'],
[26, 495, '000004', '000000'],
]
_ORIGINAL_TEXT = 'this is the original text.'
_NORMALIZED_TEXT = 'this is the normalized text.'
def get_mock_dataset(root_dir):
"""
    root_dir: directory where the mocked dataset is stored
"""
mocked_data = []
base_dir = os.path.join(root_dir, 'LibriTTS', 'train-clean-100')
for i, utterance_id in enumerate(_UTTERANCE_IDS):
filename = f'{"_".join(str(u) for u in utterance_id)}.wav'
file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
os.makedirs(file_dir, exist_ok=True)
path = os.path.join(file_dir, filename)
data = get_whitenoise(sample_rate=24000, duration=2, n_channels=1, dtype='int16', seed=i)
save_wav(path, data, 24000)
mocked_data.append(normalize_wav(data))
original_text_filename = f'{"_".join(str(u) for u in utterance_id)}.original.txt'
path_original = os.path.join(file_dir, original_text_filename)
with open(path_original, 'w') as file_:
file_.write(_ORIGINAL_TEXT)
normalized_text_filename = f'{"_".join(str(u) for u in utterance_id)}.normalized.txt'
path_normalized = os.path.join(file_dir, normalized_text_filename)
with open(path_normalized, 'w') as file_:
file_.write(_NORMALIZED_TEXT)
return mocked_data, _UTTERANCE_IDS, _ORIGINAL_TEXT, _NORMALIZED_TEXT
class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
backend = 'default'
root_dir = None
data = []
_utterance_ids, _original_text, _normalized_text = [], [], []
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
cls.data, cls._utterance_ids, cls._original_text, cls._normalized_text = get_mock_dataset(cls.root_dir)
def _test_libritts(self, dataset):
n_ites = 0
for i, (waveform,
sample_rate,
original_text,
normalized_text,
speaker_id,
chapter_id,
utterance_id) in enumerate(dataset):
expected_ids = self._utterance_ids[i]
expected_data = self.data[i]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
assert sample_rate == 24000
assert speaker_id == expected_ids[0]
assert chapter_id == expected_ids[1]
assert original_text == self._original_text
assert normalized_text == self._normalized_text
assert utterance_id == f'{"_".join(str(u) for u in expected_ids[-4:])}'
n_ites += 1
assert n_ites == len(self._utterance_ids)
def test_libritts_str(self):
dataset = LIBRITTS(self.root_dir)
self._test_libritts(dataset)
def test_libritts_path(self):
dataset = LIBRITTS(Path(self.root_dir))
self._test_libritts(dataset)
import csv
import os
from pathlib import Path
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
normalize_wav,
save_wav,
)
from torchaudio.datasets import ljspeech
_TRANSCRIPTS = [
"Test transcript 1",
"Test transcript 2",
"Test transcript 3",
"In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,"
]
_NORMALIZED_TRANSCRIPT = [
"Test transcript one",
"Test transcript two",
"Test transcript three",
"In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,"
]
def get_mock_dataset(root_dir):
"""
root_dir: path to the mocked dataset
"""
mocked_data = []
base_dir = os.path.join(root_dir, "LJSpeech-1.1")
archive_dir = os.path.join(base_dir, "wavs")
os.makedirs(archive_dir, exist_ok=True)
metadata_path = os.path.join(base_dir, "metadata.csv")
sample_rate = 22050
with open(metadata_path, mode="w", newline='') as metadata_file:
metadata_writer = csv.writer(
metadata_file, delimiter="|", quoting=csv.QUOTE_NONE
)
for i, (transcript, normalized_transcript) in enumerate(
zip(_TRANSCRIPTS, _NORMALIZED_TRANSCRIPT)
):
fileid = f'LJ001-{i:04d}'
metadata_writer.writerow([fileid, transcript, normalized_transcript])
filename = fileid + ".wav"
path = os.path.join(archive_dir, filename)
data = get_whitenoise(
sample_rate=sample_rate, duration=1, n_channels=1, dtype="int16", seed=i
)
save_wav(path, data, sample_rate)
mocked_data.append(normalize_wav(data))
return mocked_data, _TRANSCRIPTS, _NORMALIZED_TRANSCRIPT
class TestLJSpeech(TempDirMixin, TorchaudioTestCase):
backend = "default"
root_dir = None
data, _transcripts, _normalized_transcript = [], [], []
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
cls.data, cls._transcripts, cls._normalized_transcript = get_mock_dataset(cls.root_dir)
def _test_ljspeech(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, transcript, normalized_transcript) in enumerate(
dataset
):
expected_transcript = self._transcripts[i]
expected_normalized_transcript = self._normalized_transcript[i]
expected_data = self.data[i]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 22050  # sample rate used by get_mock_dataset
assert transcript == expected_transcript
assert normalized_transcript == expected_normalized_transcript
n_ite += 1
assert n_ite == len(self.data)
def test_ljspeech_str(self):
dataset = ljspeech.LJSPEECH(self.root_dir)
self._test_ljspeech(dataset)
def test_ljspeech_path(self):
dataset = ljspeech.LJSPEECH(Path(self.root_dir))
self._test_ljspeech(dataset)
import os
from pathlib import Path
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
normalize_wav,
save_wav,
)
from torchaudio.datasets import speechcommands
_LABELS = [
"bed",
"bird",
"cat",
"dog",
"down",
"eight",
"five",
"follow",
"forward",
"four",
"go",
"happy",
"house",
"learn",
"left",
"marvin",
"nine",
"no",
"off",
"on",
"one",
"right",
"seven",
"sheila",
"six",
"stop",
"three",
"tree",
"two",
"up",
"visual",
"wow",
"yes",
"zero",
]
def get_mock_dataset(dataset_dir):
"""
    dataset_dir: directory where the mocked dataset is stored
"""
mocked_samples = []
mocked_train_samples = []
mocked_valid_samples = []
mocked_test_samples = []
os.makedirs(dataset_dir, exist_ok=True)
sample_rate = 16000 # 16kHz sample rate
seed = 0
valid_file = os.path.join(dataset_dir, "validation_list.txt")
test_file = os.path.join(dataset_dir, "testing_list.txt")
with open(valid_file, "w") as valid, open(test_file, "w") as test:
for label in _LABELS:
path = os.path.join(dataset_dir, label)
os.makedirs(path, exist_ok=True)
for j in range(6):
# generate hash ID for speaker
speaker = "{:08x}".format(j)
for utterance in range(3):
filename = f"{speaker}{speechcommands.HASH_DIVIDER}{utterance}.wav"
file_path = os.path.join(path, filename)
seed += 1
data = get_whitenoise(
sample_rate=sample_rate,
duration=0.01,
n_channels=1,
dtype="int16",
seed=seed,
)
save_wav(file_path, data, sample_rate)
sample = (
normalize_wav(data),
sample_rate,
label,
speaker,
utterance,
)
mocked_samples.append(sample)
if j < 2:
mocked_train_samples.append(sample)
elif j < 4:
valid.write(f'{label}/{filename}\n')
mocked_valid_samples.append(sample)
elif j < 6:
test.write(f'{label}/{filename}\n')
mocked_test_samples.append(sample)
return mocked_samples, mocked_train_samples, mocked_valid_samples, mocked_test_samples
class TestSpeechCommands(TempDirMixin, TorchaudioTestCase):
backend = "default"
root_dir = None
samples = []
train_samples = []
valid_samples = []
test_samples = []
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
dataset_dir = os.path.join(
cls.root_dir, speechcommands.FOLDER_IN_ARCHIVE, speechcommands.URL
)
cls.samples, cls.train_samples, cls.valid_samples, cls.test_samples = get_mock_dataset(dataset_dir)
def _testSpeechCommands(self, dataset, data_samples):
num_samples = 0
for i, (data, sample_rate, label, speaker_id, utterance_number) in enumerate(
dataset
):
self.assertEqual(data, data_samples[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == data_samples[i][1]
assert label == data_samples[i][2]
assert speaker_id == data_samples[i][3]
assert utterance_number == data_samples[i][4]
num_samples += 1
assert num_samples == len(data_samples)
def testSpeechCommands_str(self):
dataset = speechcommands.SPEECHCOMMANDS(self.root_dir)
self._testSpeechCommands(dataset, self.samples)
def testSpeechCommands_path(self):
dataset = speechcommands.SPEECHCOMMANDS(Path(self.root_dir))
self._testSpeechCommands(dataset, self.samples)
def testSpeechCommandsSubsetTrain(self):
dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="training")
self._testSpeechCommands(dataset, self.train_samples)
def testSpeechCommandsSubsetValid(self):
dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
self._testSpeechCommands(dataset, self.valid_samples)
def testSpeechCommandsSubsetTest(self):
dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")
self._testSpeechCommands(dataset, self.test_samples)
def testSpeechCommandsSum(self):
dataset_all = speechcommands.SPEECHCOMMANDS(self.root_dir)
dataset_train = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="training")
dataset_valid = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
dataset_test = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")
assert len(dataset_train) + len(dataset_valid) + len(dataset_test) == len(dataset_all)
import os
import platform
from pathlib import Path
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
save_wav,
skipIfNoSox
)
from torchaudio.datasets import tedlium
# Used to generate a unique utterance for each dummy audio file
_UTTERANCES = [
"AaronHuey_2010X 1 AaronHuey_2010X 0.0 2.0 <o,f0,female> script1\n",
"AaronHuey_2010X 1 AaronHuey_2010X 2.0 4.0 <o,f0,female> script2\n",
"AaronHuey_2010X 1 AaronHuey_2010X 4.0 6.0 <o,f0,female> script3\n",
"AaronHuey_2010X 1 AaronHuey_2010X 6.0 8.0 <o,f0,female> script4\n",
"AaronHuey_2010X 1 AaronHuey_2010X 8.0 10.0 <o,f0,female> script5\n",
]
_PHONEME = [
"a AH",
"a(2) EY",
"aachen AA K AH N",
"aad AE D",
"aaden EY D AH N",
"aadmi AE D M IY",
"aae EY EY",
]
def get_mock_dataset(dataset_dir):
"""
dataset_dir: directory of the mocked dataset
"""
mocked_samples = {}
os.makedirs(dataset_dir, exist_ok=True)
sample_rate = 16000 # 16kHz
seed = 0
for release in ["release1", "release2", "release3"]:
data = get_whitenoise(sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed)
if release in ["release1", "release2"]:
release_dir = os.path.join(
dataset_dir,
tedlium._RELEASE_CONFIGS[release]["folder_in_archive"],
tedlium._RELEASE_CONFIGS[release]["subset"],
)
else:
release_dir = os.path.join(
dataset_dir,
tedlium._RELEASE_CONFIGS[release]["folder_in_archive"],
tedlium._RELEASE_CONFIGS[release]["data_path"],
)
os.makedirs(release_dir, exist_ok=True)
os.makedirs(os.path.join(release_dir, "stm"), exist_ok=True) # Subfolder for transcripts
os.makedirs(os.path.join(release_dir, "sph"), exist_ok=True) # Subfolder for audio files
filename = f"{release}.sph"
path = os.path.join(os.path.join(release_dir, "sph"), filename)
save_wav(path, data, sample_rate)
trans_filename = f"{release}.stm"
trans_path = os.path.join(os.path.join(release_dir, "stm"), trans_filename)
with open(trans_path, "w") as f:
f.write("".join(_UTTERANCES))
dict_filename = f"{release}.dic"
dict_path = os.path.join(release_dir, dict_filename)
with open(dict_path, "w") as f:
f.write("\n".join(_PHONEME))
# Create a samples list to compare with
mocked_samples[release] = []
for utterance in _UTTERANCES:
talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6)
start_time = int(float(start_time)) * sample_rate
end_time = int(float(end_time)) * sample_rate
sample = (
data[:, start_time:end_time],
sample_rate,
transcript,
talk_id,
speaker_id,
identifier,
)
mocked_samples[release].append(sample)
seed += 1
return mocked_samples
class Tedlium(TempDirMixin):
root_dir = None
samples = {}
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
cls.root_dir = dataset_dir = os.path.join(cls.root_dir, "tedlium")
cls.samples = get_mock_dataset(dataset_dir)
def _test_tedlium(self, dataset, release):
num_samples = 0
for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset):
self.assertEqual(data, self.samples[release][i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.samples[release][i][1]
assert transcript == self.samples[release][i][2]
assert talk_id == self.samples[release][i][3]
assert speaker_id == self.samples[release][i][4]
assert identifier == self.samples[release][i][5]
num_samples += 1
assert num_samples == len(self.samples[release])
dataset._dict_path = os.path.join(dataset._path, f"{release}.dic")
phoneme_dict = dataset.phoneme_dict
        phonemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()]
        assert phonemes == _PHONEME
def test_tedlium_release1_str(self):
release = "release1"
dataset = tedlium.TEDLIUM(self.root_dir, release=release)
self._test_tedlium(dataset, release)
def test_tedlium_release1_path(self):
release = "release1"
dataset = tedlium.TEDLIUM(Path(self.root_dir), release=release)
self._test_tedlium(dataset, release)
def test_tedlium_release2(self):
release = "release2"
dataset = tedlium.TEDLIUM(self.root_dir, release=release)
self._test_tedlium(dataset, release)
def test_tedlium_release3(self):
release = "release3"
dataset = tedlium.TEDLIUM(self.root_dir, release=release)
self._test_tedlium(dataset, release)
class TestTedliumSoundfile(Tedlium, TorchaudioTestCase):
backend = "soundfile"
if platform.system() != "Windows":
@skipIfNoSox
class TestTedliumSoxIO(Tedlium, TorchaudioTestCase):
backend = "sox_io"
import torch
from torchaudio_unittest.common_utils import (
TorchaudioTestCase,
TempDirMixin
)
from torchaudio.datasets import utils as dataset_utils
class Dataset(torch.utils.data.Dataset):
def __getitem__(self, n):
sample_rate = 8000
waveform = n * torch.ones(2, 256)
return waveform, sample_rate
def __len__(self) -> int:
return 2
def __iter__(self):
for i in range(len(self)):
yield self[i]
class TestIterator(TorchaudioTestCase, TempDirMixin):
backend = 'default'
    def test_diskcache_iterator(self):
data = dataset_utils.diskcache_iterator(Dataset(), self.get_base_temp_dir())
# Save
data[0]
# Load
data[0]
def test_bg_iterator(self):
data = dataset_utils.bg_iterator(Dataset(), 5)
for _ in data:
pass
import os
from pathlib import Path
from torchaudio.datasets import vctk
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
save_wav,
normalize_wav,
)
# Used to generate a unique transcript for each dummy audio file
_TRANSCRIPT = [
'Please call Stella',
'Ask her to bring these things',
'with her from the store',
'Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob',
'We also need a small plastic snake and a big toy frog for the kids',
'She can scoop these things into three red bags, and we will go meet her Wednesday at the train station',
'When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow',
'The rainbow is a division of white light into many beautiful colors',
'These take the shape of a long round arch, with its path high above, and its two ends \
apparently beyond the horizon',
'There is, according to legend, a boiling pot of gold at one end'
]
def get_mock_dataset(root_dir):
"""
root_dir: root directory of the mocked data
"""
mocked_samples = []
dataset_dir = os.path.join(root_dir, 'VCTK-Corpus-0.92')
os.makedirs(dataset_dir, exist_ok=True)
sample_rate = 48000
seed = 0
for speaker in range(225, 230):
speaker_id = 'p' + str(speaker)
audio_dir = os.path.join(dataset_dir, 'wav48_silence_trimmed', speaker_id)
os.makedirs(audio_dir, exist_ok=True)
file_dir = os.path.join(dataset_dir, 'txt', speaker_id)
os.makedirs(file_dir, exist_ok=True)
for utterance_id in range(1, 11):
filename = f'{speaker_id}_{utterance_id:03d}_mic2'
audio_file_path = os.path.join(audio_dir, filename + '.wav')
data = get_whitenoise(
sample_rate=sample_rate,
duration=0.01,
n_channels=1,
dtype='float32',
seed=seed
)
save_wav(audio_file_path, data, sample_rate)
txt_file_path = os.path.join(file_dir, filename[:-5] + '.txt')
transcript = _TRANSCRIPT[utterance_id - 1]
with open(txt_file_path, 'w') as f:
f.write(transcript)
sample = (
normalize_wav(data),
sample_rate,
transcript,
speaker_id,
utterance_id
)
mocked_samples.append(sample)
seed += 1
return mocked_samples
class TestVCTK(TempDirMixin, TorchaudioTestCase):
backend = 'default'
root_dir = None
samples = []
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
cls.samples = get_mock_dataset(cls.root_dir)
def _test_vctk(self, dataset):
num_samples = 0
for i, (data, sample_rate, transcript, speaker_id, utterance_id) in enumerate(dataset):
self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.samples[i][1]
assert transcript == self.samples[i][2]
assert speaker_id == self.samples[i][3]
assert int(utterance_id) == self.samples[i][4]
num_samples += 1
assert num_samples == len(self.samples)
def test_vctk_str(self):
dataset = vctk.VCTK_092(self.root_dir, audio_ext=".wav")
self._test_vctk(dataset)
def test_vctk_path(self):
dataset = vctk.VCTK_092(Path(self.root_dir), audio_ext=".wav")
self._test_vctk(dataset)
import os
from pathlib import Path
from torchaudio.datasets import yesno
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
get_whitenoise,
save_wav,
normalize_wav,
)
def get_mock_data(root_dir, labels):
"""
root_dir: path
labels: list of labels
"""
mocked_data = []
base_dir = os.path.join(root_dir, 'waves_yesno')
os.makedirs(base_dir, exist_ok=True)
for i, label in enumerate(labels):
filename = f'{"_".join(str(l) for l in label)}.wav'
path = os.path.join(base_dir, filename)
data = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=i)
save_wav(path, data, 8000)
mocked_data.append(normalize_wav(data))
return mocked_data
class TestYesNo(TempDirMixin, TorchaudioTestCase):
backend = 'default'
root_dir = None
data = []
labels = [
[0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 1, 1, 1],
[0, 1, 0, 1, 0, 1, 1, 0],
[1, 1, 1, 1, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1],
]
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
cls.data = get_mock_data(cls.root_dir, cls.labels)
def _test_yesno(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, label) in enumerate(dataset):
expected_label = self.labels[i]
expected_data = self.data[i]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
assert sample_rate == 8000
assert label == expected_label
n_ite += 1
assert n_ite == len(self.data)
def test_yesno_str(self):
dataset = yesno.YESNO(self.root_dir)
self._test_yesno(dataset)
def test_yesno_path(self):
dataset = yesno.YESNO(Path(self.root_dir))
self._test_yesno(dataset)
import os
import sys
sys.path.append(
os.path.join(
os.path.dirname(__file__),
'..', '..', '..', 'examples'))