Commit 5859923a authored by Joao Gomes, committed by Facebook GitHub Bot

Apply arc lint to pytorch audio (#2096)

Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/2096

run: `arc lint --apply-patches --paths-cmd 'hg files -I "./**/*.py"'`
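The resulting changes are mechanical formatting only (quote normalization, argument re-wrapping); a representative before/after pair taken from the diff below:

    # before
    command += ['--bits', str(bit_depth)]
    # after
    command += ["--bits", str(bit_depth)]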

Reviewed By: mthrok

Differential Revision: D33297351

fbshipit-source-id: 7bf5956edf0717c5ca90219f72414ff4eeaf5aa8
parent 0e5913d5
from .data_utils import (
get_asset_path,
get_whitenoise,
get_sinusoid,
get_spectrogram,
)
from .backend_utils import (
set_audio_backend,
)
@@ -21,43 +15,46 @@ from .case_utils import (
skipIfRocm,
skipIfNoQengine,
)
from .data_utils import (
get_asset_path,
get_whitenoise,
get_sinusoid,
get_spectrogram,
)
from .func_utils import torch_script
from .parameterized_utils import load_params, nested_params
from .wav_utils import (
get_wav_data,
normalize_wav,
load_wav,
save_wav,
)
from .parameterized_utils import (
load_params,
nested_params
)
from .func_utils import torch_script
__all__ = [
'get_asset_path',
'get_whitenoise',
'get_sinusoid',
'get_spectrogram',
'set_audio_backend',
'TempDirMixin',
'HttpServerMixin',
'TestBaseMixin',
'PytorchTestCase',
'TorchaudioTestCase',
'skipIfNoCuda',
'skipIfNoExec',
'skipIfNoModule',
'skipIfNoKaldi',
'skipIfNoSox',
'skipIfNoSoxBackend',
'skipIfRocm',
'skipIfNoQengine',
'get_wav_data',
'normalize_wav',
'load_wav',
'save_wav',
'load_params',
'nested_params',
'torch_script',
"get_asset_path",
"get_whitenoise",
"get_sinusoid",
"get_spectrogram",
"set_audio_backend",
"TempDirMixin",
"HttpServerMixin",
"TestBaseMixin",
"PytorchTestCase",
"TorchaudioTestCase",
"skipIfNoCuda",
"skipIfNoExec",
"skipIfNoModule",
"skipIfNoKaldi",
"skipIfNoSox",
"skipIfNoSoxBackend",
"skipIfRocm",
"skipIfNoQengine",
"get_wav_data",
"normalize_wav",
"load_wav",
"save_wav",
"load_params",
"nested_params",
"torch_script",
]
@@ -6,15 +6,15 @@ import torchaudio
def set_audio_backend(backend):
"""Allow additional backend value, 'default'"""
backends = torchaudio.list_audio_backends()
if backend == 'soundfile':
be = 'soundfile'
elif backend == 'default':
if 'sox_io' in backends:
be = 'sox_io'
elif 'soundfile' in backends:
be = 'soundfile'
if backend == "soundfile":
be = "soundfile"
elif backend == "default":
if "sox_io" in backends:
be = "sox_io"
elif "soundfile" in backends:
be = "soundfile"
else:
raise unittest.SkipTest('No default backend available')
raise unittest.SkipTest("No default backend available")
else:
be = backend
......
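(For context, a minimal sketch of how set_audio_backend above is used in a test; the tail of the helper is elided here and presumably forwards `be` to torchaudio.set_audio_backend, so treat that as an assumption:)

    import unittest
    from torchaudio_unittest.common_utils import set_audio_backend

    class ExampleTest(unittest.TestCase):
        def setUp(self):
            # "default" resolves to sox_io when available, then soundfile;
            # if neither is present, unittest.SkipTest skips the test.
            set_audio_backend("default")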
import shutil
import os.path
import shutil
import subprocess
import tempfile
import time
@@ -7,24 +7,21 @@ import unittest
import torch
from torch.testing._internal.common_utils import TestCase as PytorchTestCase
from torchaudio._internal.module_utils import (
is_module_available,
is_sox_available,
is_kaldi_available
)
from torchaudio._internal.module_utils import is_module_available, is_sox_available, is_kaldi_available
from .backend_utils import set_audio_backend
class TempDirMixin:
"""Mixin to provide easy access to temp dir"""
temp_dir_ = None
@classmethod
def get_base_temp_dir(cls):
# If TORCHAUDIO_TEST_TEMP_DIR is set, use it instead of a temporary directory.
# This is handy for debugging.
key = 'TORCHAUDIO_TEST_TEMP_DIR'
key = "TORCHAUDIO_TEST_TEMP_DIR"
if key in os.environ:
return os.environ[key]
if cls.temp_dir_ is None:
@@ -51,6 +48,7 @@ class HttpServerMixin(TempDirMixin):
This class creates a temporary directory and serves it over HTTP.
The server stays up for the duration of the whole test suite defined under the subclass.
"""
_proc = None
_port = 8000
@@ -58,9 +56,8 @@ class HttpServerMixin(TempDirMixin):
def setUpClass(cls):
super().setUpClass()
cls._proc = subprocess.Popen(
['python', '-m', 'http.server', f'{cls._port}'],
cwd=cls.get_base_temp_dir(),
stderr=subprocess.DEVNULL) # Disable server-side error log because it is confusing
["python", "-m", "http.server", f"{cls._port}"], cwd=cls.get_base_temp_dir(), stderr=subprocess.DEVNULL
) # Disable server-side error log because it is confusing
time.sleep(2.0)
@classmethod
@@ -74,6 +71,7 @@ class HttpServerMixin(TempDirMixin):
class TestBaseMixin:
"""Mixin to provide consistent way to define device/dtype/backend aware TestCase"""
dtype = None
device = None
backend = None
@@ -84,11 +82,11 @@ class TestBaseMixin:
@property
def complex_dtype(self):
if self.dtype in ['float32', 'float', torch.float, torch.float32]:
if self.dtype in ["float32", "float", torch.float, torch.float32]:
return torch.cfloat
if self.dtype in ['float64', 'double', torch.double, torch.float64]:
if self.dtype in ["float64", "double", torch.double, torch.float64]:
return torch.cdouble
raise ValueError(f'No corresponding complex dtype for {self.dtype}')
raise ValueError(f"No corresponding complex dtype for {self.dtype}")
class TorchaudioTestCase(TestBaseMixin, PytorchTestCase):
@@ -96,7 +94,7 @@ class TorchaudioTestCase(TestBaseMixin, PytorchTestCase):
def skipIfNoExec(cmd):
return unittest.skipIf(shutil.which(cmd) is None, f'`{cmd}` is not available')
return unittest.skipIf(shutil.which(cmd) is None, f"`{cmd}` is not available")
def skipIfNoModule(module, display_name=None):
@@ -107,17 +105,19 @@ def skipIfNoModule(module, display_name=None):
def skipIfNoCuda(test_item):
if torch.cuda.is_available():
return test_item
force_cuda_test = os.environ.get('TORCHAUDIO_TEST_FORCE_CUDA', '0')
if force_cuda_test not in ['0', '1']:
force_cuda_test = os.environ.get("TORCHAUDIO_TEST_FORCE_CUDA", "0")
if force_cuda_test not in ["0", "1"]:
raise ValueError('"TORCHAUDIO_TEST_FORCE_CUDA" must be either "0" or "1".')
if force_cuda_test == '1':
if force_cuda_test == "1":
raise RuntimeError('"TORCHAUDIO_TEST_FORCE_CUDA" is set but CUDA is not available.')
return unittest.skip('CUDA is not available.')(test_item)
skipIfNoSox = unittest.skipIf(not is_sox_available(), reason='Sox not available')
skipIfNoKaldi = unittest.skipIf(not is_kaldi_available(), reason='Kaldi not available')
skipIfRocm = unittest.skipIf(os.getenv('TORCHAUDIO_TEST_WITH_ROCM', '0') == '1',
reason="test doesn't currently work on the ROCm stack")
return unittest.skip("CUDA is not available.")(test_item)
skipIfNoSox = unittest.skipIf(not is_sox_available(), reason="Sox not available")
skipIfNoKaldi = unittest.skipIf(not is_kaldi_available(), reason="Kaldi not available")
skipIfRocm = unittest.skipIf(
os.getenv("TORCHAUDIO_TEST_WITH_ROCM", "0") == "1", reason="test doesn't currently work on the ROCm stack"
)
skipIfNoQengine = unittest.skipIf(
'fbgemm' not in torch.backends.quantized.supported_engines,
reason="`fbgemm` is not available."
"fbgemm" not in torch.backends.quantized.supported_engines, reason="`fbgemm` is not available."
)
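(For reference, these skip decorators stack on a test case in the usual way; a minimal sketch using names from the __all__ list above:)

    from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoExec, skipIfNoSox

    class SoxRoundTripTest(PytorchTestCase):
        @skipIfNoSox            # skipped unless the sox extension is available
        @skipIfNoExec("sox")    # skipped unless the `sox` binary is on PATH
        def test_round_trip(self):
            ...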
@@ -4,13 +4,12 @@ from typing import Union, Optional
import torch
_TEST_DIR_PATH = os.path.realpath(
os.path.join(os.path.dirname(__file__), '..'))
_TEST_DIR_PATH = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
def get_asset_path(*paths):
"""Return full path of a test asset"""
return os.path.join(_TEST_DIR_PATH, 'assets', *paths)
return os.path.join(_TEST_DIR_PATH, "assets", *paths)
def convert_tensor_encoding(
@@ -63,13 +62,12 @@ def get_whitenoise(
if isinstance(dtype, str):
dtype = getattr(torch, dtype)
if dtype not in [torch.float64, torch.float32, torch.int32, torch.int16, torch.uint8]:
raise NotImplementedError(f'dtype {dtype} is not supported.')
raise NotImplementedError(f"dtype {dtype} is not supported.")
# According to the doc, forking the RNG on all CUDA devices is slow when there are many of them,
# so we fork only on CPU, generate the values, then move the data to the given device
with torch.random.fork_rng([]):
torch.random.manual_seed(seed)
tensor = torch.randn([n_channels, int(sample_rate * duration)],
dtype=torch.float32, device='cpu')
tensor = torch.randn([n_channels, int(sample_rate * duration)], dtype=torch.float32, device="cpu")
tensor /= 2.0
tensor *= scale_factor
tensor.clamp_(-1.0, 1.0)
@@ -116,15 +114,15 @@ def get_sinusoid(
def get_spectrogram(
waveform,
*,
n_fft: int = 2048,
hop_length: Optional[int] = None,
win_length: Optional[int] = None,
window: Optional[torch.Tensor] = None,
center: bool = True,
pad_mode: str = 'reflect',
power: Optional[float] = None,
waveform,
*,
n_fft: int = 2048,
hop_length: Optional[int] = None,
win_length: Optional[int] = None,
window: Optional[torch.Tensor] = None,
center: bool = True,
pad_mode: str = "reflect",
power: Optional[float] = None,
):
"""Generate a spectrogram of the given Tensor
@@ -149,7 +147,8 @@ def get_spectrogram(
center=center,
window=window,
pad_mode=pad_mode,
return_complex=True)
return_complex=True,
)
if power is not None:
spec = spec.abs() ** power
return spec
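(Usage note: with power=None the helper returns the complex STFT, otherwise spec.abs() ** power; a minimal sketch:)

    import torch

    waveform = torch.randn(1, 8000)                               # any (channel, time) tensor
    complex_spec = get_spectrogram(waveform, n_fft=400)           # complex-valued, power=None
    power_spec = get_spectrogram(waveform, n_fft=400, power=2.0)  # real-valued power spectrogram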
@@ -6,11 +6,11 @@ import torch
def convert_args(**kwargs):
args = []
for key, value in kwargs.items():
if key == 'sample_rate':
key = 'sample_frequency'
key = '--' + key.replace('_', '-')
if key == "sample_rate":
key = "sample_frequency"
key = "--" + key.replace("_", "-")
value = str(value).lower() if value in [True, False] else str(value)
args.append('%s=%s' % (key, value))
args.append("%s=%s" % (key, value))
return args
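(A worked example of the flag mangling above; note the sample_rate -> --sample-frequency rename and the lowercased boolean:)

    convert_args(sample_rate=16000, use_energy=False)
    # -> ["--sample-frequency=16000", "--use-energy=false"]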
@@ -25,14 +25,14 @@ def run_kaldi(command, input_type, input_value):
"""
import kaldi_io
key = 'foo'
key = "foo"
process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
if input_type == 'ark':
if input_type == "ark":
kaldi_io.write_mat(process.stdin, input_value.cpu().numpy(), key=key)
elif input_type == 'scp':
process.stdin.write(f'{key} {input_value}'.encode('utf8'))
elif input_type == "scp":
process.stdin.write(f"{key} {input_value}".encode("utf8"))
else:
raise NotImplementedError('Unexpected type')
raise NotImplementedError("Unexpected type")
process.stdin.close()
result = dict(kaldi_io.read_mat_ark(process.stdout))['foo']
result = dict(kaldi_io.read_mat_ark(process.stdout))["foo"]
return torch.from_numpy(result.copy())  # copy suppresses some torch warning
@@ -7,7 +7,7 @@ from .data_utils import get_asset_path
def load_params(*paths):
with open(get_asset_path(*paths), 'r') as file:
with open(get_asset_path(*paths), "r") as file:
return [param(json.loads(line)) for line in file]
@@ -20,7 +20,7 @@ def _name_func(func, _, params):
strs.append(str(arg))
# sanitize the test name
name = "_".join(strs).replace(".", "_")
return f'{func.__name__}_{name}'
return f"{func.__name__}_{name}"
def nested_params(*params_set):
@@ -39,13 +39,10 @@ def nested_params(*params_set):
# Parameters to be nested are given as list of `parameterized.param`
if not all(isinstance(p, param) for p in flatten):
raise TypeError(
"When using ``parameterized.param``, "
"all the parameters have to be of the ``param`` type.")
raise TypeError("When using ``parameterized.param``, " "all the parameters have to be of the ``param`` type.")
if any(p.args for p in flatten):
raise ValueError(
"When using ``parameterized.param``, "
"all the parameters have to be provided as keyword argument."
"When using ``parameterized.param``, " "all the parameters have to be provided as keyword argument."
)
args = [param()]
for params in params_set:
......
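(For context, nested_params expands the cross product of plain value lists; this behavior is inferred from the surrounding test suite, so treat the sketch as illustrative:)

    @nested_params(
        ["float32", "float64"],
        [True, False],
    )
    def test_example(self, dtype, flag):
        ...  # expands to 4 cases, one per (dtype, flag) combination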
@@ -5,11 +5,7 @@ import torch
def psd_numpy(
X: np.array,
mask: Optional[np.array],
multi_mask: bool = False,
normalize: bool = True,
eps: float = 1e-15
X: np.array, mask: Optional[np.array], multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15
) -> np.array:
X_conj = np.conj(X)
psd_X = np.einsum("...cft,...eft->...ftce", X, X_conj)
......
import unittest
import random
import torch
import unittest
import numpy as np
import torch
from torchaudio.functional import rnnt_loss
@@ -84,9 +85,7 @@ class _NumpyTransducer(torch.autograd.Function):
return beta, cost
@staticmethod
def compute_gradients_one_sequence(
log_probs, alpha, beta, targets, blank=-1
):
def compute_gradients_one_sequence(log_probs, alpha, beta, targets, blank=-1):
max_T, max_U, D = log_probs.shape
gradients = np.full(log_probs.shape, float("-inf"))
cost = -beta[0, 0]
@@ -175,9 +174,7 @@ class NumpyTransducerLoss(torch.nn.Module):
def compute_with_numpy_transducer(data):
costs = NumpyTransducerLoss(
blank=data["blank"],
)(
costs = NumpyTransducerLoss(blank=data["blank"],)(
logits=data["logits"],
logit_lengths=data["logit_lengths"],
target_lengths=data["target_lengths"],
@@ -254,6 +251,7 @@ def get_B1_T10_U3_D4_data(
def grad_hook(grad):
logits.saved_grad = grad.clone()
logits.register_hook(grad_hook)
data = {}
@@ -307,6 +305,7 @@ def get_B1_T2_U3_D5_data(dtype=torch.float32, device=CPU_DEVICE):
def grad_hook(grad):
logits.saved_grad = grad.clone()
logits.register_hook(grad_hook)
targets = torch.tensor([[1, 2]], dtype=torch.int32, device=device)
@@ -447,6 +446,7 @@ def get_B2_T4_U3_D3_data(dtype=torch.float32, device=CPU_DEVICE):
def grad_hook(grad):
logits.saved_grad = grad.clone()
logits.register_hook(grad_hook)
targets = torch.tensor([[1, 2], [1, 1]], dtype=torch.int32, device=device)
@@ -573,9 +573,7 @@ def get_random_data(
max_src_length = torch.max(logit_lengths)
max_tgt_length = torch.max(target_lengths)
targets = torch.randint(
low=0, high=D - 1, size=(B, max_tgt_length), dtype=torch.int32, device=device
)
targets = torch.randint(low=0, high=D - 1, size=(B, max_tgt_length), dtype=torch.int32, device=device)
logits = torch.rand(
size=(B, max_src_length, max_tgt_length + 1, D),
dtype=dtype,
@@ -584,6 +582,7 @@
def grad_hook(grad):
logits.saved_grad = grad.clone()
logits.register_hook(grad_hook)
return {
......
import sys
import subprocess
import sys
import warnings
def get_encoding(dtype):
encodings = {
'float32': 'floating-point',
'int32': 'signed-integer',
'int16': 'signed-integer',
'uint8': 'unsigned-integer',
"float32": "floating-point",
"int32": "signed-integer",
"int16": "signed-integer",
"uint8": "unsigned-integer",
}
return encodings[dtype]
def get_bit_depth(dtype):
bit_depths = {
'float32': 32,
'int32': 32,
'int16': 16,
'uint8': 8,
"float32": 32,
"int32": 32,
"int16": 16,
"uint8": 8,
}
return bit_depths[dtype]
def gen_audio_file(
path, sample_rate, num_channels,
*, encoding=None, bit_depth=None, compression=None, attenuation=None, duration=1, comment_file=None,
path,
sample_rate,
num_channels,
*,
encoding=None,
bit_depth=None,
compression=None,
attenuation=None,
duration=1,
comment_file=None,
):
"""Generate synthetic audio file with `sox` command."""
if path.endswith('.wav'):
warnings.warn('Use get_wav_data and save_wav to generate wav file for accurate result.')
if path.endswith(".wav"):
warnings.warn("Use get_wav_data and save_wav to generate wav file for accurate result.")
command = [
'sox',
'-V3', # verbose
'--no-dither', # disable automatic dithering
'-R',
"sox",
"-V3", # verbose
"--no-dither", # disable automatic dithering
"-R",
# -R is supposed to make runs repeatable, though the implementation looks suspicious
# and does not set the seed to a fixed value.
# https://fossies.org/dox/sox-14.4.2/sox_8c_source.html
# search "sox_globals.repeatable"
]
if bit_depth is not None:
command += ['--bits', str(bit_depth)]
command += ["--bits", str(bit_depth)]
command += [
'--rate', str(sample_rate),
'--null', # no input
'--channels', str(num_channels),
"--rate",
str(sample_rate),
"--null", # no input
"--channels",
str(num_channels),
]
if compression is not None:
command += ['--compression', str(compression)]
command += ["--compression", str(compression)]
if bit_depth is not None:
command += ['--bits', str(bit_depth)]
command += ["--bits", str(bit_depth)]
if encoding is not None:
command += ['--encoding', str(encoding)]
command += ["--encoding", str(encoding)]
if comment_file is not None:
command += ['--comment-file', str(comment_file)]
command += ["--comment-file", str(comment_file)]
command += [
str(path),
'synth', str(duration), # synthesizes for the given duration [sec]
'sawtooth', '1',
"synth",
str(duration), # synthesizes for the given duration [sec]
"sawtooth",
"1",
# sawtooth covers both ends of the value range, which is a good property for tests.
# similar to linspace(-1., 1.)
# this introduces a bigger boundary effect than sine when converted to mp3
]
if attenuation is not None:
command += ['vol', f'-{attenuation}dB']
print(' '.join(command), file=sys.stderr)
command += ["vol", f"-{attenuation}dB"]
print(" ".join(command), file=sys.stderr)
subprocess.run(command, check=True)
def convert_audio_file(
src_path, dst_path,
*, encoding=None, bit_depth=None, compression=None):
def convert_audio_file(src_path, dst_path, *, encoding=None, bit_depth=None, compression=None):
"""Convert audio file with `sox` command."""
command = ['sox', '-V3', '--no-dither', '-R', str(src_path)]
command = ["sox", "-V3", "--no-dither", "-R", str(src_path)]
if encoding is not None:
command += ['--encoding', str(encoding)]
command += ["--encoding", str(encoding)]
if bit_depth is not None:
command += ['--bits', str(bit_depth)]
command += ["--bits", str(bit_depth)]
if compression is not None:
command += ['--compression', str(compression)]
command += ["--compression", str(compression)]
command += [dst_path]
print(' '.join(command), file=sys.stderr)
print(" ".join(command), file=sys.stderr)
subprocess.run(command, check=True)
@@ -96,11 +106,11 @@ def _flattern(effects):
def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None):
"""Run sox effects"""
effect = _flattern(effect)
command = ['sox', '-V', '--no-dither', input_file]
command = ["sox", "-V", "--no-dither", input_file]
if output_bitdepth:
command += ['--bits', str(output_bitdepth)]
command += ["--bits", str(output_bitdepth)]
command += [output_file] + effect
if output_sample_rate:
command += ['rate', str(output_sample_rate)]
print(' '.join(command))
command += ["rate", str(output_sample_rate)]
print(" ".join(command))
subprocess.run(command, check=True)
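(A hedged example of driving gen_audio_file above; it requires the `sox` binary on PATH, and the output path and compression level are made up for illustration:)

    # 2-second, 2-channel, 8 kHz Ogg/Vorbis file; sox infers the format from the extension
    gen_audio_file("/tmp/test.ogg", 8000, 2, compression=1, duration=2)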
from typing import Optional
import torch
import scipy.io.wavfile
import torch
def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
@@ -9,26 +9,26 @@ def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
pass
elif tensor.dtype == torch.int32:
tensor = tensor.to(torch.float32)
tensor[tensor > 0] /= 2147483647.
tensor[tensor < 0] /= 2147483648.
tensor[tensor > 0] /= 2147483647.0
tensor[tensor < 0] /= 2147483648.0
elif tensor.dtype == torch.int16:
tensor = tensor.to(torch.float32)
tensor[tensor > 0] /= 32767.
tensor[tensor < 0] /= 32768.
tensor[tensor > 0] /= 32767.0
tensor[tensor < 0] /= 32768.0
elif tensor.dtype == torch.uint8:
tensor = tensor.to(torch.float32) - 128
tensor[tensor > 0] /= 127.
tensor[tensor < 0] /= 128.
tensor[tensor > 0] /= 127.0
tensor[tensor < 0] /= 128.0
return tensor
def get_wav_data(
dtype: str,
num_channels: int,
*,
num_frames: Optional[int] = None,
normalize: bool = True,
channels_first: bool = True,
dtype: str,
num_channels: int,
*,
num_frames: Optional[int] = None,
normalize: bool = True,
channels_first: bool = True,
):
"""Generate linear signal of the given dtype and num_channels
@@ -45,25 +45,25 @@ def get_wav_data(
dtype_ = getattr(torch, dtype)
if num_frames is None:
if dtype == 'uint8':
if dtype == "uint8":
num_frames = 256
else:
num_frames = 1 << 16
if dtype == 'uint8':
if dtype == "uint8":
base = torch.linspace(0, 255, num_frames, dtype=dtype_)
elif dtype == 'int8':
elif dtype == "int8":
base = torch.linspace(-128, 127, num_frames, dtype=dtype_)
elif dtype == 'float32':
base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
elif dtype == 'float64':
base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
elif dtype == 'int32':
elif dtype == "float32":
base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_)
elif dtype == "float64":
base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_)
elif dtype == "int32":
base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_)
elif dtype == 'int16':
elif dtype == "int16":
base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_)
else:
raise NotImplementedError(f'Unsupported dtype {dtype}')
raise NotImplementedError(f"Unsupported dtype {dtype}")
data = base.repeat([num_channels, 1])
if not channels_first:
data = data.transpose(1, 0)
......
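(Putting the two helpers together; the normalize=True path lives in the elided part of get_wav_data, so that detail is inferred:)

    data = get_wav_data("int16", num_channels=2, num_frames=4, normalize=False)
    # shape (2, 4): two copies of linspace(-32768, 32767, 4) as int16
    normalize_wav(data)
    # float32 in [-1.0, 1.0]: positive samples divided by 32767.0, negatives by 32768.0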
import torch
import torchaudio.compliance.kaldi as kaldi
from torchaudio_unittest import common_utils
@@ -20,28 +19,27 @@ def extract_window(window, wave, f, frame_length, frame_shift, snip_edges):
end_sample = start_sample + frame_length
if snip_edges:
assert(start_sample >= sample_offset and end_sample <= num_samples)
assert start_sample >= sample_offset and end_sample <= num_samples
else:
assert(sample_offset == 0 or start_sample >= sample_offset)
assert sample_offset == 0 or start_sample >= sample_offset
wave_start = start_sample - sample_offset
wave_end = wave_start + frame_length
if wave_start >= 0 and wave_end <= wave.size(0):
window[f, :] = wave[wave_start:(wave_start + frame_length)]
window[f, :] = wave[wave_start : (wave_start + frame_length)]
else:
wave_dim = wave.size(0)
for s in range(frame_length):
s_in_wave = s + wave_start
while s_in_wave < 0 or s_in_wave >= wave_dim:
if s_in_wave < 0:
s_in_wave = - s_in_wave - 1
s_in_wave = -s_in_wave - 1
else:
s_in_wave = 2 * wave_dim - 1 - s_in_wave
window[f, s] = wave[s_in_wave]
class Test_Kaldi(common_utils.TempDirMixin, common_utils.TorchaudioTestCase):
def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
waveform = torch.arange(num_samples).float()
output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)
......
@@ -2,7 +2,6 @@ import os
from pathlib import Path
from torchaudio.datasets import cmuarctic
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
......
@@ -2,7 +2,6 @@ import os
from pathlib import Path
from torchaudio.datasets import CMUDict
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
@@ -21,7 +20,7 @@ def get_mock_dataset(root_dir, return_punc=False):
puncs = [
"!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
"\"CLOSE-QUOTE K L OW1 Z K W OW1 T",
'"CLOSE-QUOTE K L OW1 Z K W OW1 T',
"#HASH-MARK HH AE1 M AA2 R K",
"%PERCENT P ER0 S EH1 N T",
"&AMPERSAND AE1 M P ER0 S AE2 N D",
@@ -43,7 +42,7 @@ def get_mock_dataset(root_dir, return_punc=False):
punc_outputs = [
"!",
"\"",
'"',
"#",
"%",
"&",
......
@@ -4,6 +4,7 @@ from pathlib import Path
from typing import Tuple, Dict
from torch import Tensor
from torchaudio.datasets import COMMONVOICE
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
@@ -12,21 +13,40 @@ from torchaudio_unittest.common_utils import (
normalize_wav,
)
from torchaudio.datasets import COMMONVOICE
_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
_SAMPLE_RATE = 48000
_HEADERS = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
_EN_TRAIN_CSV_CONTENTS = [
["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
"common_voice_en_18885784.wav",
"He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.", "2", "0", "", "",
""],
["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
"common_voice_en_556542.wav", "Once more into the breach", "2", "0", "thirties", "male", "us"],
["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
"common_voice_en_18607573.wav",
"Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
[
"9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
"common_voice_en_18885784.wav",
"He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
"2",
"0",
"",
"",
"",
],
[
"c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
"common_voice_en_556542.wav",
"Once more into the breach",
"2",
"0",
"thirties",
"male",
"us",
],
[
"f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
"common_voice_en_18607573.wav",
"Caddy, show Miss Clare and Miss Summerson their rooms.",
"2",
"0",
"twenties",
"male",
"canada",
],
]
_FR_TRAIN_CSV_CONTENTS = [
@@ -35,14 +55,25 @@ _FR_TRAIN_CSV_CONTENTS = [
"18343441c601cae0597a4b0d3144",
"89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
"16cebac98ee5349e3e8262cb9329",
"Or sur ce point nous n’avons aucune réponse de votre part.", "2", "0", "twenties", "male", "france"],
"Or sur ce point nous n’avons aucune réponse de votre part.",
"2",
"0",
"twenties",
"male",
"france",
],
[
"a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
"343441c601cae0597a4b0d3144",
"87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
"cbd395acbdfcfa9d76a6e199bbd",
"Monsieur de La Verpillière, laissez parler le ministre", "2", "0", "twenties", "male", "france"],
"Monsieur de La Verpillière, laissez parler le ministre",
"2",
"0",
"twenties",
"male",
"france",
],
]
@@ -57,8 +88,8 @@ def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> Tuple[Tensor, i
tsv_filename = os.path.join(root_dir, "train.tsv")
audio_base_path = os.path.join(root_dir, "clips")
os.makedirs(audio_base_path, exist_ok=True)
with open(tsv_filename, "w", newline='') as tsv:
writer = csv.writer(tsv, delimiter='\t')
with open(tsv_filename, "w", newline="") as tsv:
writer = csv.writer(tsv, delimiter="\t")
writer.writerow(_HEADERS)
for i, content in enumerate(train_csv_contents):
content[2] = str(content[2].encode("utf-8"))
@@ -68,7 +99,7 @@
else:
audio_path = os.path.join(audio_base_path, content[1])
data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype="float32")
save_wav(audio_path, data, _SAMPLE_RATE)
# Append data entry
mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
@@ -117,7 +148,7 @@ class BaseTestCommonVoice(TempDirMixin):
class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
backend = 'default'
backend = "default"
root_dir = None
@classmethod
@@ -135,7 +166,7 @@ class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
class TestCommonVoiceFR(BaseTestCommonVoice, TorchaudioTestCase):
backend = 'default'
backend = "default"
root_dir = None
@classmethod
......
from pathlib import Path
import pytest
from torchaudio.datasets import dr_vctk
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
@@ -57,11 +55,7 @@ def get_mock_dataset(root_dir):
data = {}
for condition in _CONDITIONS:
data[condition] = get_whitenoise(
sample_rate=sample_rate,
duration=0.01,
n_channels=1,
dtype='float32',
seed=seed
sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed
)
audio_dir = dataset_dir / f"{condition}_{subset}set_wav_16k"
audio_file_path = audio_dir / filename
@@ -85,7 +79,7 @@ def get_mock_dataset(root_dir):
class TestDRVCTK(TempDirMixin, TorchaudioTestCase):
backend = 'default'
backend = "default"
root_dir = None
samples = {}
......
@@ -2,7 +2,6 @@ import os
from pathlib import Path
from torchaudio.datasets import gtzan
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
@@ -24,12 +23,12 @@ def get_mock_dataset(root_dir):
seed = 0
for genre in gtzan.gtzan_genres:
base_dir = os.path.join(root_dir, 'genres', genre)
base_dir = os.path.join(root_dir, "genres", genre)
os.makedirs(base_dir, exist_ok=True)
for i in range(100):
filename = f'{genre}.{i:05d}'
path = os.path.join(base_dir, f'{filename}.wav')
data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='int16', seed=seed)
filename = f"{genre}.{i:05d}"
path = os.path.join(base_dir, f"{filename}.wav")
data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="int16", seed=seed)
save_wav(path, data, sample_rate)
sample = (normalize_wav(data), sample_rate, genre)
mocked_samples.append(sample)
@@ -44,7 +43,7 @@ def get_mock_dataset(root_dir):
class TestGTZAN(TempDirMixin, TorchaudioTestCase):
backend = 'default'
backend = "default"
root_dir = None
samples = []
@@ -100,28 +99,28 @@ class TestGTZAN(TempDirMixin, TorchaudioTestCase):
assert n_ite == len(self.testing)
def test_training_str(self):
train_dataset = gtzan.GTZAN(self.root_dir, subset='training')
train_dataset = gtzan.GTZAN(self.root_dir, subset="training")
self._test_training(train_dataset)
def test_validation_str(self):
val_dataset = gtzan.GTZAN(self.root_dir, subset='validation')
val_dataset = gtzan.GTZAN(self.root_dir, subset="validation")
self._test_validation(val_dataset)
def test_testing_str(self):
test_dataset = gtzan.GTZAN(self.root_dir, subset='testing')
test_dataset = gtzan.GTZAN(self.root_dir, subset="testing")
self._test_testing(test_dataset)
def test_training_path(self):
root_dir = Path(self.root_dir)
train_dataset = gtzan.GTZAN(root_dir, subset='training')
train_dataset = gtzan.GTZAN(root_dir, subset="training")
self._test_training(train_dataset)
def test_validation_path(self):
root_dir = Path(self.root_dir)
val_dataset = gtzan.GTZAN(root_dir, subset='validation')
val_dataset = gtzan.GTZAN(root_dir, subset="validation")
self._test_validation(val_dataset)
def test_testing_path(self):
root_dir = Path(self.root_dir)
test_dataset = gtzan.GTZAN(root_dir, subset='testing')
test_dataset = gtzan.GTZAN(root_dir, subset="testing")
self._test_testing(test_dataset)
import os
from pathlib import Path
from torchaudio.datasets import librispeech
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
@@ -9,21 +10,8 @@ from torchaudio_unittest.common_utils import (
normalize_wav,
)
from torchaudio.datasets import librispeech
# Used to generate a unique transcript for each dummy audio file
_NUMBERS = [
'ZERO',
'ONE',
'TWO',
'THREE',
'FOUR',
'FIVE',
'SIX',
'SEVEN',
'EIGHT',
'NINE'
]
_NUMBERS = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"]
def get_mock_dataset(root_dir):
@@ -31,9 +19,7 @@ def get_mock_dataset(root_dir):
root_dir: directory of the mocked dataset
"""
mocked_data = []
dataset_dir = os.path.join(
root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL
)
dataset_dir = os.path.join(root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL)
os.makedirs(dataset_dir, exist_ok=True)
sample_rate = 16000 # 16kHz
seed = 0
@@ -48,45 +34,28 @@ def get_mock_dataset(root_dir):
trans_content = []
for utterance_id in range(10):
filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav'
filename = f"{speaker_id}-{chapter_id}-{utterance_id:04d}.wav"
path = os.path.join(chapter_path, filename)
transcript = ' '.join(
[_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]
)
trans_content.append(
f'{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}'
)
data = get_whitenoise(
sample_rate=sample_rate,
duration=0.01,
n_channels=1,
dtype='float32',
seed=seed
)
transcript = " ".join([_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]])
trans_content.append(f"{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}")
data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed)
save_wav(path, data, sample_rate)
sample = (
normalize_wav(data),
sample_rate,
transcript,
speaker_id,
chapter_id,
utterance_id
)
sample = (normalize_wav(data), sample_rate, transcript, speaker_id, chapter_id, utterance_id)
mocked_data.append(sample)
seed += 1
trans_filename = f'{speaker_id}-{chapter_id}.trans.txt'
trans_filename = f"{speaker_id}-{chapter_id}.trans.txt"
trans_path = os.path.join(chapter_path, trans_filename)
with open(trans_path, 'w') as f:
f.write('\n'.join(trans_content))
with open(trans_path, "w") as f:
f.write("\n".join(trans_content))
return mocked_data
class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
backend = 'default'
backend = "default"
root_dir = None
samples = []
@@ -99,13 +68,11 @@ class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
@classmethod
def tearDownClass(cls):
# In case of test failure
librispeech.LIBRISPEECH._ext_audio = '.flac'
librispeech.LIBRISPEECH._ext_audio = ".flac"
def _test_librispeech(self, dataset):
num_samples = 0
for i, (
data, sample_rate, transcript, speaker_id, chapter_id, utterance_id
) in enumerate(dataset):
for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset):
self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.samples[i][1]
assert transcript == self.samples[i][2]
@@ -115,14 +82,14 @@ class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
num_samples += 1
assert num_samples == len(self.samples)
librispeech.LIBRISPEECH._ext_audio = '.flac'
librispeech.LIBRISPEECH._ext_audio = ".flac"
def test_librispeech_str(self):
librispeech.LIBRISPEECH._ext_audio = '.wav'
librispeech.LIBRISPEECH._ext_audio = ".wav"
dataset = librispeech.LIBRISPEECH(self.root_dir)
self._test_librispeech(dataset)
def test_librispeech_path(self):
librispeech.LIBRISPEECH._ext_audio = '.wav'
librispeech.LIBRISPEECH._ext_audio = ".wav"
dataset = librispeech.LIBRISPEECH(Path(self.root_dir))
self._test_librispeech(dataset)
import os
from pathlib import Path
from torchaudio.datasets.libritts import LIBRITTS
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
@@ -9,14 +10,12 @@ from torchaudio_unittest.common_utils import (
normalize_wav,
)
from torchaudio.datasets.libritts import LIBRITTS
_UTTERANCE_IDS = [
[19, 198, '000000', '000000'],
[26, 495, '000004', '000000'],
[19, 198, "000000", "000000"],
[26, 495, "000004", "000000"],
]
_ORIGINAL_TEXT = 'this is the original text.'
_NORMALIZED_TEXT = 'this is the normalized text.'
_ORIGINAL_TEXT = "this is the original text."
_NORMALIZED_TEXT = "this is the normalized text."
def get_mock_dataset(root_dir):
@@ -24,31 +23,31 @@ def get_mock_dataset(root_dir):
root_dir: directory of the mocked dataset
"""
mocked_data = []
base_dir = os.path.join(root_dir, 'LibriTTS', 'train-clean-100')
base_dir = os.path.join(root_dir, "LibriTTS", "train-clean-100")
for i, utterance_id in enumerate(_UTTERANCE_IDS):
filename = f'{"_".join(str(u) for u in utterance_id)}.wav'
file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
os.makedirs(file_dir, exist_ok=True)
path = os.path.join(file_dir, filename)
data = get_whitenoise(sample_rate=24000, duration=2, n_channels=1, dtype='int16', seed=i)
data = get_whitenoise(sample_rate=24000, duration=2, n_channels=1, dtype="int16", seed=i)
save_wav(path, data, 24000)
mocked_data.append(normalize_wav(data))
original_text_filename = f'{"_".join(str(u) for u in utterance_id)}.original.txt'
path_original = os.path.join(file_dir, original_text_filename)
with open(path_original, 'w') as file_:
with open(path_original, "w") as file_:
file_.write(_ORIGINAL_TEXT)
normalized_text_filename = f'{"_".join(str(u) for u in utterance_id)}.normalized.txt'
path_normalized = os.path.join(file_dir, normalized_text_filename)
with open(path_normalized, 'w') as file_:
with open(path_normalized, "w") as file_:
file_.write(_NORMALIZED_TEXT)
return mocked_data, _UTTERANCE_IDS, _ORIGINAL_TEXT, _NORMALIZED_TEXT
class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
backend = 'default'
backend = "default"
root_dir = None
data = []
@@ -61,13 +60,15 @@ class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
def _test_libritts(self, dataset):
n_ites = 0
for i, (waveform,
sample_rate,
original_text,
normalized_text,
speaker_id,
chapter_id,
utterance_id) in enumerate(dataset):
for i, (
waveform,
sample_rate,
original_text,
normalized_text,
speaker_id,
chapter_id,
utterance_id,
) in enumerate(dataset):
expected_ids = self._utterance_ids[i]
expected_data = self.data[i]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
......
@@ -2,6 +2,7 @@ import csv
import os
from pathlib import Path
from torchaudio.datasets import ljspeech
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
@@ -10,20 +11,18 @@ from torchaudio_unittest.common_utils import (
save_wav,
)
from torchaudio.datasets import ljspeech
_TRANSCRIPTS = [
"Test transcript 1",
"Test transcript 2",
"Test transcript 3",
"In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,"
"In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
]
_NORMALIZED_TRANSCRIPT = [
"Test transcript one",
"Test transcript two",
"Test transcript three",
"In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,"
"In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
]
@@ -38,20 +37,14 @@ def get_mock_dataset(root_dir):
metadata_path = os.path.join(base_dir, "metadata.csv")
sample_rate = 22050
with open(metadata_path, mode="w", newline='') as metadata_file:
metadata_writer = csv.writer(
metadata_file, delimiter="|", quoting=csv.QUOTE_NONE
)
for i, (transcript, normalized_transcript) in enumerate(
zip(_TRANSCRIPTS, _NORMALIZED_TRANSCRIPT)
):
fileid = f'LJ001-{i:04d}'
with open(metadata_path, mode="w", newline="") as metadata_file:
metadata_writer = csv.writer(metadata_file, delimiter="|", quoting=csv.QUOTE_NONE)
for i, (transcript, normalized_transcript) in enumerate(zip(_TRANSCRIPTS, _NORMALIZED_TRANSCRIPT)):
fileid = f"LJ001-{i:04d}"
metadata_writer.writerow([fileid, transcript, normalized_transcript])
filename = fileid + ".wav"
path = os.path.join(archive_dir, filename)
data = get_whitenoise(
sample_rate=sample_rate, duration=1, n_channels=1, dtype="int16", seed=i
)
data = get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=1, dtype="int16", seed=i)
save_wav(path, data, sample_rate)
mocked_data.append(normalize_wav(data))
return mocked_data, _TRANSCRIPTS, _NORMALIZED_TRANSCRIPT
@@ -70,9 +63,7 @@ class TestLJSpeech(TempDirMixin, TorchaudioTestCase):
def _test_ljspeech(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, transcript, normalized_transcript) in enumerate(
dataset
):
for i, (waveform, sample_rate, transcript, normalized_transcript) in enumerate(dataset):
expected_transcript = self._transcripts[i]
expected_normalized_transcript = self._normalized_transcript[i]
expected_data = self.data[i]
......