Commit 5859923a authored by Joao Gomes, committed by Facebook GitHub Bot

Apply arc lint to pytorch audio (#2096)

Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/2096

run: `arc lint --apply-patches --paths-cmd 'hg files -I "./**/*.py"'`
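Here `--apply-patches` writes the linter's suggested fixes straight into the working tree, and `--paths-cmd` takes the file list from the output of the quoted command (every tracked Python file, per `hg files`). The resulting changes are purely mechanical formatting; a representative before/after pair, taken from the diff below:

```python
# Before: single-quoted strings, hanging-indent continuation
skipIfRocm = unittest.skipIf(os.getenv('TORCHAUDIO_TEST_WITH_ROCM', '0') == '1',
                             reason="test doesn't currently work on the ROCm stack")

# After: double quotes, call re-wrapped to the formatter's line length
skipIfRocm = unittest.skipIf(
    os.getenv("TORCHAUDIO_TEST_WITH_ROCM", "0") == "1", reason="test doesn't currently work on the ROCm stack"
)
```

The same pass also sorts imports and collapses or explodes argument lists, which accounts for the bulk of the diff.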

Reviewed By: mthrok

Differential Revision: D33297351

fbshipit-source-id: 7bf5956edf0717c5ca90219f72414ff4eeaf5aa8
parent 0e5913d5
-from .data_utils import (
-    get_asset_path,
-    get_whitenoise,
-    get_sinusoid,
-    get_spectrogram,
-)
 from .backend_utils import (
     set_audio_backend,
 )
@@ -21,43 +15,46 @@ from .case_utils import (
     skipIfRocm,
     skipIfNoQengine,
 )
+from .data_utils import (
+    get_asset_path,
+    get_whitenoise,
+    get_sinusoid,
+    get_spectrogram,
+)
+from .func_utils import torch_script
+from .parameterized_utils import load_params, nested_params
 from .wav_utils import (
     get_wav_data,
     normalize_wav,
     load_wav,
     save_wav,
 )
-from .parameterized_utils import (
-    load_params,
-    nested_params
-)
-from .func_utils import torch_script

 __all__ = [
-    'get_asset_path',
-    'get_whitenoise',
-    'get_sinusoid',
-    'get_spectrogram',
-    'set_audio_backend',
-    'TempDirMixin',
-    'HttpServerMixin',
-    'TestBaseMixin',
-    'PytorchTestCase',
-    'TorchaudioTestCase',
-    'skipIfNoCuda',
-    'skipIfNoExec',
-    'skipIfNoModule',
-    'skipIfNoKaldi',
-    'skipIfNoSox',
-    'skipIfNoSoxBackend',
-    'skipIfRocm',
-    'skipIfNoQengine',
-    'get_wav_data',
-    'normalize_wav',
-    'load_wav',
-    'save_wav',
-    'load_params',
-    'nested_params',
-    'torch_script',
+    "get_asset_path",
+    "get_whitenoise",
+    "get_sinusoid",
+    "get_spectrogram",
+    "set_audio_backend",
+    "TempDirMixin",
+    "HttpServerMixin",
+    "TestBaseMixin",
+    "PytorchTestCase",
+    "TorchaudioTestCase",
+    "skipIfNoCuda",
+    "skipIfNoExec",
+    "skipIfNoModule",
+    "skipIfNoKaldi",
+    "skipIfNoSox",
+    "skipIfNoSoxBackend",
+    "skipIfRocm",
+    "skipIfNoQengine",
+    "get_wav_data",
+    "normalize_wav",
+    "load_wav",
+    "save_wav",
+    "load_params",
+    "nested_params",
+    "torch_script",
 ]
@@ -6,15 +6,15 @@ import torchaudio
 def set_audio_backend(backend):
     """Allow additional backend value, 'default'"""
     backends = torchaudio.list_audio_backends()
-    if backend == 'soundfile':
-        be = 'soundfile'
-    elif backend == 'default':
-        if 'sox_io' in backends:
-            be = 'sox_io'
-        elif 'soundfile' in backends:
-            be = 'soundfile'
+    if backend == "soundfile":
+        be = "soundfile"
+    elif backend == "default":
+        if "sox_io" in backends:
+            be = "sox_io"
+        elif "soundfile" in backends:
+            be = "soundfile"
         else:
-            raise unittest.SkipTest('No default backend available')
+            raise unittest.SkipTest("No default backend available")
     else:
         be = backend
...
-import shutil
 import os.path
+import shutil
 import subprocess
 import tempfile
 import time
@@ -7,24 +7,21 @@ import unittest
 import torch
 from torch.testing._internal.common_utils import TestCase as PytorchTestCase
-from torchaudio._internal.module_utils import (
-    is_module_available,
-    is_sox_available,
-    is_kaldi_available
-)
+from torchaudio._internal.module_utils import is_module_available, is_sox_available, is_kaldi_available

 from .backend_utils import set_audio_backend

 class TempDirMixin:
     """Mixin to provide easy access to temp dir"""
+
     temp_dir_ = None

     @classmethod
     def get_base_temp_dir(cls):
         # If TORCHAUDIO_TEST_TEMP_DIR is set, use it instead of temporary directory.
         # this is handy for debugging.
-        key = 'TORCHAUDIO_TEST_TEMP_DIR'
+        key = "TORCHAUDIO_TEST_TEMP_DIR"
         if key in os.environ:
             return os.environ[key]
         if cls.temp_dir_ is None:
@@ -51,6 +48,7 @@ class HttpServerMixin(TempDirMixin):
     This class creates temporary directory and serve the directory as HTTP service.
     The server is up through the execution of all the test suite defined under the subclass.
     """
+
     _proc = None
     _port = 8000
@@ -58,9 +56,8 @@ class HttpServerMixin(TempDirMixin):
     def setUpClass(cls):
         super().setUpClass()
         cls._proc = subprocess.Popen(
-            ['python', '-m', 'http.server', f'{cls._port}'],
-            cwd=cls.get_base_temp_dir(),
-            stderr=subprocess.DEVNULL)  # Disable server-side error log because it is confusing
+            ["python", "-m", "http.server", f"{cls._port}"], cwd=cls.get_base_temp_dir(), stderr=subprocess.DEVNULL
+        )  # Disable server-side error log because it is confusing
         time.sleep(2.0)

     @classmethod
@@ -74,6 +71,7 @@ class HttpServerMixin(TempDirMixin):
 class TestBaseMixin:
     """Mixin to provide consistent way to define device/dtype/backend aware TestCase"""
+
     dtype = None
     device = None
     backend = None
@@ -84,11 +82,11 @@ class TestBaseMixin:
     @property
     def complex_dtype(self):
-        if self.dtype in ['float32', 'float', torch.float, torch.float32]:
+        if self.dtype in ["float32", "float", torch.float, torch.float32]:
             return torch.cfloat
-        if self.dtype in ['float64', 'double', torch.double, torch.float64]:
+        if self.dtype in ["float64", "double", torch.double, torch.float64]:
             return torch.cdouble
-        raise ValueError(f'No corresponding complex dtype for {self.dtype}')
+        raise ValueError(f"No corresponding complex dtype for {self.dtype}")

 class TorchaudioTestCase(TestBaseMixin, PytorchTestCase):
@@ -96,7 +94,7 @@ class TorchaudioTestCase(TestBaseMixin, PytorchTestCase):
 def skipIfNoExec(cmd):
-    return unittest.skipIf(shutil.which(cmd) is None, f'`{cmd}` is not available')
+    return unittest.skipIf(shutil.which(cmd) is None, f"`{cmd}` is not available")

 def skipIfNoModule(module, display_name=None):
@@ -107,17 +105,19 @@ def skipIfNoModule(module, display_name=None):
 def skipIfNoCuda(test_item):
     if torch.cuda.is_available():
         return test_item
-    force_cuda_test = os.environ.get('TORCHAUDIO_TEST_FORCE_CUDA', '0')
-    if force_cuda_test not in ['0', '1']:
+    force_cuda_test = os.environ.get("TORCHAUDIO_TEST_FORCE_CUDA", "0")
+    if force_cuda_test not in ["0", "1"]:
         raise ValueError('"TORCHAUDIO_TEST_FORCE_CUDA" must be either "0" or "1".')
-    if force_cuda_test == '1':
+    if force_cuda_test == "1":
         raise RuntimeError('"TORCHAUDIO_TEST_FORCE_CUDA" is set but CUDA is not available.')
-    return unittest.skip('CUDA is not available.')(test_item)
+    return unittest.skip("CUDA is not available.")(test_item)

-skipIfNoSox = unittest.skipIf(not is_sox_available(), reason='Sox not available')
-skipIfNoKaldi = unittest.skipIf(not is_kaldi_available(), reason='Kaldi not available')
-skipIfRocm = unittest.skipIf(os.getenv('TORCHAUDIO_TEST_WITH_ROCM', '0') == '1',
-                             reason="test doesn't currently work on the ROCm stack")
+skipIfNoSox = unittest.skipIf(not is_sox_available(), reason="Sox not available")
+skipIfNoKaldi = unittest.skipIf(not is_kaldi_available(), reason="Kaldi not available")
+skipIfRocm = unittest.skipIf(
+    os.getenv("TORCHAUDIO_TEST_WITH_ROCM", "0") == "1", reason="test doesn't currently work on the ROCm stack"
+)
 skipIfNoQengine = unittest.skipIf(
-    'fbgemm' not in torch.backends.quantized.supported_engines,
-    reason="`fbgemm` is not available."
+    "fbgemm" not in torch.backends.quantized.supported_engines, reason="`fbgemm` is not available."
 )
@@ -4,13 +4,12 @@ from typing import Union, Optional
 import torch

-_TEST_DIR_PATH = os.path.realpath(
-    os.path.join(os.path.dirname(__file__), '..'))
+_TEST_DIR_PATH = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))

 def get_asset_path(*paths):
     """Return full path of a test asset"""
-    return os.path.join(_TEST_DIR_PATH, 'assets', *paths)
+    return os.path.join(_TEST_DIR_PATH, "assets", *paths)

 def convert_tensor_encoding(
@@ -63,13 +62,12 @@ def get_whitenoise(
     if isinstance(dtype, str):
         dtype = getattr(torch, dtype)
     if dtype not in [torch.float64, torch.float32, torch.int32, torch.int16, torch.uint8]:
-        raise NotImplementedError(f'dtype {dtype} is not supported.')
+        raise NotImplementedError(f"dtype {dtype} is not supported.")
     # According to the doc, forking rng on all CUDA devices is slow when there are many CUDA devices,
     # so we only fork on CPU, generate values and move the data to the given device
     with torch.random.fork_rng([]):
         torch.random.manual_seed(seed)
-        tensor = torch.randn([n_channels, int(sample_rate * duration)],
-                             dtype=torch.float32, device='cpu')
+        tensor = torch.randn([n_channels, int(sample_rate * duration)], dtype=torch.float32, device="cpu")
     tensor /= 2.0
     tensor *= scale_factor
     tensor.clamp_(-1.0, 1.0)
@@ -116,15 +114,15 @@ def get_sinusoid(
 def get_spectrogram(
     waveform,
     *,
     n_fft: int = 2048,
     hop_length: Optional[int] = None,
     win_length: Optional[int] = None,
     window: Optional[torch.Tensor] = None,
     center: bool = True,
-    pad_mode: str = 'reflect',
+    pad_mode: str = "reflect",
     power: Optional[float] = None,
 ):
     """Generate a spectrogram of the given Tensor
@@ -149,7 +147,8 @@ def get_spectrogram(
         center=center,
         window=window,
         pad_mode=pad_mode,
-        return_complex=True)
+        return_complex=True,
+    )
     if power is not None:
         spec = spec.abs() ** power
     return spec
 import io
 import torch
...
@@ -6,11 +6,11 @@ import torch
 def convert_args(**kwargs):
     args = []
     for key, value in kwargs.items():
-        if key == 'sample_rate':
-            key = 'sample_frequency'
-        key = '--' + key.replace('_', '-')
+        if key == "sample_rate":
+            key = "sample_frequency"
+        key = "--" + key.replace("_", "-")
         value = str(value).lower() if value in [True, False] else str(value)
-        args.append('%s=%s' % (key, value))
+        args.append("%s=%s" % (key, value))
     return args
@@ -25,14 +25,14 @@ def run_kaldi(command, input_type, input_value):
     """
     import kaldi_io

-    key = 'foo'
+    key = "foo"
     process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-    if input_type == 'ark':
+    if input_type == "ark":
         kaldi_io.write_mat(process.stdin, input_value.cpu().numpy(), key=key)
-    elif input_type == 'scp':
-        process.stdin.write(f'{key} {input_value}'.encode('utf8'))
+    elif input_type == "scp":
+        process.stdin.write(f"{key} {input_value}".encode("utf8"))
     else:
-        raise NotImplementedError('Unexpected type')
+        raise NotImplementedError("Unexpected type")
     process.stdin.close()
-    result = dict(kaldi_io.read_mat_ark(process.stdout))['foo']
+    result = dict(kaldi_io.read_mat_ark(process.stdout))["foo"]
     return torch.from_numpy(result.copy())  # copy suppresses some torch warning
@@ -7,7 +7,7 @@ from .data_utils import get_asset_path
 def load_params(*paths):
-    with open(get_asset_path(*paths), 'r') as file:
+    with open(get_asset_path(*paths), "r") as file:
         return [param(json.loads(line)) for line in file]
@@ -20,7 +20,7 @@ def _name_func(func, _, params):
         strs.append(str(arg))
     # sanitize the test name
     name = "_".join(strs).replace(".", "_")
-    return f'{func.__name__}_{name}'
+    return f"{func.__name__}_{name}"

 def nested_params(*params_set):
@@ -39,13 +39,10 @@ def nested_params(*params_set):
     # Parameters to be nested are given as list of `parameterized.param`
     if not all(isinstance(p, param) for p in flatten):
-        raise TypeError(
-            "When using ``parameterized.param``, "
-            "all the parameters have to be of the ``param`` type.")
+        raise TypeError("When using ``parameterized.param``, " "all the parameters have to be of the ``param`` type.")
     if any(p.args for p in flatten):
         raise ValueError(
-            "When using ``parameterized.param``, "
-            "all the parameters have to be provided as keyword argument."
+            "When using ``parameterized.param``, " "all the parameters have to be provided as keyword argument."
         )
     args = [param()]
     for params in params_set:
...
@@ -5,11 +5,7 @@ import torch
 def psd_numpy(
-    X: np.array,
-    mask: Optional[np.array],
-    multi_mask: bool = False,
-    normalize: bool = True,
-    eps: float = 1e-15
+    X: np.array, mask: Optional[np.array], multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15
 ) -> np.array:
     X_conj = np.conj(X)
     psd_X = np.einsum("...cft,...eft->...ftce", X, X_conj)
...
-import unittest
 import random
-import torch
+import unittest

 import numpy as np
+import torch

 from torchaudio.functional import rnnt_loss
@@ -84,9 +85,7 @@ class _NumpyTransducer(torch.autograd.Function):
         return beta, cost

     @staticmethod
-    def compute_gradients_one_sequence(
-        log_probs, alpha, beta, targets, blank=-1
-    ):
+    def compute_gradients_one_sequence(log_probs, alpha, beta, targets, blank=-1):
         max_T, max_U, D = log_probs.shape
         gradients = np.full(log_probs.shape, float("-inf"))
         cost = -beta[0, 0]
@@ -175,9 +174,7 @@ class NumpyTransducerLoss(torch.nn.Module):
 def compute_with_numpy_transducer(data):
-    costs = NumpyTransducerLoss(
-        blank=data["blank"],
-    )(
+    costs = NumpyTransducerLoss(blank=data["blank"],)(
         logits=data["logits"],
         logit_lengths=data["logit_lengths"],
         target_lengths=data["target_lengths"],
@@ -254,6 +251,7 @@ def get_B1_T10_U3_D4_data(
     def grad_hook(grad):
         logits.saved_grad = grad.clone()
+
     logits.register_hook(grad_hook)

     data = {}
@@ -307,6 +305,7 @@ def get_B1_T2_U3_D5_data(dtype=torch.float32, device=CPU_DEVICE):
     def grad_hook(grad):
         logits.saved_grad = grad.clone()
+
     logits.register_hook(grad_hook)

     targets = torch.tensor([[1, 2]], dtype=torch.int32, device=device)
@@ -447,6 +446,7 @@ def get_B2_T4_U3_D3_data(dtype=torch.float32, device=CPU_DEVICE):
     def grad_hook(grad):
         logits.saved_grad = grad.clone()
+
     logits.register_hook(grad_hook)

     targets = torch.tensor([[1, 2], [1, 1]], dtype=torch.int32, device=device)
@@ -573,9 +573,7 @@ def get_random_data(
     max_src_length = torch.max(logit_lengths)
     max_tgt_length = torch.max(target_lengths)
-    targets = torch.randint(
-        low=0, high=D - 1, size=(B, max_tgt_length), dtype=torch.int32, device=device
-    )
+    targets = torch.randint(low=0, high=D - 1, size=(B, max_tgt_length), dtype=torch.int32, device=device)
     logits = torch.rand(
         size=(B, max_src_length, max_tgt_length + 1, D),
         dtype=dtype,
@@ -584,6 +582,7 @@ def get_random_data(
     def grad_hook(grad):
         logits.saved_grad = grad.clone()
+
     logits.register_hook(grad_hook)

     return {
...
-import sys
 import subprocess
+import sys
 import warnings

 def get_encoding(dtype):
     encodings = {
-        'float32': 'floating-point',
-        'int32': 'signed-integer',
-        'int16': 'signed-integer',
-        'uint8': 'unsigned-integer',
+        "float32": "floating-point",
+        "int32": "signed-integer",
+        "int16": "signed-integer",
+        "uint8": "unsigned-integer",
     }
     return encodings[dtype]

 def get_bit_depth(dtype):
     bit_depths = {
-        'float32': 32,
-        'int32': 32,
-        'int16': 16,
-        'uint8': 8,
+        "float32": 32,
+        "int32": 32,
+        "int16": 16,
+        "uint8": 8,
     }
     return bit_depths[dtype]

 def gen_audio_file(
-    path, sample_rate, num_channels,
-    *, encoding=None, bit_depth=None, compression=None, attenuation=None, duration=1, comment_file=None,
+    path,
+    sample_rate,
+    num_channels,
+    *,
+    encoding=None,
+    bit_depth=None,
+    compression=None,
+    attenuation=None,
+    duration=1,
+    comment_file=None,
 ):
     """Generate synthetic audio file with `sox` command."""
-    if path.endswith('.wav'):
-        warnings.warn('Use get_wav_data and save_wav to generate wav file for accurate result.')
+    if path.endswith(".wav"):
+        warnings.warn("Use get_wav_data and save_wav to generate wav file for accurate result.")
     command = [
-        'sox',
-        '-V3',  # verbose
-        '--no-dither',  # disable automatic dithering
-        '-R',
+        "sox",
+        "-V3",  # verbose
+        "--no-dither",  # disable automatic dithering
+        "-R",
         # -R is supposed to be repeatable, though the implementation looks suspicious
         # and not setting the seed to a fixed value.
         # https://fossies.org/dox/sox-14.4.2/sox_8c_source.html
         # search "sox_globals.repeatable"
     ]
     if bit_depth is not None:
-        command += ['--bits', str(bit_depth)]
+        command += ["--bits", str(bit_depth)]
     command += [
-        '--rate', str(sample_rate),
-        '--null',  # no input
-        '--channels', str(num_channels),
+        "--rate",
+        str(sample_rate),
+        "--null",  # no input
+        "--channels",
+        str(num_channels),
     ]
     if compression is not None:
-        command += ['--compression', str(compression)]
+        command += ["--compression", str(compression)]
     if bit_depth is not None:
-        command += ['--bits', str(bit_depth)]
+        command += ["--bits", str(bit_depth)]
     if encoding is not None:
-        command += ['--encoding', str(encoding)]
+        command += ["--encoding", str(encoding)]
     if comment_file is not None:
-        command += ['--comment-file', str(comment_file)]
+        command += ["--comment-file", str(comment_file)]
     command += [
         str(path),
-        'synth', str(duration),  # synthesizes for the given duration [sec]
-        'sawtooth', '1',
+        "synth",
+        str(duration),  # synthesizes for the given duration [sec]
+        "sawtooth",
+        "1",
         # saw tooth covers the both ends of value range, which is a good property for test.
         # similar to linspace(-1., 1.)
         # this introduces bigger boundary effect than sine when converted to mp3
     ]
     if attenuation is not None:
-        command += ['vol', f'-{attenuation}dB']
-    print(' '.join(command), file=sys.stderr)
+        command += ["vol", f"-{attenuation}dB"]
+    print(" ".join(command), file=sys.stderr)
     subprocess.run(command, check=True)

-def convert_audio_file(
-        src_path, dst_path,
-        *, encoding=None, bit_depth=None, compression=None):
+def convert_audio_file(src_path, dst_path, *, encoding=None, bit_depth=None, compression=None):
     """Convert audio file with `sox` command."""
-    command = ['sox', '-V3', '--no-dither', '-R', str(src_path)]
+    command = ["sox", "-V3", "--no-dither", "-R", str(src_path)]
     if encoding is not None:
-        command += ['--encoding', str(encoding)]
+        command += ["--encoding", str(encoding)]
     if bit_depth is not None:
-        command += ['--bits', str(bit_depth)]
+        command += ["--bits", str(bit_depth)]
     if compression is not None:
-        command += ['--compression', str(compression)]
+        command += ["--compression", str(compression)]
     command += [dst_path]
-    print(' '.join(command), file=sys.stderr)
+    print(" ".join(command), file=sys.stderr)
     subprocess.run(command, check=True)
@@ -96,11 +106,11 @@ def _flattern(effects):
 def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None):
     """Run sox effects"""
     effect = _flattern(effect)
-    command = ['sox', '-V', '--no-dither', input_file]
+    command = ["sox", "-V", "--no-dither", input_file]
     if output_bitdepth:
-        command += ['--bits', str(output_bitdepth)]
+        command += ["--bits", str(output_bitdepth)]
     command += [output_file] + effect
     if output_sample_rate:
-        command += ['rate', str(output_sample_rate)]
-    print(' '.join(command))
+        command += ["rate", str(output_sample_rate)]
+    print(" ".join(command))
     subprocess.run(command, check=True)
 from typing import Optional

-import torch
 import scipy.io.wavfile
+import torch

 def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
@@ -9,26 +9,26 @@ def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
         pass
     elif tensor.dtype == torch.int32:
         tensor = tensor.to(torch.float32)
-        tensor[tensor > 0] /= 2147483647.
-        tensor[tensor < 0] /= 2147483648.
+        tensor[tensor > 0] /= 2147483647.0
+        tensor[tensor < 0] /= 2147483648.0
     elif tensor.dtype == torch.int16:
         tensor = tensor.to(torch.float32)
-        tensor[tensor > 0] /= 32767.
-        tensor[tensor < 0] /= 32768.
+        tensor[tensor > 0] /= 32767.0
+        tensor[tensor < 0] /= 32768.0
     elif tensor.dtype == torch.uint8:
         tensor = tensor.to(torch.float32) - 128
-        tensor[tensor > 0] /= 127.
-        tensor[tensor < 0] /= 128.
+        tensor[tensor > 0] /= 127.0
+        tensor[tensor < 0] /= 128.0
     return tensor

 def get_wav_data(
     dtype: str,
     num_channels: int,
     *,
     num_frames: Optional[int] = None,
     normalize: bool = True,
     channels_first: bool = True,
 ):
     """Generate linear signal of the given dtype and num_channels
@@ -45,25 +45,25 @@ def get_wav_data(
     dtype_ = getattr(torch, dtype)
     if num_frames is None:
-        if dtype == 'uint8':
+        if dtype == "uint8":
             num_frames = 256
         else:
             num_frames = 1 << 16
-    if dtype == 'uint8':
+    if dtype == "uint8":
         base = torch.linspace(0, 255, num_frames, dtype=dtype_)
-    elif dtype == 'int8':
+    elif dtype == "int8":
         base = torch.linspace(-128, 127, num_frames, dtype=dtype_)
-    elif dtype == 'float32':
-        base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
-    elif dtype == 'float64':
-        base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
-    elif dtype == 'int32':
+    elif dtype == "float32":
+        base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_)
+    elif dtype == "float64":
+        base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_)
+    elif dtype == "int32":
         base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_)
-    elif dtype == 'int16':
+    elif dtype == "int16":
         base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_)
     else:
-        raise NotImplementedError(f'Unsupported dtype {dtype}')
+        raise NotImplementedError(f"Unsupported dtype {dtype}")
     data = base.repeat([num_channels, 1])
     if not channels_first:
         data = data.transpose(1, 0)
...
 import torch
 import torchaudio.compliance.kaldi as kaldi

 from torchaudio_unittest import common_utils
@@ -20,28 +19,27 @@ def extract_window(window, wave, f, frame_length, frame_shift, snip_edges):
     end_sample = start_sample + frame_length
     if snip_edges:
-        assert(start_sample >= sample_offset and end_sample <= num_samples)
+        assert start_sample >= sample_offset and end_sample <= num_samples
     else:
-        assert(sample_offset == 0 or start_sample >= sample_offset)
+        assert sample_offset == 0 or start_sample >= sample_offset
     wave_start = start_sample - sample_offset
     wave_end = wave_start + frame_length
     if wave_start >= 0 and wave_end <= wave.size(0):
-        window[f, :] = wave[wave_start:(wave_start + frame_length)]
+        window[f, :] = wave[wave_start : (wave_start + frame_length)]
     else:
         wave_dim = wave.size(0)
         for s in range(frame_length):
             s_in_wave = s + wave_start
             while s_in_wave < 0 or s_in_wave >= wave_dim:
                 if s_in_wave < 0:
-                    s_in_wave = - s_in_wave - 1
+                    s_in_wave = -s_in_wave - 1
                 else:
                     s_in_wave = 2 * wave_dim - 1 - s_in_wave
             window[f, s] = wave[s_in_wave]

 class Test_Kaldi(common_utils.TempDirMixin, common_utils.TorchaudioTestCase):
     def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
         waveform = torch.arange(num_samples).float()
         output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)
...
@@ -2,7 +2,6 @@ import os
 from pathlib import Path
-
 from torchaudio.datasets import cmuarctic
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
...
@@ -2,7 +2,6 @@ import os
 from pathlib import Path
-
 from torchaudio.datasets import CMUDict
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
@@ -21,7 +20,7 @@ def get_mock_dataset(root_dir, return_punc=False):
     puncs = [
         "!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
-        "\"CLOSE-QUOTE K L OW1 Z K W OW1 T",
+        '"CLOSE-QUOTE K L OW1 Z K W OW1 T',
         "#HASH-MARK HH AE1 M AA2 R K",
         "%PERCENT P ER0 S EH1 N T",
         "&AMPERSAND AE1 M P ER0 S AE2 N D",
@@ -43,7 +42,7 @@ def get_mock_dataset(root_dir, return_punc=False):
     punc_outputs = [
         "!",
-        "\"",
+        '"',
         "#",
         "%",
         "&",
...
@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Tuple, Dict

 from torch import Tensor
+from torchaudio.datasets import COMMONVOICE
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
@@ -12,21 +13,40 @@ from torchaudio_unittest.common_utils import (
     normalize_wav,
 )
-from torchaudio.datasets import COMMONVOICE

 _ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
 _SAMPLE_RATE = 48000
 _HEADERS = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
 _EN_TRAIN_CSV_CONTENTS = [
-    ["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
-     "common_voice_en_18885784.wav",
-     "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.", "2", "0", "", "",
-     ""],
-    ["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
-     "common_voice_en_556542.wav", "Once more into the breach", "2", "0", "thirties", "male", "us"],
-    ["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
-     "common_voice_en_18607573.wav",
-     "Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
+    [
+        "9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
+        "common_voice_en_18885784.wav",
+        "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
+        "2",
+        "0",
+        "",
+        "",
+        "",
+    ],
+    [
+        "c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
+        "common_voice_en_556542.wav",
+        "Once more into the breach",
+        "2",
+        "0",
+        "thirties",
+        "male",
+        "us",
+    ],
+    [
+        "f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
+        "common_voice_en_18607573.wav",
+        "Caddy, show Miss Clare and Miss Summerson their rooms.",
+        "2",
+        "0",
+        "twenties",
+        "male",
+        "canada",
+    ],
 ]

 _FR_TRAIN_CSV_CONTENTS = [
@@ -35,14 +55,25 @@ _FR_TRAIN_CSV_CONTENTS = [
         "18343441c601cae0597a4b0d3144",
         "89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
         "16cebac98ee5349e3e8262cb9329",
-        "Or sur ce point nous n’avons aucune réponse de votre part.", "2", "0", "twenties", "male", "france"],
+        "Or sur ce point nous n’avons aucune réponse de votre part.",
+        "2",
+        "0",
+        "twenties",
+        "male",
+        "france",
+    ],
     [
         "a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
         "343441c601cae0597a4b0d3144",
         "87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
         "cbd395acbdfcfa9d76a6e199bbd",
-        "Monsieur de La Verpillière, laissez parler le ministre", "2", "0", "twenties", "male", "france"],
+        "Monsieur de La Verpillière, laissez parler le ministre",
+        "2",
+        "0",
+        "twenties",
+        "male",
+        "france",
+    ],
 ]
@@ -57,8 +88,8 @@ def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> Tuple[Tensor, i
     tsv_filename = os.path.join(root_dir, "train.tsv")
     audio_base_path = os.path.join(root_dir, "clips")
     os.makedirs(audio_base_path, exist_ok=True)
-    with open(tsv_filename, "w", newline='') as tsv:
-        writer = csv.writer(tsv, delimiter='\t')
+    with open(tsv_filename, "w", newline="") as tsv:
+        writer = csv.writer(tsv, delimiter="\t")
         writer.writerow(_HEADERS)
         for i, content in enumerate(train_csv_contents):
             content[2] = str(content[2].encode("utf-8"))
@@ -68,7 +99,7 @@ def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> Tuple[Tensor, i
             else:
                 audio_path = os.path.join(audio_base_path, content[1])
-            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
+            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype="float32")
             save_wav(audio_path, data, _SAMPLE_RATE)
             # Append data entry
             mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
@@ -117,7 +148,7 @@ class BaseTestCommonVoice(TempDirMixin):
 class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
-    backend = 'default'
+    backend = "default"
     root_dir = None

     @classmethod
@@ -135,7 +166,7 @@ class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
 class TestCommonVoiceFR(BaseTestCommonVoice, TorchaudioTestCase):
-    backend = 'default'
+    backend = "default"
     root_dir = None

     @classmethod
...
 from pathlib import Path

 import pytest

 from torchaudio.datasets import dr_vctk
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
@@ -57,11 +55,7 @@ def get_mock_dataset(root_dir):
     data = {}
     for condition in _CONDITIONS:
         data[condition] = get_whitenoise(
-            sample_rate=sample_rate,
-            duration=0.01,
-            n_channels=1,
-            dtype='float32',
-            seed=seed
+            sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed
         )
         audio_dir = dataset_dir / f"{condition}_{subset}set_wav_16k"
         audio_file_path = audio_dir / filename
@@ -85,7 +79,7 @@ def get_mock_dataset(root_dir):
 class TestDRVCTK(TempDirMixin, TorchaudioTestCase):
-    backend = 'default'
+    backend = "default"
     root_dir = None
     samples = {}
...
@@ -2,7 +2,6 @@ import os
 from pathlib import Path
-
 from torchaudio.datasets import gtzan
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
@@ -24,12 +23,12 @@ def get_mock_dataset(root_dir):
     seed = 0
     for genre in gtzan.gtzan_genres:
-        base_dir = os.path.join(root_dir, 'genres', genre)
+        base_dir = os.path.join(root_dir, "genres", genre)
         os.makedirs(base_dir, exist_ok=True)
         for i in range(100):
-            filename = f'{genre}.{i:05d}'
-            path = os.path.join(base_dir, f'{filename}.wav')
-            data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='int16', seed=seed)
+            filename = f"{genre}.{i:05d}"
+            path = os.path.join(base_dir, f"{filename}.wav")
+            data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="int16", seed=seed)
             save_wav(path, data, sample_rate)
             sample = (normalize_wav(data), sample_rate, genre)
             mocked_samples.append(sample)
@@ -44,7 +43,7 @@ def get_mock_dataset(root_dir):
 class TestGTZAN(TempDirMixin, TorchaudioTestCase):
-    backend = 'default'
+    backend = "default"
     root_dir = None
     samples = []
@@ -100,28 +99,28 @@ class TestGTZAN(TempDirMixin, TorchaudioTestCase):
         assert n_ite == len(self.testing)

     def test_training_str(self):
-        train_dataset = gtzan.GTZAN(self.root_dir, subset='training')
+        train_dataset = gtzan.GTZAN(self.root_dir, subset="training")
         self._test_training(train_dataset)

     def test_validation_str(self):
-        val_dataset = gtzan.GTZAN(self.root_dir, subset='validation')
+        val_dataset = gtzan.GTZAN(self.root_dir, subset="validation")
         self._test_validation(val_dataset)

     def test_testing_str(self):
-        test_dataset = gtzan.GTZAN(self.root_dir, subset='testing')
+        test_dataset = gtzan.GTZAN(self.root_dir, subset="testing")
         self._test_testing(test_dataset)

     def test_training_path(self):
         root_dir = Path(self.root_dir)
-        train_dataset = gtzan.GTZAN(root_dir, subset='training')
+        train_dataset = gtzan.GTZAN(root_dir, subset="training")
         self._test_training(train_dataset)

     def test_validation_path(self):
         root_dir = Path(self.root_dir)
-        val_dataset = gtzan.GTZAN(root_dir, subset='validation')
+        val_dataset = gtzan.GTZAN(root_dir, subset="validation")
         self._test_validation(val_dataset)

     def test_testing_path(self):
         root_dir = Path(self.root_dir)
-        test_dataset = gtzan.GTZAN(root_dir, subset='testing')
+        test_dataset = gtzan.GTZAN(root_dir, subset="testing")
         self._test_testing(test_dataset)
 import os
 from pathlib import Path

+from torchaudio.datasets import librispeech
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
@@ -9,21 +10,8 @@ from torchaudio_unittest.common_utils import (
     normalize_wav,
 )
-from torchaudio.datasets import librispeech

 # Used to generate a unique transcript for each dummy audio file
-_NUMBERS = [
-    'ZERO',
-    'ONE',
-    'TWO',
-    'THREE',
-    'FOUR',
-    'FIVE',
-    'SIX',
-    'SEVEN',
-    'EIGHT',
-    'NINE'
-]
+_NUMBERS = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"]

 def get_mock_dataset(root_dir):
@@ -31,9 +19,7 @@ def get_mock_dataset(root_dir):
         root_dir: directory to the mocked dataset
     """
     mocked_data = []
-    dataset_dir = os.path.join(
-        root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL
-    )
+    dataset_dir = os.path.join(root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL)
     os.makedirs(dataset_dir, exist_ok=True)
     sample_rate = 16000  # 16kHz
     seed = 0
@@ -48,45 +34,28 @@ def get_mock_dataset(root_dir):
             trans_content = []
             for utterance_id in range(10):
-                filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav'
+                filename = f"{speaker_id}-{chapter_id}-{utterance_id:04d}.wav"
                 path = os.path.join(chapter_path, filename)
-                transcript = ' '.join(
-                    [_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]
-                )
-                trans_content.append(
-                    f'{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}'
-                )
-                data = get_whitenoise(
-                    sample_rate=sample_rate,
-                    duration=0.01,
-                    n_channels=1,
-                    dtype='float32',
-                    seed=seed
-                )
+                transcript = " ".join([_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]])
+                trans_content.append(f"{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}")
+
+                data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed)
                 save_wav(path, data, sample_rate)
-                sample = (
-                    normalize_wav(data),
-                    sample_rate,
-                    transcript,
-                    speaker_id,
-                    chapter_id,
-                    utterance_id
-                )
+                sample = (normalize_wav(data), sample_rate, transcript, speaker_id, chapter_id, utterance_id)
                 mocked_data.append(sample)
                 seed += 1
-            trans_filename = f'{speaker_id}-{chapter_id}.trans.txt'
+            trans_filename = f"{speaker_id}-{chapter_id}.trans.txt"
             trans_path = os.path.join(chapter_path, trans_filename)
-            with open(trans_path, 'w') as f:
-                f.write('\n'.join(trans_content))
+            with open(trans_path, "w") as f:
+                f.write("\n".join(trans_content))
     return mocked_data

 class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
-    backend = 'default'
+    backend = "default"
     root_dir = None
     samples = []
@@ -99,13 +68,11 @@ class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
     @classmethod
     def tearDownClass(cls):
         # In case of test failure
-        librispeech.LIBRISPEECH._ext_audio = '.flac'
+        librispeech.LIBRISPEECH._ext_audio = ".flac"

     def _test_librispeech(self, dataset):
         num_samples = 0
-        for i, (
-            data, sample_rate, transcript, speaker_id, chapter_id, utterance_id
-        ) in enumerate(dataset):
+        for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset):
             self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
             assert sample_rate == self.samples[i][1]
             assert transcript == self.samples[i][2]
@@ -115,14 +82,14 @@ class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
             num_samples += 1
         assert num_samples == len(self.samples)
-        librispeech.LIBRISPEECH._ext_audio = '.flac'
+        librispeech.LIBRISPEECH._ext_audio = ".flac"

     def test_librispeech_str(self):
-        librispeech.LIBRISPEECH._ext_audio = '.wav'
+        librispeech.LIBRISPEECH._ext_audio = ".wav"
         dataset = librispeech.LIBRISPEECH(self.root_dir)
         self._test_librispeech(dataset)

     def test_librispeech_path(self):
-        librispeech.LIBRISPEECH._ext_audio = '.wav'
+        librispeech.LIBRISPEECH._ext_audio = ".wav"
         dataset = librispeech.LIBRISPEECH(Path(self.root_dir))
         self._test_librispeech(dataset)
 import os
 from pathlib import Path

+from torchaudio.datasets.libritts import LIBRITTS
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
@@ -9,14 +10,12 @@ from torchaudio_unittest.common_utils import (
     normalize_wav,
 )
-from torchaudio.datasets.libritts import LIBRITTS

 _UTTERANCE_IDS = [
-    [19, 198, '000000', '000000'],
-    [26, 495, '000004', '000000'],
+    [19, 198, "000000", "000000"],
+    [26, 495, "000004", "000000"],
 ]
-_ORIGINAL_TEXT = 'this is the original text.'
-_NORMALIZED_TEXT = 'this is the normalized text.'
+_ORIGINAL_TEXT = "this is the original text."
+_NORMALIZED_TEXT = "this is the normalized text."

 def get_mock_dataset(root_dir):
@@ -24,31 +23,31 @@ def get_mock_dataset(root_dir):
         root_dir: directory to the mocked dataset
     """
     mocked_data = []
-    base_dir = os.path.join(root_dir, 'LibriTTS', 'train-clean-100')
+    base_dir = os.path.join(root_dir, "LibriTTS", "train-clean-100")
     for i, utterance_id in enumerate(_UTTERANCE_IDS):
         filename = f'{"_".join(str(u) for u in utterance_id)}.wav'
         file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
         os.makedirs(file_dir, exist_ok=True)
         path = os.path.join(file_dir, filename)

-        data = get_whitenoise(sample_rate=24000, duration=2, n_channels=1, dtype='int16', seed=i)
+        data = get_whitenoise(sample_rate=24000, duration=2, n_channels=1, dtype="int16", seed=i)
         save_wav(path, data, 24000)
         mocked_data.append(normalize_wav(data))

         original_text_filename = f'{"_".join(str(u) for u in utterance_id)}.original.txt'
         path_original = os.path.join(file_dir, original_text_filename)
-        with open(path_original, 'w') as file_:
+        with open(path_original, "w") as file_:
             file_.write(_ORIGINAL_TEXT)

         normalized_text_filename = f'{"_".join(str(u) for u in utterance_id)}.normalized.txt'
         path_normalized = os.path.join(file_dir, normalized_text_filename)
-        with open(path_normalized, 'w') as file_:
+        with open(path_normalized, "w") as file_:
             file_.write(_NORMALIZED_TEXT)

     return mocked_data, _UTTERANCE_IDS, _ORIGINAL_TEXT, _NORMALIZED_TEXT

 class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
-    backend = 'default'
+    backend = "default"
     root_dir = None
     data = []
@@ -61,13 +60,15 @@ class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
     def _test_libritts(self, dataset):
         n_ites = 0
-        for i, (waveform,
-                sample_rate,
-                original_text,
-                normalized_text,
-                speaker_id,
-                chapter_id,
-                utterance_id) in enumerate(dataset):
+        for i, (
+            waveform,
+            sample_rate,
+            original_text,
+            normalized_text,
+            speaker_id,
+            chapter_id,
+            utterance_id,
+        ) in enumerate(dataset):
             expected_ids = self._utterance_ids[i]
             expected_data = self.data[i]
             self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
...
@@ -2,6 +2,7 @@ import csv
 import os
 from pathlib import Path

+from torchaudio.datasets import ljspeech
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
@@ -10,20 +11,18 @@ from torchaudio_unittest.common_utils import (
     save_wav,
 )
-from torchaudio.datasets import ljspeech

 _TRANSCRIPTS = [
     "Test transcript 1",
     "Test transcript 2",
     "Test transcript 3",
-    "In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,"
+    "In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
 ]

 _NORMALIZED_TRANSCRIPT = [
     "Test transcript one",
     "Test transcript two",
     "Test transcript three",
-    "In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,"
+    "In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
 ]
@@ -38,20 +37,14 @@ def get_mock_dataset(root_dir):
     metadata_path = os.path.join(base_dir, "metadata.csv")
     sample_rate = 22050

-    with open(metadata_path, mode="w", newline='') as metadata_file:
-        metadata_writer = csv.writer(
-            metadata_file, delimiter="|", quoting=csv.QUOTE_NONE
-        )
-        for i, (transcript, normalized_transcript) in enumerate(
-            zip(_TRANSCRIPTS, _NORMALIZED_TRANSCRIPT)
-        ):
-            fileid = f'LJ001-{i:04d}'
+    with open(metadata_path, mode="w", newline="") as metadata_file:
+        metadata_writer = csv.writer(metadata_file, delimiter="|", quoting=csv.QUOTE_NONE)
+        for i, (transcript, normalized_transcript) in enumerate(zip(_TRANSCRIPTS, _NORMALIZED_TRANSCRIPT)):
+            fileid = f"LJ001-{i:04d}"
             metadata_writer.writerow([fileid, transcript, normalized_transcript])
             filename = fileid + ".wav"
             path = os.path.join(archive_dir, filename)
-            data = get_whitenoise(
-                sample_rate=sample_rate, duration=1, n_channels=1, dtype="int16", seed=i
-            )
+            data = get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=1, dtype="int16", seed=i)
             save_wav(path, data, sample_rate)
             mocked_data.append(normalize_wav(data))
     return mocked_data, _TRANSCRIPTS, _NORMALIZED_TRANSCRIPT
@@ -70,9 +63,7 @@ class TestLJSpeech(TempDirMixin, TorchaudioTestCase):
     def _test_ljspeech(self, dataset):
         n_ite = 0
-        for i, (waveform, sample_rate, transcript, normalized_transcript) in enumerate(
-            dataset
-        ):
+        for i, (waveform, sample_rate, transcript, normalized_transcript) in enumerate(dataset):
             expected_transcript = self._transcripts[i]
             expected_normalized_transcript = self._normalized_transcript[i]
             expected_data = self.data[i]
...