Commit 9dcc7a15 authored by flyingdown's avatar flyingdown
Browse files

init v0.10.0

parent db2b0b79
Pipeline #254 failed with stages
in 0 seconds
from torchaudio import sox_effects
from parameterized import parameterized
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
skipIfNoSox,
get_wav_data,
get_sinusoid,
save_wav,
)
from .common import (
load_params,
)
@skipIfNoSox
class SmokeTest(TempDirMixin, TorchaudioTestCase):
    """Run smoke test on various effects

    The purpose of this test suite is to verify that sox_effect functionalities do not exhibit
    abnormal behaviors.

    This test suite should be able to run without any additional tools (such as sox command),
    however without such tools, the correctness of each function cannot be verified.
    """
    @parameterized.expand(
        load_params("sox_effect_test_args.jsonl"),
        # Test name encodes the case index and the name of the first effect.
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects_tensor(self, args):
        """`apply_effects_tensor` should not crash"""
        effects = args['effects']
        # Defaults used when the JSONL test case omits these keys.
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        original = get_sinusoid(
            frequency=800, sample_rate=input_sr,
            n_channels=num_channels, dtype='float32')
        # Only checks that the call completes; results are intentionally unused.
        _found, _sr = sox_effects.apply_effects_tensor(original, input_sr, effects)

    @parameterized.expand(
        load_params("sox_effect_test_args.jsonl"),
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects_file(self, args):
        """`apply_effects_file` should not crash when given a file path"""
        dtype = 'int32'
        channels_first = True
        effects = args['effects']
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        input_path = self.get_temp_path('input.wav')
        data = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(input_path, data, input_sr, channels_first=channels_first)
        # Only checks that the call completes; results are intentionally unused.
        _found, _sr = sox_effects.apply_effects_file(
            input_path, effects, normalize=False, channels_first=channels_first)

    @parameterized.expand(
        load_params("sox_effect_test_args.jsonl"),
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects_fileobj(self, args):
        """`apply_effects_file` should not crash when given a file-like object"""
        dtype = 'int32'
        channels_first = True
        effects = args['effects']
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        input_path = self.get_temp_path('input.wav')
        data = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(input_path, data, input_sr, channels_first=channels_first)
        with open(input_path, 'rb') as fileobj:
            _found, _sr = sox_effects.apply_effects_file(
                fileobj, effects, normalize=False, channels_first=channels_first)
import io
import itertools
from pathlib import Path
import tarfile
from parameterized import parameterized
from torchaudio import sox_effects
from torchaudio._internal import module_utils as _mod_utils
from torchaudio_unittest.common_utils import (
TempDirMixin,
HttpServerMixin,
PytorchTestCase,
skipIfNoSox,
skipIfNoModule,
skipIfNoExec,
get_asset_path,
get_sinusoid,
get_wav_data,
save_wav,
load_wav,
sox_utils,
)
from .common import (
load_params,
name_func,
)
if _mod_utils.is_module_available("requests"):
import requests
@skipIfNoSox
class TestSoxEffects(PytorchTestCase):
    def test_init(self):
        """Repeated initialization of sox effects must be harmless."""
        num_attempts = 3
        for _attempt in range(num_attempts):
            sox_effects.init_sox_effects()
@skipIfNoSox
class TestSoxEffectsTensor(TempDirMixin, PytorchTestCase):
    """Test suite for `apply_effects_tensor` function"""
    @parameterized.expand(list(itertools.product(
        ['float32', 'int32', 'int16', 'uint8'],
        [8000, 16000],
        [1, 2, 4, 8],
        [True, False]
    )), name_func=name_func)
    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
        """`apply_effects_tensor` without effects should return identical data as input"""
        original = get_wav_data(dtype, num_channels, channels_first=channels_first)
        expected = original.clone()
        found, output_sample_rate = sox_effects.apply_effects_tensor(
            expected, sample_rate, [], channels_first)

        assert output_sample_rate == sample_rate
        # SoxEffect should not alter the input Tensor object
        self.assertEqual(original, expected)
        # SoxEffect should not return the same Tensor object
        assert expected is not found
        # Returned Tensor should equal to the input Tensor
        self.assertEqual(expected, found)

    @parameterized.expand(
        load_params("sox_effect_test_args.jsonl"),
        # Test name encodes the case index and the name of the first effect.
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects(self, args):
        """`apply_effects_tensor` should return identical data as sox command"""
        effects = args['effects']
        # Defaults used when the JSONL test case omits these keys.
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        output_sr = args.get("output_sample_rate")
        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')
        original = get_sinusoid(
            frequency=800, sample_rate=input_sr,
            n_channels=num_channels, dtype='float32')
        save_wav(input_path, original, input_sr)
        # Reference output is produced with the `sox` command line tool.
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_sample_rate=output_sr)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_tensor(original, input_sr, effects)

        assert sr == expected_sr
        self.assertEqual(expected, found)
@skipIfNoSox
class TestSoxEffectsFile(TempDirMixin, PytorchTestCase):
    """Test suite for `apply_effects_file` function"""
    @parameterized.expand(list(itertools.product(
        ['float32', 'int32', 'int16', 'uint8'],
        [8000, 16000],
        [1, 2, 4, 8],
        [False, True],
    )), name_func=name_func)
    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
        """`apply_effects_file` without effects should return identical data as input"""
        path = self.get_temp_path('input.wav')
        expected = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(path, expected, sample_rate, channels_first=channels_first)

        found, output_sample_rate = sox_effects.apply_effects_file(
            path, [], normalize=False, channels_first=channels_first)

        assert output_sample_rate == sample_rate
        self.assertEqual(expected, found)

    @parameterized.expand(
        load_params("sox_effect_test_args.jsonl"),
        # Test name encodes the case index and the name of the first effect.
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects_str(self, args):
        """`apply_effects_file` should return identical data as sox command"""
        dtype = 'int32'
        channels_first = True
        effects = args['effects']
        # Defaults used when the JSONL test case omits these keys.
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        output_sr = args.get("output_sample_rate")
        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')
        data = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(input_path, data, input_sr, channels_first=channels_first)
        # Reference output is produced with the `sox` command line tool.
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_sample_rate=output_sr)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            input_path, effects, normalize=False, channels_first=channels_first)

        assert sr == expected_sr
        self.assertEqual(found, expected)

    def test_apply_effects_path(self):
        """`apply_effects_file` should return identical data as sox command when file path is given as a Path Object"""
        dtype = 'int32'
        channels_first = True
        effects = [["hilbert"]]
        num_channels = 2
        input_sr = 8000
        output_sr = 8000
        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')
        data = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(input_path, data, input_sr, channels_first=channels_first)
        # Reference output is produced with the `sox` command line tool.
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_sample_rate=output_sr)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            Path(input_path), effects, normalize=False, channels_first=channels_first)

        assert sr == expected_sr
        self.assertEqual(found, expected)
@skipIfNoSox
class TestFileFormats(TempDirMixin, PytorchTestCase):
    """`apply_effects_file` gives the same result as sox on various file formats"""
    @parameterized.expand(list(itertools.product(
        ['float32', 'int32', 'int16', 'uint8'],
        [8000, 16000],
        [1, 2],
    )), name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}')
    def test_wav(self, dtype, sample_rate, num_channels):
        """`apply_effects_file` works on various wav format"""
        channels_first = True
        effects = [['band', '300', '10']]
        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')
        data = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(input_path, data, sample_rate, channels_first=channels_first)
        # Reference output is produced with the `sox` command line tool.
        sox_utils.run_sox_effect(input_path, reference_path, effects)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            input_path, effects, normalize=False, channels_first=channels_first)

        assert sr == expected_sr
        self.assertEqual(found, expected)

    @parameterized.expand(list(itertools.product(
        [8000, 16000],
        [1, 2],
    )), name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}')
    def test_mp3(self, sample_rate, num_channels):
        """`apply_effects_file` works on various mp3 format"""
        channels_first = True
        effects = [['band', '300', '10']]
        input_path = self.get_temp_path('input.mp3')
        reference_path = self.get_temp_path('reference.wav')
        sox_utils.gen_audio_file(input_path, sample_rate, num_channels)
        sox_utils.run_sox_effect(input_path, reference_path, effects)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            input_path, effects, channels_first=channels_first)
        # Saved for manual inspection when the assertion below fails.
        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)

        assert sr == expected_sr
        # mp3 is lossy, so a looser tolerance than the default is used.
        self.assertEqual(found, expected, atol=1e-4, rtol=1e-8)

    @parameterized.expand(list(itertools.product(
        [8000, 16000],
        [1, 2],
    )), name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}')
    def test_flac(self, sample_rate, num_channels):
        """`apply_effects_file` works on various flac format"""
        channels_first = True
        effects = [['band', '300', '10']]
        input_path = self.get_temp_path('input.flac')
        reference_path = self.get_temp_path('reference.wav')
        sox_utils.gen_audio_file(input_path, sample_rate, num_channels)
        sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            input_path, effects, channels_first=channels_first)
        # Saved for manual inspection when the assertion below fails.
        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)

        assert sr == expected_sr
        self.assertEqual(found, expected)

    @parameterized.expand(list(itertools.product(
        [8000, 16000],
        [1, 2],
    )), name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}')
    def test_vorbis(self, sample_rate, num_channels):
        """`apply_effects_file` works on various vorbis format"""
        channels_first = True
        effects = [['band', '300', '10']]
        input_path = self.get_temp_path('input.vorbis')
        reference_path = self.get_temp_path('reference.wav')
        sox_utils.gen_audio_file(input_path, sample_rate, num_channels)
        sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            input_path, effects, channels_first=channels_first)
        # Saved for manual inspection when the assertion below fails.
        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)

        assert sr == expected_sr
        self.assertEqual(found, expected)
@skipIfNoSox
class TestApplyEffectFileWithoutExtension(PytorchTestCase):
    def test_mp3(self):
        """An explicit ``format`` lets ``apply_effects_file`` decode an mp3 without extension.

        libsox does not check header for mp3
        https://github.com/pytorch/audio/issues/1040

        The file was generated with the following command
        ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -ar 16000 -f mp3 test_noext
        """
        source = get_asset_path("mp3_without_ext")
        band_effect = [['band', '300', '10']]
        _, sample_rate = sox_effects.apply_effects_file(source, band_effect, format="mp3")
        assert sample_rate == 16000
@skipIfNoExec('sox')
@skipIfNoSox
class TestFileObject(TempDirMixin, PytorchTestCase):
    """`apply_effects_file` accepts file-like objects, not just paths."""
    @parameterized.expand([
        ('wav', None),
        ('mp3', 128),
        ('mp3', 320),
        ('flac', 0),
        ('flac', 5),
        ('flac', 8),
        ('vorbis', -1),
        ('vorbis', 10),
        ('amb', None),
    ])
    def test_fileobj(self, ext, compression):
        """Applying effects via file object works"""
        sample_rate = 16000
        channels_first = True
        effects = [['band', '300', '10']]
        # mp3 cannot be detected from content, so the format must be given explicitly.
        format_ = ext if ext in ['mp3'] else None
        input_path = self.get_temp_path(f'input.{ext}')
        reference_path = self.get_temp_path('reference.wav')

        sox_utils.gen_audio_file(
            input_path, sample_rate, num_channels=2, compression=compression)
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        with open(input_path, 'rb') as fileobj:
            found, sr = sox_effects.apply_effects_file(
                fileobj, effects, channels_first=channels_first, format=format_)
        # Saved for manual inspection when the assertion below fails.
        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)
        assert sr == expected_sr
        self.assertEqual(found, expected)

    @parameterized.expand([
        ('wav', None),
        ('mp3', 128),
        ('mp3', 320),
        ('flac', 0),
        ('flac', 5),
        ('flac', 8),
        ('vorbis', -1),
        ('vorbis', 10),
        ('amb', None),
    ])
    def test_bytesio(self, ext, compression):
        """Applying effects via BytesIO object works"""
        sample_rate = 16000
        channels_first = True
        effects = [['band', '300', '10']]
        # mp3 cannot be detected from content, so the format must be given explicitly.
        format_ = ext if ext in ['mp3'] else None
        input_path = self.get_temp_path(f'input.{ext}')
        reference_path = self.get_temp_path('reference.wav')

        sox_utils.gen_audio_file(
            input_path, sample_rate, num_channels=2, compression=compression)
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        with open(input_path, 'rb') as file_:
            fileobj = io.BytesIO(file_.read())
        found, sr = sox_effects.apply_effects_file(
            fileobj, effects, channels_first=channels_first, format=format_)
        # Saved for manual inspection when the assertion below fails.
        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)
        assert sr == expected_sr
        self.assertEqual(found, expected)

    @parameterized.expand([
        ('wav', None),
        ('mp3', 128),
        ('mp3', 320),
        ('flac', 0),
        ('flac', 5),
        ('flac', 8),
        ('vorbis', -1),
        ('vorbis', 10),
        ('amb', None),
    ])
    def test_tarfile(self, ext, compression):
        """Applying effects to compressed audio via file-like file works"""
        sample_rate = 16000
        channels_first = True
        effects = [['band', '300', '10']]
        # mp3 cannot be detected from content, so the format must be given explicitly.
        format_ = ext if ext in ['mp3'] else None
        audio_file = f'input.{ext}'
        input_path = self.get_temp_path(audio_file)
        reference_path = self.get_temp_path('reference.wav')
        archive_path = self.get_temp_path('archive.tar.gz')

        sox_utils.gen_audio_file(
            input_path, sample_rate, num_channels=2, compression=compression)
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        # NOTE(review): tarfile.TarFile(..., 'w') writes an *uncompressed* archive even
        # though the file name says .tar.gz; tarfile.open(..., 'w:gz') would gzip it.
        # The read below uses the same class, so the pair is internally consistent.
        with tarfile.TarFile(archive_path, 'w') as tarobj:
            tarobj.add(input_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, 'r') as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            found, sr = sox_effects.apply_effects_file(
                fileobj, effects, channels_first=channels_first, format=format_)
        # Saved for manual inspection when the assertion below fails.
        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)
        assert sr == expected_sr
        self.assertEqual(found, expected)
@skipIfNoSox
@skipIfNoExec('sox')
@skipIfNoModule("requests")
class TestFileObjectHttp(HttpServerMixin, PytorchTestCase):
    """`apply_effects_file` works on a streaming HTTP response body."""
    @parameterized.expand([
        ('wav', None),
        ('mp3', 128),
        ('mp3', 320),
        ('flac', 0),
        ('flac', 5),
        ('flac', 8),
        ('vorbis', -1),
        ('vorbis', 10),
        ('amb', None),
    ])
    def test_requests(self, ext, compression):
        """Applying effects to a file served over HTTP matches the sox reference."""
        sample_rate = 16000
        channels_first = True
        effects = [['band', '300', '10']]
        # mp3 cannot be detected from content, so the format must be given explicitly.
        format_ = ext if ext in ['mp3'] else None
        audio_file = f'input.{ext}'
        input_path = self.get_temp_path(audio_file)
        reference_path = self.get_temp_path('reference.wav')

        sox_utils.gen_audio_file(
            input_path, sample_rate, num_channels=2, compression=compression)
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        url = self.get_url(audio_file)
        # stream=True keeps resp.raw readable as a file-like object.
        with requests.get(url, stream=True) as resp:
            found, sr = sox_effects.apply_effects_file(
                resp.raw, effects, channels_first=channels_first, format=format_)
        # Saved for manual inspection when the assertion below fails.
        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)
        assert sr == expected_sr
        self.assertEqual(found, expected)
from typing import List
import torch
from torchaudio import sox_effects
from parameterized import parameterized
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
skipIfNoSox,
get_sinusoid,
save_wav,
torch_script,
)
from .common import (
load_params,
)
class SoxEffectTensorTransform(torch.nn.Module):
    """TorchScript-scriptable wrapper around `sox_effects.apply_effects_tensor`."""
    # TorchScript needs an explicit type annotation for this attribute.
    effects: List[List[str]]

    def __init__(self, effects: List[List[str]], sample_rate: int, channels_first: bool):
        super().__init__()
        self.effects = effects
        self.sample_rate = sample_rate
        self.channels_first = channels_first

    def forward(self, tensor: torch.Tensor):
        # Returns the (waveform, sample_rate) pair from apply_effects_tensor.
        return sox_effects.apply_effects_tensor(
            tensor, self.sample_rate, self.effects, self.channels_first)
class SoxEffectFileTransform(torch.nn.Module):
    """TorchScript-scriptable wrapper around `sox_effects.apply_effects_file`."""
    # TorchScript needs explicit type annotations for these attributes.
    effects: List[List[str]]
    channels_first: bool

    def __init__(self, effects: List[List[str]], channels_first: bool):
        super().__init__()
        self.effects = effects
        self.channels_first = channels_first

    def forward(self, path: str):
        # Fix: pass channels_first by keyword. The third positional parameter of
        # `apply_effects_file` is `normalize`, so the previous positional call
        # silently set `normalize` instead of `channels_first`. (Behavior here is
        # unchanged for the channels_first=True value used by the tests, since
        # both parameters default to True.)
        return sox_effects.apply_effects_file(
            path, self.effects, channels_first=self.channels_first)
@skipIfNoSox
class TestTorchScript(TempDirMixin, TorchaudioTestCase):
    """Scripted sox_effects wrappers must match their eager counterparts."""
    @parameterized.expand(
        load_params("sox_effect_test_args.jsonl"),
        # Test name encodes the case index and the name of the first effect.
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects_tensor(self, args):
        """Scripted `apply_effects_tensor` returns the same result as the eager call."""
        effects = args['effects']
        channels_first = True
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        trans = SoxEffectTensorTransform(effects, input_sr, channels_first)

        trans = torch_script(trans)

        wav = get_sinusoid(
            frequency=800, sample_rate=input_sr,
            n_channels=num_channels, dtype='float32', channels_first=channels_first)
        found, sr_found = trans(wav)
        expected, sr_expected = sox_effects.apply_effects_tensor(
            wav, input_sr, effects, channels_first)

        assert sr_found == sr_expected
        self.assertEqual(expected, found)

    @parameterized.expand(
        load_params("sox_effect_test_args.jsonl"),
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects_file(self, args):
        """Scripted `apply_effects_file` returns the same result as the eager call."""
        effects = args['effects']
        channels_first = True
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        trans = SoxEffectFileTransform(effects, channels_first)

        trans = torch_script(trans)

        path = self.get_temp_path('input.wav')
        wav = get_sinusoid(
            frequency=800, sample_rate=input_sr,
            n_channels=num_channels, dtype='float32', channels_first=channels_first)
        save_wav(path, wav, sample_rate=input_sr, channels_first=channels_first)

        found, sr_found = trans(path)
        # Fix: pass channels_first by keyword; the third positional parameter of
        # `apply_effects_file` is `normalize`, so the previous positional call set
        # `normalize` instead. (No behavior change for the True value used here.)
        expected, sr_expected = sox_effects.apply_effects_file(
            path, effects, channels_first=channels_first)

        assert sr_found == sr_expected
        self.assertEqual(expected, found)
from torchaudio_unittest.common_utils import PytorchTestCase
from .autograd_test_impl import AutogradTestMixin, AutogradTestFloat32
class AutogradCPUTest(AutogradTestMixin, PytorchTestCase):
    # Run the shared autograd test suite on CPU.
    device = 'cpu'
class AutogradRNNTCPUTest(AutogradTestFloat32, PytorchTestCase):
    # Run the float32 (RNNT) autograd test suite on CPU.
    device = 'cpu'
from torchaudio_unittest.common_utils import (
PytorchTestCase,
skipIfNoCuda,
)
from .autograd_test_impl import AutogradTestMixin, AutogradTestFloat32
@skipIfNoCuda
class AutogradCUDATest(AutogradTestMixin, PytorchTestCase):
    # Run the shared autograd test suite on CUDA.
    device = 'cuda'
@skipIfNoCuda
class AutogradRNNTCUDATest(AutogradTestFloat32, PytorchTestCase):
    # Run the float32 (RNNT) autograd test suite on CUDA.
    device = 'cuda'
from typing import List
import unittest
from parameterized import parameterized
import torch
from torch.autograd import gradcheck, gradgradcheck
import torchaudio.transforms as T
from torchaudio_unittest.common_utils import (
TestBaseMixin,
get_whitenoise,
get_spectrogram,
nested_params,
rnnt_utils,
)
class _DeterministicWrapper(torch.nn.Module):
"""Helper transform wrapper to make the given transform deterministic"""
def __init__(self, transform, seed=0):
super().__init__()
self.seed = seed
self.transform = transform
def forward(self, input: torch.Tensor):
torch.random.manual_seed(self.seed)
return self.transform(input)
class AutogradTestMixin(TestBaseMixin):
    """Shared gradcheck/gradgradcheck tests for torchaudio transforms.

    Subclasses supply ``device`` via ``TestBaseMixin``.
    """
    def assert_grad(
            self,
            transform: torch.nn.Module,
            inputs: List[torch.Tensor],
            *,
            nondet_tol: float = 0.0,
    ):
        """Run gradcheck and gradgradcheck on ``transform`` applied to ``inputs``.

        Tensor inputs are converted to double (complex double for complex
        tensors), moved to ``self.device`` and marked as requiring grad.
        """
        transform = transform.to(dtype=torch.float64, device=self.device)

        # gradcheck and gradgradcheck only pass if the input tensors are of dtype `torch.double` or
        # `torch.cdouble`, when the default eps and tolerance values are used.
        inputs_ = []
        for i in inputs:
            if torch.is_tensor(i):
                i = i.to(
                    dtype=torch.cdouble if i.is_complex() else torch.double,
                    device=self.device)
                i.requires_grad = True
            inputs_.append(i)
        assert gradcheck(transform, inputs_)
        assert gradgradcheck(transform, inputs_, nondet_tol=nondet_tol)

    @parameterized.expand([
        ({'pad': 0, 'normalized': False, 'power': None, 'return_complex': True}, ),
        ({'pad': 3, 'normalized': False, 'power': None, 'return_complex': True}, ),
        ({'pad': 0, 'normalized': True, 'power': None, 'return_complex': True}, ),
        ({'pad': 3, 'normalized': True, 'power': None, 'return_complex': True}, ),
        ({'pad': 0, 'normalized': False, 'power': None}, ),
        ({'pad': 3, 'normalized': False, 'power': None}, ),
        ({'pad': 0, 'normalized': True, 'power': None}, ),
        ({'pad': 3, 'normalized': True, 'power': None}, ),
        ({'pad': 0, 'normalized': False, 'power': 1.0}, ),
        ({'pad': 3, 'normalized': False, 'power': 1.0}, ),
        ({'pad': 0, 'normalized': True, 'power': 1.0}, ),
        ({'pad': 3, 'normalized': True, 'power': 1.0}, ),
        ({'pad': 0, 'normalized': False, 'power': 2.0}, ),
        ({'pad': 3, 'normalized': False, 'power': 2.0}, ),
        ({'pad': 0, 'normalized': True, 'power': 2.0}, ),
        ({'pad': 3, 'normalized': True, 'power': 2.0}, ),
    ])
    def test_spectrogram(self, kwargs):
        """Spectrogram is differentiable for all pad/normalized/power combinations."""
        # replication_pad1d_backward_cuda is not deterministic and
        # gives very small (~2.7756e-17) difference.
        #
        # See https://github.com/pytorch/pytorch/issues/54093
        transform = T.Spectrogram(**kwargs)
        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform], nondet_tol=1e-10)

    @parameterized.expand([(False, ), (True, )])
    def test_inverse_spectrogram(self, return_complex):
        """InverseSpectrogram is differentiable for complex and real-view input."""
        # create a realistic input:
        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
        length = waveform.shape[-1]
        spectrogram = get_spectrogram(waveform, n_fft=400)
        if not return_complex:
            spectrogram = torch.view_as_real(spectrogram)

        # test
        inv_transform = T.InverseSpectrogram(n_fft=400)
        self.assert_grad(inv_transform, [spectrogram, length])

    def test_melspectrogram(self):
        """MelSpectrogram is differentiable."""
        # replication_pad1d_backward_cuda is not deterministic and
        # gives very small (~2.7756e-17) difference.
        #
        # See https://github.com/pytorch/pytorch/issues/54093
        sample_rate = 8000
        transform = T.MelSpectrogram(sample_rate=sample_rate)
        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform], nondet_tol=1e-10)

    @nested_params(
        [0, 0.99],
        [False, True],
    )
    def test_griffinlim(self, momentum, rand_init):
        """GriffinLim is differentiable; wrapped to fix its internal randomness."""
        n_fft = 400
        power = 1
        n_iter = 3

        spec = get_spectrogram(
            get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2),
            n_fft=n_fft, power=power)
        transform = _DeterministicWrapper(
            T.GriffinLim(n_fft=n_fft, n_iter=n_iter, momentum=momentum, rand_init=rand_init, power=power))
        self.assert_grad(transform, [spec])

    @parameterized.expand([(False, ), (True, )])
    def test_mfcc(self, log_mels):
        """MFCC is differentiable for both log-mel settings."""
        sample_rate = 8000
        transform = T.MFCC(sample_rate=sample_rate, log_mels=log_mels)
        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform])

    @parameterized.expand([(False, ), (True, )])
    def test_lfcc(self, log_lf):
        """LFCC is differentiable for both log-lf settings."""
        sample_rate = 8000
        transform = T.LFCC(sample_rate=sample_rate, log_lf=log_lf)
        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform])

    def test_compute_deltas(self):
        """ComputeDeltas is differentiable."""
        transform = T.ComputeDeltas()
        spec = torch.rand(10, 20)
        self.assert_grad(transform, [spec])

    @parameterized.expand([(8000, 8000), (8000, 4000), (4000, 8000)])
    def test_resample(self, orig_freq, new_freq):
        """Resample is differentiable for same-, down- and up-sampling."""
        transform = T.Resample(orig_freq=orig_freq, new_freq=new_freq)
        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform])

    @parameterized.expand([("linear", ), ("exponential", ), ("logarithmic", ), ("quarter_sine", ), ("half_sine", )])
    def test_fade(self, fade_shape):
        """Fade is differentiable for every fade shape."""
        transform = T.Fade(fade_shape=fade_shape)
        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform], nondet_tol=1e-10)

    @parameterized.expand([(T.TimeMasking,), (T.FrequencyMasking,)])
    def test_masking(self, masking_transform):
        """Time/Frequency masking is differentiable (randomness fixed by wrapper)."""
        sample_rate = 8000
        n_fft = 400
        spectrogram = get_spectrogram(
            get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2),
            n_fft=n_fft, power=1)
        deterministic_transform = _DeterministicWrapper(masking_transform(400))
        self.assert_grad(deterministic_transform, [spectrogram])

    @parameterized.expand([(T.TimeMasking,), (T.FrequencyMasking,)])
    def test_masking_iid(self, masking_transform):
        """Per-example (iid_masks=True) masking is differentiable on batched input."""
        sample_rate = 8000
        n_fft = 400
        specs = [get_spectrogram(
            get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2, seed=i),
            n_fft=n_fft, power=1)
            for i in range(3)
        ]

        batch = torch.stack(specs)
        assert batch.ndim == 4

        deterministic_transform = _DeterministicWrapper(masking_transform(400, True))
        self.assert_grad(deterministic_transform, [batch])

    def test_spectral_centroid(self):
        """SpectralCentroid is differentiable."""
        sample_rate = 8000
        transform = T.SpectralCentroid(sample_rate=sample_rate)
        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform], nondet_tol=1e-10)

    def test_amplitude_to_db(self):
        """AmplitudeToDB is differentiable."""
        sample_rate = 8000
        transform = T.AmplitudeToDB()
        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform])

    def test_melscale(self):
        """MelScale is differentiable."""
        sample_rate = 8000
        n_fft = 400
        n_mels = n_fft // 2 + 1
        transform = T.MelScale(sample_rate=sample_rate, n_mels=n_mels)
        spec = get_spectrogram(
            get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2),
            n_fft=n_fft, power=1)
        self.assert_grad(transform, [spec])

    @parameterized.expand([(1.5, "amplitude"), (2, "power"), (10, "db")])
    def test_vol(self, gain, gain_type):
        """Vol is differentiable for amplitude, power and db gains."""
        sample_rate = 8000
        transform = T.Vol(gain=gain, gain_type=gain_type)
        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
        self.assert_grad(transform, [waveform])

    @parameterized.expand([
        ({'cmn_window': 100, 'min_cmn_window': 50, 'center': False, 'norm_vars': False}, ),
        ({'cmn_window': 100, 'min_cmn_window': 50, 'center': True, 'norm_vars': False}, ),
        ({'cmn_window': 100, 'min_cmn_window': 50, 'center': False, 'norm_vars': True}, ),
        ({'cmn_window': 100, 'min_cmn_window': 50, 'center': True, 'norm_vars': True}, ),
    ])
    def test_sliding_window_cmn(self, kwargs):
        """SlidingWindowCmn is differentiable for center/norm_vars combinations."""
        n_fft = 10
        power = 1
        spec = get_spectrogram(
            get_whitenoise(sample_rate=200, duration=0.05, n_channels=2),
            n_fft=n_fft, power=power)
        # SlidingWindowCmn expects (..., time, freq), so swap the last two dims.
        spec_reshaped = spec.transpose(-1, -2)

        transform = T.SlidingWindowCmn(**kwargs)
        self.assert_grad(transform, [spec_reshaped])

    @unittest.expectedFailure
    def test_timestretch_zeros_fail(self):
        """Test that ``T.TimeStretch`` fails gradcheck at 0

        This is because ``F.phase_vocoder`` converts data from cartesian to polar coordinate,
        which performs ``atan2(img, real)``, and gradient is not defined at 0.
        """
        n_fft = 16
        transform = T.TimeStretch(n_freq=n_fft // 2 + 1, fixed_rate=0.99)
        waveform = torch.zeros(2, 40)
        spectrogram = get_spectrogram(waveform, n_fft=n_fft, power=None)
        self.assert_grad(transform, [spectrogram])

    @nested_params(
        [0.7, 0.8, 0.9, 1.0, 1.3],
        [False, True],
    )
    def test_timestretch_non_zero(self, rate, test_pseudo_complex):
        """Verify that ``T.TimeStretch`` does not fail if it's not close to 0

        ``T.TimeStrech`` is not differentiable around 0, so this test checks the differentiability
        for cases where input is not zero.

        As tested above, when spectrogram contains values close to zero, the gradients are unstable
        and gradcheck fails.

        In this test, we generate spectrogram from random signal, then we push the points around
        zero away from the origin.

        This process does not reflect the real use-case, and it is not practical for users, but
        this helps us understand to what degree the function is differentiable and when not.
        """
        n_fft = 16
        transform = T.TimeStretch(n_freq=n_fft // 2 + 1, fixed_rate=rate)
        waveform = get_whitenoise(sample_rate=40, duration=1, n_channels=2)
        spectrogram = get_spectrogram(waveform, n_fft=n_fft, power=None)

        # 1e-3 is too small (on CPU)
        epsilon = 1e-2
        too_close = spectrogram.abs() < epsilon
        # Renormalize near-zero bins to magnitude epsilon, keeping their phase.
        spectrogram[too_close] = epsilon * spectrogram[too_close] / spectrogram[too_close].abs()
        if test_pseudo_complex:
            spectrogram = torch.view_as_real(spectrogram)
        self.assert_grad(transform, [spectrogram])

    def test_psd(self):
        """PSD is differentiable."""
        transform = T.PSD()
        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
        spectrogram = get_spectrogram(waveform, n_fft=400)
        self.assert_grad(transform, [spectrogram])

    @parameterized.expand([
        [True],
        [False],
    ])
    def test_psd_with_mask(self, multi_mask):
        """PSD with a (multi-channel) mask input is differentiable."""
        transform = T.PSD(multi_mask=multi_mask)
        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
        spectrogram = get_spectrogram(waveform, n_fft=400)
        if multi_mask:
            mask = torch.rand(spectrogram.shape[-3:])
        else:
            mask = torch.rand(spectrogram.shape[-2:])
        self.assert_grad(transform, [spectrogram, mask])

    @parameterized.expand([
        "ref_channel",
        # stv_power test time too long, comment for now
        # "stv_power",
        # stv_evd will fail since the eigenvalues are not distinct
        # "stv_evd",
    ])
    def test_mvdr(self, solution):
        """MVDR beamforming is differentiable for the ref_channel solution."""
        transform = T.MVDR(solution=solution)
        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
        spectrogram = get_spectrogram(waveform, n_fft=400)
        mask_s = torch.rand(spectrogram.shape[-2:])
        mask_n = torch.rand(spectrogram.shape[-2:])
        self.assert_grad(transform, [spectrogram, mask_s, mask_n])
class AutogradTestFloat32(TestBaseMixin):
    """Gradient checks for transforms that only support float32 (e.g. RNNT loss)."""

    def assert_grad(
            self,
            transform: torch.nn.Module,
            inputs: List[torch.Tensor],
    ):
        """Run ``gradcheck`` on ``transform`` with tensor inputs cast to
        float32 on ``self.device``.
        """
        inputs_ = []
        for i in inputs:
            if torch.is_tensor(i):
                i = i.to(dtype=torch.float32, device=self.device)
            inputs_.append(i)
        # gradcheck with float32 requires higher atol and epsilon.
        # Fix: check the converted tensors (`inputs_`). Previously the raw
        # `inputs` were passed, so the dtype/device conversion above was dead code.
        assert gradcheck(transform, inputs_, eps=1e-3, atol=1e-3, nondet_tol=0.)

    @parameterized.expand([
        (rnnt_utils.get_B1_T10_U3_D4_data, ),
        (rnnt_utils.get_B2_T4_U3_D3_data, ),
        (rnnt_utils.get_B1_T2_U3_D5_data, ),
    ])
    def test_rnnt_loss(self, data_func):
        """RNNTLoss passes gradcheck on small fixture batches."""
        def get_data(data_func, device):
            # Some fixtures return (data, ...) tuples; unwrap to the dict.
            data = data_func()
            if isinstance(data, tuple):
                data = data[0]
            return data

        data = get_data(data_func, self.device)
        inputs = (
            data["logits"].to(torch.float32),
            data["targets"],
            data["logit_lengths"],
            data["target_lengths"],
        )
        loss = T.RNNTLoss(blank=data["blank"])

        self.assert_grad(loss, inputs)
"""Test numerical consistency among single input and batched input."""
import torch
from parameterized import parameterized
from torchaudio import transforms as T
from torchaudio_unittest import common_utils
class TestTransforms(common_utils.TorchaudioTestCase):
"""Test suite for classes defined in `transforms` module"""
backend = 'default'
def assert_batch_consistency(
        self, transform, batch, *args, atol=1e-8, rtol=1e-5, seed=42,
        **kwargs):
    """Assert that running ``transform`` on ``batch`` equals running it on
    each item separately and stacking the results.

    The RNG is re-seeded with the same ``seed`` before both runs so random
    transforms behave identically in each.
    """
    n = batch.size(0)

    # Compute items separately, then batch the result
    torch.random.manual_seed(seed)
    items_input = batch.clone()
    items_result = torch.stack([
        transform(items_input[i], *args, **kwargs) for i in range(n)
    ])

    # Batch the input and run
    torch.random.manual_seed(seed)
    batch_input = batch.clone()
    batch_result = transform(batch_input, *args, **kwargs)

    # The transform must not mutate its input in either mode.
    self.assertEqual(items_input, batch_input, rtol=rtol, atol=atol)
    self.assertEqual(items_result, batch_result, rtol=rtol, atol=atol)
def test_batch_AmplitudeToDB(self):
spec = torch.rand((3, 2, 6, 201))
transform = T.AmplitudeToDB()
self.assert_batch_consistency(transform, spec)
def test_batch_Resample(self):
waveform = torch.randn(3, 2, 2786)
transform = T.Resample()
self.assert_batch_consistency(transform, waveform)
def test_batch_MelScale(self):
specgram = torch.randn(3, 2, 201, 256)
transform = T.MelScale()
self.assert_batch_consistency(transform, specgram)
def test_batch_InverseMelScale(self):
n_mels = 32
n_stft = 5
mel_spec = torch.randn(3, 2, n_mels, 32) ** 2
transform = T.InverseMelScale(n_stft, n_mels)
# Because InverseMelScale runs SGD on randomly initialized values so they do not yield
# exactly same result. For this reason, tolerance is very relaxed here.
self.assert_batch_consistency(transform, mel_spec, atol=1.0, rtol=1e-5)
def test_batch_compute_deltas(self):
specgram = torch.randn(3, 2, 31, 2786)
transform = T.ComputeDeltas()
self.assert_batch_consistency(transform, specgram)
def test_batch_mulaw(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
# Single then transform then batch
expected = [T.MuLawEncoding()(waveform[i]) for i in range(3)]
expected = torch.stack(expected)
# Batch then transform
computed = T.MuLawEncoding()(waveform)
# shape = (3, 2, 201, 1394)
self.assertEqual(computed, expected)
# Single then transform then batch
expected_decoded = [T.MuLawDecoding()(expected[i]) for i in range(3)]
expected_decoded = torch.stack(expected_decoded)
# Batch then transform
computed_decoded = T.MuLawDecoding()(computed)
# shape = (3, 2, 201, 1394)
self.assertEqual(computed_decoded, expected_decoded)
def test_batch_spectrogram(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
transform = T.Spectrogram()
self.assert_batch_consistency(transform, waveform)
def test_batch_inverse_spectrogram(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
specgram = common_utils.get_spectrogram(waveform, n_fft=400)
specgram = specgram.reshape(3, 2, specgram.shape[-2], specgram.shape[-1])
transform = T.InverseSpectrogram(n_fft=400)
self.assert_batch_consistency(transform, specgram)
def test_batch_melspectrogram(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
transform = T.MelSpectrogram()
self.assert_batch_consistency(transform, waveform)
def test_batch_mfcc(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
transform = T.MFCC()
self.assert_batch_consistency(transform, waveform, atol=1e-4, rtol=1e-5)
def test_batch_lfcc(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
transform = T.LFCC()
self.assert_batch_consistency(transform, waveform, atol=1e-4, rtol=1e-5)
@parameterized.expand([(True, ), (False, )])
def test_batch_TimeStretch(self, test_pseudo_complex):
rate = 2
num_freq = 1025
num_frames = 400
batch = 3
spec = torch.randn(batch, num_freq, num_frames, dtype=torch.complex64)
if test_pseudo_complex:
spec = torch.view_as_real(spec)
transform = T.TimeStretch(
fixed_rate=rate,
n_freq=num_freq,
hop_length=512
)
self.assert_batch_consistency(transform, spec, atol=1e-5, rtol=1e-5)
def test_batch_Fade(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
fade_in_len = 3000
fade_out_len = 3000
transform = T.Fade(fade_in_len, fade_out_len)
self.assert_batch_consistency(transform, waveform)
def test_batch_Vol(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
transform = T.Vol(gain=1.1)
self.assert_batch_consistency(transform, waveform)
def test_batch_spectral_centroid(self):
sample_rate = 44100
waveform = common_utils.get_whitenoise(sample_rate=sample_rate, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
transform = T.SpectralCentroid(sample_rate)
self.assert_batch_consistency(transform, waveform)
def test_batch_pitch_shift(self):
sample_rate = 8000
n_steps = -2
waveform = common_utils.get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=6)
waveform = waveform.reshape(3, 2, -1)
transform = T.PitchShift(sample_rate, n_steps, n_fft=400)
self.assert_batch_consistency(transform, waveform)
def test_batch_PSD(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
specgram = common_utils.get_spectrogram(waveform, n_fft=400)
specgram = specgram.reshape(3, 2, specgram.shape[-2], specgram.shape[-1])
transform = T.PSD()
self.assert_batch_consistency(transform, specgram)
def test_batch_PSD_with_mask(self):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.to(torch.double)
specgram = common_utils.get_spectrogram(waveform, n_fft=400)
specgram = specgram.reshape(3, 2, specgram.shape[-2], specgram.shape[-1])
mask = torch.rand((3, specgram.shape[-2], specgram.shape[-1]))
transform = T.PSD()
# Single then transform then batch
expected = [transform(specgram[i], mask[i]) for i in range(3)]
expected = torch.stack(expected)
# Batch then transform
computed = transform(specgram, mask)
self.assertEqual(computed, expected)
@parameterized.expand([
[True],
[False],
])
def test_MVDR(self, multi_mask):
waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
waveform = waveform.to(torch.double)
specgram = common_utils.get_spectrogram(waveform, n_fft=400)
specgram = specgram.reshape(3, 2, specgram.shape[-2], specgram.shape[-1])
if multi_mask:
mask_s = torch.rand((3, 2, specgram.shape[-2], specgram.shape[-1]))
mask_n = torch.rand((3, 2, specgram.shape[-2], specgram.shape[-1]))
else:
mask_s = torch.rand((3, specgram.shape[-2], specgram.shape[-1]))
mask_n = torch.rand((3, specgram.shape[-2], specgram.shape[-1]))
transform = T.MVDR(multi_mask=multi_mask)
# Single then transform then batch
expected = [transform(specgram[i], mask_s[i], mask_n[i]) for i in range(3)]
expected = torch.stack(expected)
# Batch then transform
computed = transform(specgram, mask_s, mask_n)
self.assertEqual(computed, expected)
import torch
from torchaudio_unittest import common_utils
from .kaldi_compatibility_impl import Kaldi
class TestKaldiFloat32(Kaldi, common_utils.PytorchTestCase):
    """Run the Kaldi compatibility suite on CPU with float32 tensors."""
    device = torch.device('cpu')
    dtype = torch.float32
class TestKaldiFloat64(Kaldi, common_utils.PytorchTestCase):
    """Run the Kaldi compatibility suite on CPU with float64 tensors."""
    device = torch.device('cpu')
    dtype = torch.float64
import torch
from torchaudio_unittest import common_utils
from .kaldi_compatibility_impl import Kaldi
@common_utils.skipIfNoCuda
class TestKaldiFloat32(Kaldi, common_utils.PytorchTestCase):
    """Run the Kaldi compatibility suite on CUDA with float32 tensors."""
    device = torch.device('cuda')
    dtype = torch.float32
@common_utils.skipIfNoCuda
class TestKaldiFloat64(Kaldi, common_utils.PytorchTestCase):
    """Run the Kaldi compatibility suite on CUDA with float64 tensors."""
    device = torch.device('cuda')
    dtype = torch.float64
"""Test suites for checking numerical compatibility against Kaldi"""
import torchaudio.compliance.kaldi
from parameterized import parameterized
from torchaudio_unittest.common_utils import (
TestBaseMixin,
TempDirMixin,
load_params,
skipIfNoExec,
get_asset_path,
load_wav,
)
from torchaudio_unittest.common_utils.kaldi_utils import (
convert_args,
run_kaldi,
)
class Kaldi(TempDirMixin, TestBaseMixin):
    """Check torchaudio.compliance.kaldi against the Kaldi executables.

    Each test computes a feature with torchaudio and with the corresponding
    Kaldi command-line tool run on the same wav file, then compares results.
    Tests are skipped when the required executable is not installed.
    """

    def assert_equal(self, output, *, expected, rtol=None, atol=None):
        """Compare ``output`` to ``expected`` after moving ``expected`` to
        the dtype/device under test."""
        expected = expected.to(dtype=self.dtype, device=self.device)
        self.assertEqual(output, expected, rtol=rtol, atol=atol)

    @parameterized.expand(load_params('kaldi_test_fbank_args.jsonl'))
    @skipIfNoExec('compute-fbank-feats')
    def test_fbank(self, kwargs):
        """fbank should be numerically compatible with compute-fbank-feats"""
        wave_file = get_asset_path('kaldi_file.wav')
        # normalize=False keeps integer-scale samples — presumably to match
        # what the Kaldi tool reads from the same file (TODO confirm).
        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
        result = torchaudio.compliance.kaldi.fbank(waveform, **kwargs)
        command = ['compute-fbank-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
        kaldi_result = run_kaldi(command, 'scp', wave_file)
        self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)

    @parameterized.expand(load_params('kaldi_test_spectrogram_args.jsonl'))
    @skipIfNoExec('compute-spectrogram-feats')
    def test_spectrogram(self, kwargs):
        """spectrogram should be numerically compatible with compute-spectrogram-feats"""
        wave_file = get_asset_path('kaldi_file.wav')
        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
        result = torchaudio.compliance.kaldi.spectrogram(waveform, **kwargs)
        command = ['compute-spectrogram-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
        kaldi_result = run_kaldi(command, 'scp', wave_file)
        self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)

    @parameterized.expand(load_params('kaldi_test_mfcc_args.jsonl'))
    @skipIfNoExec('compute-mfcc-feats')
    def test_mfcc(self, kwargs):
        """mfcc should be numerically compatible with compute-mfcc-feats"""
        wave_file = get_asset_path('kaldi_file.wav')
        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
        result = torchaudio.compliance.kaldi.mfcc(waveform, **kwargs)
        command = ['compute-mfcc-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
        kaldi_result = run_kaldi(command, 'scp', wave_file)
        self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
import torch
from torchaudio_unittest.common_utils import PytorchTestCase
from .librosa_compatibility_test_impl import TransformsTestBase
class TestTransforms(TransformsTestBase, PytorchTestCase):
    """Librosa-compatibility transform tests on CPU in float64."""
    device = torch.device('cpu')
    dtype = torch.float64
import torch
from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda
from .librosa_compatibility_test_impl import TransformsTestBase
@skipIfNoCuda
class TestTransforms(TransformsTestBase, PytorchTestCase):
    """Librosa-compatibility transform tests on CUDA in float64."""
    device = torch.device('cuda')
    dtype = torch.float64
import unittest
import torch
import torchaudio.transforms as T
from torchaudio._internal.module_utils import is_module_available
from parameterized import param, parameterized
from torchaudio_unittest.common_utils import (
TestBaseMixin,
get_whitenoise,
get_sinusoid,
get_spectrogram,
nested_params,
)
# Import librosa only when present; the test class below is skipped otherwise.
LIBROSA_AVAILABLE = is_module_available('librosa')
if LIBROSA_AVAILABLE:
    import librosa
@unittest.skipIf(not LIBROSA_AVAILABLE, "Librosa not available")
class TransformsTestBase(TestBaseMixin):
    """Compare torchaudio transforms against their librosa equivalents.

    Subclasses supply ``device`` and ``dtype``; each test computes the same
    feature with torchaudio and librosa and asserts numerical agreement.
    """

    @parameterized.expand([
        param(n_fft=400, hop_length=200, power=2.0),
        param(n_fft=600, hop_length=100, power=2.0),
        param(n_fft=400, hop_length=200, power=3.0),
        param(n_fft=200, hop_length=50, power=2.0),
    ])
    def test_Spectrogram(self, n_fft, hop_length, power):
        """T.Spectrogram should match librosa's spectrogram for various params."""
        sample_rate = 16000
        waveform = get_whitenoise(
            sample_rate=sample_rate, n_channels=1,
        ).to(self.device, self.dtype)
        expected = librosa.core.spectrum._spectrogram(
            y=waveform[0].cpu().numpy(),
            n_fft=n_fft, hop_length=hop_length, power=power)[0]
        result = T.Spectrogram(
            n_fft=n_fft, hop_length=hop_length, power=power,
        ).to(self.device, self.dtype)(waveform)[0]
        self.assertEqual(result, torch.from_numpy(expected), atol=1e-5, rtol=1e-5)

    def test_Spectrogram_complex(self):
        """Complex-valued T.Spectrogram magnitude should match librosa's
        power=1 spectrogram."""
        n_fft = 400
        hop_length = 200
        sample_rate = 16000
        waveform = get_whitenoise(
            sample_rate=sample_rate, n_channels=1,
        ).to(self.device, self.dtype)
        expected = librosa.core.spectrum._spectrogram(
            y=waveform[0].cpu().numpy(),
            n_fft=n_fft, hop_length=hop_length, power=1)[0]
        result = T.Spectrogram(
            n_fft=n_fft, hop_length=hop_length, power=None, return_complex=True,
        ).to(self.device, self.dtype)(waveform)[0]
        self.assertEqual(result.abs(), torch.from_numpy(expected), atol=1e-5, rtol=1e-5)

    @nested_params(
        [
            param(n_fft=400, hop_length=200, n_mels=64),
            param(n_fft=600, hop_length=100, n_mels=128),
            param(n_fft=200, hop_length=50, n_mels=32),
        ],
        [param(norm=norm) for norm in [None, 'slaney']],
        [param(mel_scale=mel_scale) for mel_scale in ['htk', 'slaney']],
    )
    def test_MelSpectrogram(self, n_fft, hop_length, n_mels, norm, mel_scale):
        """T.MelSpectrogram should match librosa.feature.melspectrogram."""
        sample_rate = 16000
        waveform = get_sinusoid(
            sample_rate=sample_rate, n_channels=1,
        ).to(self.device, self.dtype)
        expected = librosa.feature.melspectrogram(
            y=waveform[0].cpu().numpy(),
            sr=sample_rate, n_fft=n_fft,
            hop_length=hop_length, n_mels=n_mels, norm=norm,
            htk=mel_scale == "htk")
        result = T.MelSpectrogram(
            sample_rate=sample_rate, window_fn=torch.hann_window,
            hop_length=hop_length, n_mels=n_mels,
            n_fft=n_fft, norm=norm, mel_scale=mel_scale,
        ).to(self.device, self.dtype)(waveform)[0]
        self.assertEqual(result, torch.from_numpy(expected), atol=5e-4, rtol=1e-5)

    def test_magnitude_to_db(self):
        """T.AmplitudeToDB('magnitude') should match librosa's amplitude_to_db."""
        spectrogram = get_spectrogram(
            get_whitenoise(), n_fft=400, power=2).to(self.device, self.dtype)
        result = T.AmplitudeToDB('magnitude', 80.).to(self.device, self.dtype)(spectrogram)[0]
        expected = librosa.core.spectrum.amplitude_to_db(spectrogram[0].cpu().numpy())
        self.assertEqual(result, torch.from_numpy(expected))

    def test_power_to_db(self):
        """T.AmplitudeToDB('power') should match librosa's power_to_db."""
        spectrogram = get_spectrogram(
            get_whitenoise(), n_fft=400, power=2).to(self.device, self.dtype)
        result = T.AmplitudeToDB('power', 80.).to(self.device, self.dtype)(spectrogram)[0]
        expected = librosa.core.spectrum.power_to_db(spectrogram[0].cpu().numpy())
        self.assertEqual(result, torch.from_numpy(expected))

    @nested_params([
        param(n_fft=400, hop_length=200, n_mels=64, n_mfcc=40),
        param(n_fft=600, hop_length=100, n_mels=128, n_mfcc=20),
        param(n_fft=200, hop_length=50, n_mels=32, n_mfcc=25),
    ])
    def test_mfcc(self, n_fft, hop_length, n_mels, n_mfcc):
        """T.MFCC should match librosa MFCC computed on an equivalent mel spec."""
        sample_rate = 16000
        waveform = get_whitenoise(
            sample_rate=sample_rate, n_channels=1).to(self.device, self.dtype)
        result = T.MFCC(
            sample_rate=sample_rate, n_mfcc=n_mfcc, norm='ortho',
            melkwargs={'hop_length': hop_length, 'n_fft': n_fft, 'n_mels': n_mels},
        ).to(self.device, self.dtype)(waveform)[0]
        melspec = librosa.feature.melspectrogram(
            y=waveform[0].cpu().numpy(), sr=sample_rate, n_fft=n_fft,
            win_length=n_fft, hop_length=hop_length,
            n_mels=n_mels, htk=True, norm=None)
        expected = librosa.feature.mfcc(
            S=librosa.core.spectrum.power_to_db(melspec),
            n_mfcc=n_mfcc, dct_type=2, norm='ortho')
        self.assertEqual(result, torch.from_numpy(expected), atol=5e-4, rtol=1e-5)

    @parameterized.expand([
        param(n_fft=400, hop_length=200),
        param(n_fft=600, hop_length=100),
        param(n_fft=200, hop_length=50),
    ])
    def test_spectral_centroid(self, n_fft, hop_length):
        """T.SpectralCentroid should match librosa.feature.spectral_centroid."""
        sample_rate = 16000
        waveform = get_whitenoise(
            sample_rate=sample_rate, n_channels=1).to(self.device, self.dtype)
        result = T.SpectralCentroid(
            sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length,
        ).to(self.device, self.dtype)(waveform)
        expected = librosa.feature.spectral_centroid(
            y=waveform[0].cpu().numpy(), sr=sample_rate, n_fft=n_fft, hop_length=hop_length)
        self.assertEqual(result, torch.from_numpy(expected), atol=5e-4, rtol=1e-5)
import warnings
import torch
import torchaudio.transforms as T
from parameterized import parameterized
from torchaudio_unittest.common_utils import (
skipIfNoSox,
skipIfNoExec,
TempDirMixin,
TorchaudioTestCase,
get_asset_path,
sox_utils,
load_wav,
save_wav,
get_whitenoise,
)
@skipIfNoSox
@skipIfNoExec('sox')
class TestFunctionalFiltering(TempDirMixin, TorchaudioTestCase):
    """Compare filtering transforms (Fade, Vol, Vad) against the `sox` command."""

    def run_sox_effect(self, input_file, effect):
        """Apply ``effect`` to ``input_file`` via the sox command-line tool
        and return the loaded result."""
        output_file = self.get_temp_path('expected.wav')
        sox_utils.run_sox_effect(input_file, output_file, [str(e) for e in effect])
        return load_wav(output_file)

    def assert_sox_effect(self, result, input_path, effects, atol=1e-04, rtol=1e-5):
        """Assert that ``result`` matches what sox produces for ``effects``."""
        expected, _ = self.run_sox_effect(input_path, effects)
        self.assertEqual(result, expected, atol=atol, rtol=rtol)

    def get_whitenoise(self, sample_rate=8000):
        # NOTE: intentionally shadows the module-level ``get_whitenoise``;
        # this variant also saves the noise to a temp wav and returns its path.
        noise = get_whitenoise(
            sample_rate=sample_rate, duration=3, scale_factor=0.9,
        )
        path = self.get_temp_path("whitenoise.wav")
        save_wav(path, noise, sample_rate)
        return noise, path

    @parameterized.expand([
        ('q', 'quarter_sine'),
        ('h', 'half_sine'),
        ('t', 'linear'),
    ])
    def test_fade(self, fade_shape_sox, fade_shape):
        """T.Fade should match sox's `fade` effect for each fade shape."""
        fade_in_len, fade_out_len = 44100, 44100
        data, path = self.get_whitenoise(sample_rate=44100)
        result = T.Fade(fade_in_len, fade_out_len, fade_shape)(data)
        self.assert_sox_effect(result, path, ['fade', fade_shape_sox, '1', '0', '1'])

    @parameterized.expand([
        ('amplitude', 1.1),
        ('db', 2),
        ('power', 2),
    ])
    def test_vol(self, gain_type, gain):
        """T.Vol should match sox's `vol` effect for each gain type."""
        data, path = self.get_whitenoise()
        result = T.Vol(gain, gain_type)(data)
        self.assert_sox_effect(result, path, ['vol', f'{gain}', gain_type])

    @parameterized.expand(['vad-go-stereo-44100.wav', 'vad-go-mono-32000.wav'])
    def test_vad(self, filename):
        """T.Vad should match sox's `vad` effect on the bundled assets."""
        path = get_asset_path(filename)
        data, sample_rate = load_wav(path)
        result = T.Vad(sample_rate)(data)
        self.assert_sox_effect(result, path, ['vad'])

    def test_vad_warning(self):
        """vad should throw a warning if input dimension is greater than 2"""
        sample_rate = 41100
        data = torch.rand(5, 5, sample_rate)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            T.Vad(sample_rate)(data)
        assert len(w) == 1
        # 2D and 1D inputs must not warn.
        data = torch.rand(5, sample_rate)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            T.Vad(sample_rate)(data)
        assert len(w) == 0
        data = torch.rand(sample_rate)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            T.Vad(sample_rate)(data)
        assert len(w) == 0
import torch
from torchaudio_unittest.common_utils import PytorchTestCase
from .torchscript_consistency_impl import Transforms, TransformsFloat32Only, TransformsFloat64Only
class TestTransformsFloat32(Transforms, TransformsFloat32Only, PytorchTestCase):
    """TorchScript consistency suite on CPU, float32."""
    device = torch.device('cpu')
    dtype = torch.float32
class TestTransformsFloat64(Transforms, TransformsFloat64Only, PytorchTestCase):
    """TorchScript consistency suite on CPU, float64."""
    device = torch.device('cpu')
    dtype = torch.float64
import torch
from torchaudio_unittest.common_utils import skipIfNoCuda, PytorchTestCase
from .torchscript_consistency_impl import Transforms, TransformsFloat32Only, TransformsFloat64Only
@skipIfNoCuda
class TestTransformsFloat32(Transforms, TransformsFloat32Only, PytorchTestCase):
    """TorchScript consistency suite on CUDA, float32."""
    device = torch.device('cuda')
    dtype = torch.float32
@skipIfNoCuda
class TestTransformsFloat64(Transforms, TransformsFloat64Only, PytorchTestCase):
    """TorchScript consistency suite on CUDA, float64."""
    device = torch.device('cuda')
    dtype = torch.float64
"""Test suites for jit-ability and its numerical compatibility"""
import torch
import torchaudio.transforms as T
from parameterized import parameterized
from torchaudio_unittest import common_utils
from torchaudio_unittest.common_utils import (
skipIfRocm,
TestBaseMixin,
torch_script,
)
class Transforms(TestBaseMixin):
    """Implements test for Transforms that are performed for different devices"""

    def _assert_consistency(self, transform, tensor, *args):
        """Assert that a TorchScript-compiled ``transform`` produces the same
        output as eager execution for ``tensor`` (converted to the dtype and
        device under test)."""
        tensor = tensor.to(device=self.device, dtype=self.dtype)
        transform = transform.to(device=self.device, dtype=self.dtype)
        ts_transform = torch_script(transform)
        output = transform(tensor, *args)
        ts_output = ts_transform(tensor, *args)
        self.assertEqual(ts_output, output)

    def _assert_consistency_complex(self, transform, tensor, test_pseudo_complex=False, *args):
        """Same as ``_assert_consistency`` for complex inputs.

        When ``test_pseudo_complex`` is True the complex tensor is converted
        to its real view (``torch.view_as_real``) before being fed to both
        the eager and scripted transforms.
        """
        assert tensor.is_complex()
        tensor = tensor.to(device=self.device, dtype=self.complex_dtype)
        transform = transform.to(device=self.device, dtype=self.dtype)
        ts_transform = torch_script(transform)
        if test_pseudo_complex:
            tensor = torch.view_as_real(tensor)
        output = transform(tensor, *args)
        ts_output = ts_transform(tensor, *args)
        self.assertEqual(ts_output, output)

    def test_Spectrogram(self):
        tensor = torch.rand((1, 1000))
        self._assert_consistency(T.Spectrogram(), tensor)

    def test_Spectrogram_return_complex(self):
        tensor = torch.rand((1, 1000))
        self._assert_consistency(T.Spectrogram(power=None, return_complex=True), tensor)

    def test_InverseSpectrogram(self):
        tensor = common_utils.get_whitenoise(sample_rate=8000)
        spectrogram = common_utils.get_spectrogram(tensor, n_fft=400, hop_length=100)
        self._assert_consistency_complex(T.InverseSpectrogram(n_fft=400, hop_length=100), spectrogram)

    def test_InverseSpectrogram_pseudocomplex(self):
        # Uses the real-view representation, so goes through _assert_consistency.
        tensor = common_utils.get_whitenoise(sample_rate=8000)
        spectrogram = common_utils.get_spectrogram(tensor, n_fft=400, hop_length=100)
        spectrogram = torch.view_as_real(spectrogram)
        self._assert_consistency(T.InverseSpectrogram(n_fft=400, hop_length=100), spectrogram)

    @skipIfRocm
    def test_GriffinLim(self):
        # rand_init=False keeps GriffinLim deterministic across the two runs.
        tensor = torch.rand((1, 201, 6))
        self._assert_consistency(T.GriffinLim(length=1000, rand_init=False), tensor)

    def test_AmplitudeToDB(self):
        spec = torch.rand((6, 201))
        self._assert_consistency(T.AmplitudeToDB(), spec)

    def test_MelScale(self):
        spec_f = torch.rand((1, 201, 6))
        self._assert_consistency(T.MelScale(n_stft=201), spec_f)

    def test_MelSpectrogram(self):
        tensor = torch.rand((1, 1000))
        self._assert_consistency(T.MelSpectrogram(), tensor)

    def test_MFCC(self):
        tensor = torch.rand((1, 1000))
        self._assert_consistency(T.MFCC(), tensor)

    def test_LFCC(self):
        tensor = torch.rand((1, 1000))
        self._assert_consistency(T.LFCC(), tensor)

    def test_Resample(self):
        sr1, sr2 = 16000, 8000
        tensor = common_utils.get_whitenoise(sample_rate=sr1)
        self._assert_consistency(T.Resample(sr1, sr2), tensor)

    def test_ComplexNorm(self):
        tensor = torch.rand((1, 2, 201, 2))
        self._assert_consistency(T.ComplexNorm(), tensor)

    def test_MuLawEncoding(self):
        tensor = common_utils.get_whitenoise()
        self._assert_consistency(T.MuLawEncoding(), tensor)

    def test_MuLawDecoding(self):
        tensor = torch.rand((1, 10))
        self._assert_consistency(T.MuLawDecoding(), tensor)

    def test_Fade(self):
        waveform = common_utils.get_whitenoise()
        fade_in_len = 3000
        fade_out_len = 3000
        self._assert_consistency(T.Fade(fade_in_len, fade_out_len), waveform)

    def test_FrequencyMasking(self):
        tensor = torch.rand((10, 2, 50, 10, 2))
        self._assert_consistency(T.FrequencyMasking(freq_mask_param=60, iid_masks=False), tensor)

    def test_TimeMasking(self):
        tensor = torch.rand((10, 2, 50, 10, 2))
        self._assert_consistency(T.TimeMasking(time_mask_param=30, iid_masks=False), tensor)

    def test_Vol(self):
        waveform = common_utils.get_whitenoise()
        self._assert_consistency(T.Vol(1.1), waveform)

    def test_SlidingWindowCmn(self):
        tensor = torch.rand((1000, 10))
        self._assert_consistency(T.SlidingWindowCmn(), tensor)

    def test_Vad(self):
        filepath = common_utils.get_asset_path("vad-go-mono-32000.wav")
        waveform, sample_rate = common_utils.load_wav(filepath)
        self._assert_consistency(T.Vad(sample_rate=sample_rate), waveform)

    def test_SpectralCentroid(self):
        sample_rate = 44100
        waveform = common_utils.get_whitenoise(sample_rate=sample_rate)
        self._assert_consistency(T.SpectralCentroid(sample_rate=sample_rate), waveform)

    @parameterized.expand([(True, ), (False, )])
    def test_TimeStretch(self, test_pseudo_complex):
        n_freq = 400
        hop_length = 512
        fixed_rate = 1.3
        tensor = torch.view_as_complex(torch.rand((10, 2, n_freq, 10, 2)))
        self._assert_consistency_complex(
            T.TimeStretch(n_freq=n_freq, hop_length=hop_length, fixed_rate=fixed_rate),
            tensor,
            test_pseudo_complex
        )

    def test_PitchShift(self):
        sample_rate = 8000
        n_steps = 4
        waveform = common_utils.get_whitenoise(sample_rate=sample_rate)
        self._assert_consistency(
            T.PitchShift(sample_rate=sample_rate, n_steps=n_steps),
            waveform
        )

    def test_PSD(self):
        tensor = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
        spectrogram = common_utils.get_spectrogram(tensor, n_fft=400, hop_length=100)
        spectrogram = spectrogram.to(self.device)
        self._assert_consistency_complex(T.PSD(), spectrogram)

    def test_PSD_with_mask(self):
        tensor = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
        spectrogram = common_utils.get_spectrogram(tensor, n_fft=400, hop_length=100)
        spectrogram = spectrogram.to(self.device)
        mask = torch.rand(spectrogram.shape[-2:], device=self.device)
        # ``False`` is the test_pseudo_complex flag; ``mask`` is forwarded to PSD.
        self._assert_consistency_complex(T.PSD(), spectrogram, False, mask)
class TransformsFloat32Only(TestBaseMixin):
    """TorchScript consistency checks that only run in float32."""

    def test_rnnt_loss(self):
        """T.RNNTLoss scripted output should match eager execution."""
        logits = torch.tensor([[[[0.1, 0.6, 0.1, 0.1, 0.1],
                                 [0.1, 0.1, 0.6, 0.1, 0.1],
                                 [0.1, 0.1, 0.2, 0.8, 0.1]],
                                [[0.1, 0.6, 0.1, 0.1, 0.1],
                                 [0.1, 0.1, 0.2, 0.1, 0.1],
                                 [0.7, 0.1, 0.2, 0.1, 0.1]]]])
        device = logits.to(device=self.device, dtype=torch.float32).device

        def int_tensor(values):
            # Length/target tensors must be int32 on the device under test.
            return torch.tensor(values, device=device, dtype=torch.int32)

        targets = int_tensor([[1, 2]])
        logit_lengths = int_tensor([2])
        target_lengths = int_tensor([2])
        self._assert_consistency(T.RNNTLoss(), logits, targets, logit_lengths, target_lengths)
class TransformsFloat64Only(TestBaseMixin):
    """TorchScript consistency checks that only run in float64."""

    @parameterized.expand([
        [solution, online]
        for online in [True, False]
        for solution in ["ref_channel", "stv_evd", "stv_power"]
    ])
    def test_MVDR(self, solution, online):
        """T.MVDR scripted output should match eager execution (double precision)."""
        waveform = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
        specgram = common_utils.get_spectrogram(waveform, n_fft=400, hop_length=100)
        specgram = specgram.to(device=self.device, dtype=torch.cdouble)
        freq_time = specgram.shape[-2:]
        mask_s = torch.rand(freq_time, device=self.device)
        mask_n = torch.rand(freq_time, device=self.device)
        transform = T.MVDR(solution=solution, online=online)
        self._assert_consistency_complex(transform, specgram, False, mask_s, mask_n)
import torch
from torchaudio_unittest.common_utils import PytorchTestCase
from . transforms_test_impl import TransformsTestBase
class TransformsCPUFloat32Test(TransformsTestBase, PytorchTestCase):
    """Transforms test suite on CPU, float32."""
    dtype = torch.float32
    device = 'cpu'
class TransformsCPUFloat64Test(TransformsTestBase, PytorchTestCase):
    """Transforms test suite on CPU, float64."""
    dtype = torch.float64
    device = 'cpu'
import torch
from torchaudio_unittest.common_utils import (
PytorchTestCase,
skipIfNoCuda,
)
from . transforms_test_impl import TransformsTestBase
@skipIfNoCuda
class TransformsCUDAFloat32Test(TransformsTestBase, PytorchTestCase):
    """Transforms test suite on CUDA, float32."""
    dtype = torch.float32
    device = 'cuda'
@skipIfNoCuda
class TransformsCUDAFloat64Test(TransformsTestBase, PytorchTestCase):
    """Transforms test suite on CUDA, float64."""
    dtype = torch.float64
    device = 'cuda'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment