"orbit/utils/summary_manager.py" did not exist on "b5a482c403bb46f168433fe3305708c16e9db24f"
Commit 9dcc7a15 authored by flyingdown's avatar flyingdown
Browse files

init v0.10.0

parent db2b0b79
Pipeline #254 failed with stages
in 0 seconds
from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda
from .librosa_compatibility_test_impl import Functional, FunctionalComplex
@skipIfNoCuda
class TestFunctionalCUDA(Functional, PytorchTestCase):
    """Run the ``Functional`` librosa-compatibility suite with ``device`` set to CUDA."""

    device = 'cuda'
@skipIfNoCuda
class TestFunctionalComplexCUDA(FunctionalComplex, PytorchTestCase):
    """Run the ``FunctionalComplex`` librosa-compatibility suite with ``device`` set to CUDA."""

    device = 'cuda'
import unittest
from distutils.version import StrictVersion
import torch
from parameterized import param
import torchaudio.functional as F
from torchaudio._internal.module_utils import is_module_available
# librosa is optional: numpy and librosa are imported only when librosa is
# installed, and the test classes are skipped via LIBROSA_AVAILABLE otherwise.
LIBROSA_AVAILABLE = is_module_available('librosa')

if LIBROSA_AVAILABLE:
    import numpy as np
    import librosa
from torchaudio_unittest.common_utils import (
TestBaseMixin,
nested_params,
get_whitenoise,
get_spectrogram,
)
@unittest.skipIf(not LIBROSA_AVAILABLE, "Librosa not available")
class Functional(TestBaseMixin):
    """Test suite for functions in `functional` module.

    Each test computes a result with ``torchaudio.functional`` and compares it
    with the corresponding librosa function.  Subclasses provide ``device``.
    """
    dtype = torch.float64

    @nested_params([0, 0.99])
    def test_griffinlim(self, momentum):
        """Compare ``F.griffinlim`` with ``librosa.griffinlim`` on white noise."""
        # FFT params
        n_fft = 400
        win_length = n_fft
        hop_length = n_fft // 4
        window = torch.hann_window(win_length, device=self.device)
        power = 1
        # GriffinLim params
        n_iter = 8

        waveform = get_whitenoise(device=self.device, dtype=self.dtype)
        specgram = get_spectrogram(
            waveform, n_fft=n_fft, hop_length=hop_length, power=power,
            win_length=win_length, window=window)

        result = F.griffinlim(
            specgram,
            window=window,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            power=power,
            n_iter=n_iter,
            momentum=momentum,
            length=waveform.size(1),
            rand_init=False)
        # librosa is given the first channel only; ``[None, ...]`` restores the
        # leading dimension so the shapes match ``result``.
        expected = librosa.griffinlim(
            specgram[0].cpu().numpy(),
            n_iter=n_iter,
            hop_length=hop_length,
            momentum=momentum,
            init=None,
            length=waveform.size(1))[None, ...]
        self.assertEqual(result, torch.from_numpy(expected), atol=5e-5, rtol=1e-07)

    @nested_params(
        [
            param(),
            param(n_mels=128, sample_rate=44100),
            param(n_mels=128, fmin=2000.0, fmax=5000.0),
            param(n_mels=56, fmin=100.0, fmax=9000.0),
            param(n_mels=56, fmin=800.0, fmax=900.0),
            param(n_mels=56, fmin=1900.0, fmax=900.0),
            param(n_mels=10, fmin=1900.0, fmax=900.0),
        ],
        [param(norm=n) for n in [None, 'slaney']],
        [param(mel_scale=s) for s in ['htk', 'slaney']],
    )
    def test_create_mel_fb(self, n_mels=40, sample_rate=22050, n_fft=2048,
                           fmin=0.0, fmax=8000.0, norm=None, mel_scale="htk"):
        """Compare ``F.melscale_fbanks`` with the transposed ``librosa.filters.mel``."""
        if (norm == "slaney" and StrictVersion(librosa.__version__) < StrictVersion("0.7.2")):
            self.skipTest('Test is known to fail with older versions of librosa.')
        # Normalise through torch.device so the check behaves the same whether
        # the subclass sets ``device`` to a string or to a ``torch.device``.
        if torch.device(self.device).type != 'cpu':
            self.skipTest('No need to run this test on CUDA')

        expected = librosa.filters.mel(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mels,
            fmax=fmax,
            fmin=fmin,
            htk=mel_scale == "htk",
            norm=norm).T
        result = F.melscale_fbanks(
            sample_rate=sample_rate,
            n_mels=n_mels,
            f_max=fmax,
            f_min=fmin,
            n_freqs=(n_fft // 2 + 1),
            norm=norm,
            mel_scale=mel_scale)
        self.assertEqual(result, torch.from_numpy(expected), atol=7e-5, rtol=1.3e-6)

    def test_amplitude_to_DB_power(self):
        """Compare ``F.amplitude_to_DB`` (multiplier 10) with ``librosa.core.power_to_db``."""
        amin = 1e-10
        db_multiplier = 0.0
        top_db = 80.0
        multiplier = 10.0

        spec = get_spectrogram(get_whitenoise(device=self.device, dtype=self.dtype), power=2)
        result = F.amplitude_to_DB(spec, multiplier, amin, db_multiplier, top_db)
        expected = librosa.core.power_to_db(spec[0].cpu().numpy())[None, ...]
        self.assertEqual(result, torch.from_numpy(expected))

    def test_amplitude_to_DB(self):
        """Compare ``F.amplitude_to_DB`` (multiplier 20) with ``librosa.core.amplitude_to_db``."""
        amin = 1e-10
        db_multiplier = 0.0
        top_db = 80.0
        multiplier = 20.0

        spec = get_spectrogram(get_whitenoise(device=self.device, dtype=self.dtype), power=1)
        result = F.amplitude_to_DB(spec, multiplier, amin, db_multiplier, top_db)
        expected = librosa.core.amplitude_to_db(spec[0].cpu().numpy())[None, ...]
        self.assertEqual(result, torch.from_numpy(expected))
@unittest.skipIf(not LIBROSA_AVAILABLE, "Librosa not available")
class FunctionalComplex(TestBaseMixin):
    """librosa comparison tests for functions that take complex spectrograms."""

    @nested_params(
        [0.5, 1.01, 1.3],
        [True, False],
    )
    def test_phase_vocoder(self, rate, test_pseudo_complex):
        """Compare ``F.phase_vocoder`` with ``librosa.phase_vocoder``.

        When ``test_pseudo_complex`` is true the input is passed as a real view
        (``torch.view_as_real``) and the output is converted back with
        ``torch.view_as_complex`` before the comparison.
        """
        hop_length = 256
        num_freq = 1025
        num_frames = 400
        torch.random.manual_seed(42)

        # Double precision on purpose: because of the cumulative sum, float32
        # round-off makes the bottom-right values of the stretched spectrogram
        # differ from librosa.
        spec = torch.randn(num_freq, num_frames, device=self.device, dtype=torch.complex128)
        phase_advance = torch.linspace(
            0, np.pi * hop_length, num_freq,
            device=self.device, dtype=torch.float64)[..., None]

        vocoder_input = torch.view_as_real(spec) if test_pseudo_complex else spec
        stretched = F.phase_vocoder(vocoder_input, rate=rate, phase_advance=phase_advance)
        if test_pseudo_complex:
            stretched = torch.view_as_complex(stretched)

        expected_stretched = librosa.phase_vocoder(
            spec.cpu().numpy(), rate=rate, hop_length=hop_length)
        self.assertEqual(stretched, torch.from_numpy(expected_stretched))
import torch
import torchaudio.functional as F
from torchaudio_unittest.common_utils import (
skipIfNoSox,
skipIfNoExec,
TempDirMixin,
TorchaudioTestCase,
get_asset_path,
sox_utils,
load_wav,
save_wav,
get_whitenoise,
)
@skipIfNoSox
@skipIfNoExec('sox')
class TestFunctionalFiltering(TempDirMixin, TorchaudioTestCase):
    """Compare ``torchaudio.functional`` effects with the output of the sox command."""

    def run_sox_effect(self, input_file, effect):
        """Run ``effect`` on ``input_file`` via ``sox_utils`` and load the output wav."""
        output_file = self.get_temp_path('expected.wav')
        effect_args = [str(arg) for arg in effect]
        sox_utils.run_sox_effect(input_file, output_file, effect_args)
        return load_wav(output_file)

    def assert_sox_effect(self, result, input_path, effects, atol=1e-04, rtol=1e-5):
        """Assert ``result`` matches what ``effects`` produces from ``input_path``."""
        reference, _ = self.run_sox_effect(input_path, effects)
        self.assertEqual(result, reference, atol=atol, rtol=rtol)

    def get_whitenoise(self, sample_rate=8000):
        """Generate white noise, save it as a temp wav and return ``(noise, path)``."""
        noise = get_whitenoise(sample_rate=sample_rate, duration=3, scale_factor=0.9)
        wav_path = self.get_temp_path("whitenoise.wav")
        save_wav(wav_path, noise, sample_rate)
        return noise, wav_path

    def test_gain(self):
        wav_path = get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = load_wav(wav_path)
        actual = F.gain(waveform, 3)
        self.assert_sox_effect(actual, wav_path, ['gain', 3])

    def test_dither(self):
        wav_path = get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = load_wav(wav_path)
        actual = F.dither(waveform)
        self.assert_sox_effect(actual, wav_path, ['dither'])

    def test_dither_noise(self):
        wav_path = get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = load_wav(wav_path)
        actual = F.dither(waveform, noise_shaping=True)
        self.assert_sox_effect(actual, wav_path, ['dither', '-s'], atol=1.5e-4)

    def test_lowpass(self):
        sample_rate, cutoff_freq = 8000, 3000
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.lowpass_biquad(waveform, sample_rate, cutoff_freq)
        self.assert_sox_effect(actual, wav_path, ['lowpass', cutoff_freq], atol=1.5e-4)

    def test_highpass(self):
        sample_rate, cutoff_freq = 8000, 2000
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.highpass_biquad(waveform, sample_rate, cutoff_freq)
        self.assert_sox_effect(actual, wav_path, ['highpass', cutoff_freq], atol=1.5e-4)

    def test_allpass(self):
        sample_rate, central_freq, q = 8000, 1000, 0.707
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.allpass_biquad(waveform, sample_rate, central_freq, q)
        self.assert_sox_effect(actual, wav_path, ['allpass', central_freq, f'{q}q'])

    def test_bandpass_with_csg(self):
        sample_rate, central_freq, q = 8000, 1000, 0.707
        const_skirt_gain = True
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.bandpass_biquad(waveform, sample_rate, central_freq, q, const_skirt_gain)
        self.assert_sox_effect(actual, wav_path, ['bandpass', '-c', central_freq, f'{q}q'])

    def test_bandpass_without_csg(self):
        sample_rate, central_freq, q = 8000, 1000, 0.707
        const_skirt_gain = False
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.bandpass_biquad(waveform, sample_rate, central_freq, q, const_skirt_gain)
        self.assert_sox_effect(actual, wav_path, ['bandpass', central_freq, f'{q}q'])

    def test_bandreject(self):
        sample_rate, central_freq, q = 8000, 1000, 0.707
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.bandreject_biquad(waveform, sample_rate, central_freq, q)
        self.assert_sox_effect(actual, wav_path, ['bandreject', central_freq, f'{q}q'])

    def test_band_with_noise(self):
        sample_rate, central_freq, q = 8000, 1000, 0.707
        noise = True
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.band_biquad(waveform, sample_rate, central_freq, q, noise)
        self.assert_sox_effect(actual, wav_path, ['band', '-n', central_freq, f'{q}q'])

    def test_band_without_noise(self):
        sample_rate, central_freq, q = 8000, 1000, 0.707
        noise = False
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.band_biquad(waveform, sample_rate, central_freq, q, noise)
        self.assert_sox_effect(actual, wav_path, ['band', central_freq, f'{q}q'])

    def test_treble(self):
        sample_rate, central_freq, q, gain = 8000, 1000, 0.707, 40
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.treble_biquad(waveform, sample_rate, gain, central_freq, q)
        self.assert_sox_effect(actual, wav_path, ['treble', gain, central_freq, f'{q}q'])

    def test_bass(self):
        sample_rate, central_freq, q, gain = 8000, 1000, 0.707, 40
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.bass_biquad(waveform, sample_rate, gain, central_freq, q)
        self.assert_sox_effect(
            actual, wav_path, ['bass', gain, central_freq, f'{q}q'], atol=1.5e-4)

    def test_deemph(self):
        sample_rate = 44100
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.deemph_biquad(waveform, sample_rate)
        self.assert_sox_effect(actual, wav_path, ['deemph'])

    def test_riaa(self):
        sample_rate = 44100
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.riaa_biquad(waveform, sample_rate)
        self.assert_sox_effect(actual, wav_path, ['riaa'])

    def test_contrast(self):
        enhancement_amount = 80.
        waveform, wav_path = self.get_whitenoise()
        actual = F.contrast(waveform, enhancement_amount)
        self.assert_sox_effect(actual, wav_path, ['contrast', enhancement_amount])

    def test_dcshift_with_limiter(self):
        shift, limiter_gain = 0.5, 0.05
        waveform, wav_path = self.get_whitenoise()
        actual = F.dcshift(waveform, shift, limiter_gain)
        self.assert_sox_effect(actual, wav_path, ['dcshift', shift, limiter_gain])

    def test_dcshift_without_limiter(self):
        shift = 0.6
        waveform, wav_path = self.get_whitenoise()
        actual = F.dcshift(waveform, shift)
        self.assert_sox_effect(actual, wav_path, ['dcshift', shift])

    def test_overdrive(self):
        gain, colour = 30, 40
        waveform, wav_path = self.get_whitenoise()
        actual = F.overdrive(waveform, gain, colour)
        self.assert_sox_effect(actual, wav_path, ['overdrive', gain, colour])

    def test_phaser_sine(self):
        sample_rate = 8000
        # gain_in, gain_out, delay_ms, decay, speed -- same order for both APIs
        settings = (0.5, 0.8, 2.0, 0.4, 0.5)
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.phaser(waveform, sample_rate, *settings, sinusoidal=True)
        self.assert_sox_effect(actual, wav_path, ['phaser', *settings, '-s'])

    def test_phaser_triangle(self):
        sample_rate = 8000
        # gain_in, gain_out, delay_ms, decay, speed -- same order for both APIs
        settings = (0.5, 0.8, 2.0, 0.4, 0.5)
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.phaser(waveform, sample_rate, *settings, sinusoidal=False)
        self.assert_sox_effect(actual, wav_path, ['phaser', *settings, '-t'])

    def test_flanger_triangle_linear(self):
        sample_rate, phase = 8000, 30
        # delay, depth, regen, width, speed
        settings = (0.6, 0.87, 3.0, 0.9, 0.5)
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.flanger(
            waveform, sample_rate, *settings, phase,
            modulation='triangular', interpolation='linear')
        self.assert_sox_effect(
            actual, wav_path, ['flanger', *settings, 'triangle', phase, 'linear'])

    def test_flanger_triangle_quad(self):
        sample_rate, phase = 8000, 40
        # delay, depth, regen, width, speed
        settings = (0.8, 0.88, 3.0, 0.4, 0.5)
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.flanger(
            waveform, sample_rate, *settings, phase,
            modulation='triangular', interpolation='quadratic')
        self.assert_sox_effect(
            actual, wav_path, ['flanger', *settings, 'triangle', phase, 'quadratic'])

    def test_flanger_sine_linear(self):
        sample_rate, phase = 8000, 60
        # delay, depth, regen, width, speed
        settings = (0.8, 0.88, 3.0, 0.23, 1.3)
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.flanger(
            waveform, sample_rate, *settings, phase,
            modulation='sinusoidal', interpolation='linear')
        self.assert_sox_effect(
            actual, wav_path, ['flanger', *settings, 'sine', phase, 'linear'])

    def test_flanger_sine_quad(self):
        sample_rate, phase = 8000, 25
        # delay, depth, regen, width, speed
        settings = (0.9, 0.9, 4.0, 0.23, 1.3)
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.flanger(
            waveform, sample_rate, *settings, phase,
            modulation='sinusoidal', interpolation='quadratic')
        self.assert_sox_effect(
            actual, wav_path, ['flanger', *settings, 'sine', phase, 'quadratic'])

    def test_equalizer(self):
        sample_rate, center_freq, q, gain = 8000, 300, 0.707, 1
        waveform, wav_path = self.get_whitenoise(sample_rate)
        actual = F.equalizer_biquad(waveform, sample_rate, center_freq, gain, q)
        self.assert_sox_effect(actual, wav_path, ['equalizer', center_freq, q, gain])

    def test_perf_biquad_filtering(self):
        b0, b1, b2 = 0.4, 0.2, 0.9
        a0, a1, a2 = 0.7, 0.2, 0.6
        waveform, wav_path = self.get_whitenoise()
        a_coeffs = torch.tensor([a0, a1, a2])
        b_coeffs = torch.tensor([b0, b1, b2])
        actual = F.lfilter(waveform, a_coeffs, b_coeffs)
        self.assert_sox_effect(actual, wav_path, ['biquad', b0, b1, b2, a0, a1, a2])
import torch
from torchaudio_unittest.common_utils import PytorchTestCase
from .torchscript_consistency_impl import Functional, FunctionalFloat32Only
class TestFunctionalFloat32(Functional, FunctionalFloat32Only, PytorchTestCase):
    """TorchScript consistency suites (including the float32-only ones) on CPU, float32."""

    dtype = torch.float32
    device = torch.device('cpu')
class TestFunctionalFloat64(Functional, PytorchTestCase):
    """TorchScript consistency suite on CPU, float64."""

    dtype = torch.float64
    device = torch.device('cpu')
import torch
from torchaudio_unittest.common_utils import skipIfNoCuda, PytorchTestCase
from .torchscript_consistency_impl import Functional, FunctionalFloat32Only
@skipIfNoCuda
class TestFunctionalFloat32(Functional, FunctionalFloat32Only, PytorchTestCase):
    """TorchScript consistency suites (including the float32-only ones) on CUDA, float32."""

    dtype = torch.float32
    device = torch.device('cuda')
@skipIfNoCuda
class TestFunctionalFloat64(Functional, PytorchTestCase):
    """TorchScript consistency suite on CUDA, float64."""

    dtype = torch.float64
    device = torch.device('cuda')
"""Test suites for jit-ability and its numerical compatibility"""
import unittest
import torch
import torchaudio.functional as F
from parameterized import parameterized
from torchaudio_unittest import common_utils
from torchaudio_unittest.common_utils import (
TempDirMixin,
TestBaseMixin,
skipIfRocm,
torch_script,
)
class Functional(TempDirMixin, TestBaseMixin):
    """Implements test for `functional` module that are performed for different devices

    Every test wraps one ``torchaudio.functional`` call in a local ``func`` and
    checks that ``func`` and ``torch_script(func)`` produce the same output.
    Subclasses provide ``device`` and ``dtype``.
    """

    def _assert_consistency(self, func, tensor, shape_only=False):
        """Assert ``func`` and ``torch_script(func)`` agree on ``tensor``.

        ``tensor`` is first converted to ``self.device`` / ``self.dtype``.  The
        RNG is re-seeded with the same value before each call.  With
        ``shape_only=True`` only the shapes of the two outputs are compared.
        """
        tensor = tensor.to(device=self.device, dtype=self.dtype)
        ts_func = torch_script(func)

        torch.random.manual_seed(40)
        output = func(tensor)

        torch.random.manual_seed(40)
        ts_output = ts_func(tensor)

        if shape_only:
            ts_output = ts_output.shape
            output = output.shape
        self.assertEqual(ts_output, output)

    def _assert_consistency_complex(self, func, tensor, test_pseudo_complex=False):
        """Complex-input variant of ``_assert_consistency``.

        ``tensor`` must be complex; it is converted to ``self.device`` /
        ``self.complex_dtype``.  With ``test_pseudo_complex=True`` the real view
        (``torch.view_as_real``) is passed to both functions instead.
        """
        assert tensor.is_complex()
        tensor = tensor.to(device=self.device, dtype=self.complex_dtype)
        ts_func = torch_script(func)

        if test_pseudo_complex:
            tensor = torch.view_as_real(tensor)

        torch.random.manual_seed(40)
        output = func(tensor)

        torch.random.manual_seed(40)
        ts_output = ts_func(tensor)

        self.assertEqual(ts_output, output)

    def test_spectrogram_complex(self):
        def func(tensor):
            n_fft = 400
            ws = 400
            hop = 200
            pad = 0
            window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
            power = None
            normalize = False
            return F.spectrogram(tensor, pad, window, n_fft, hop, ws, power, normalize)

        tensor = common_utils.get_whitenoise()
        self._assert_consistency(func, tensor)

    def test_spectrogram_real(self):
        def func(tensor):
            n_fft = 400
            ws = 400
            hop = 200
            pad = 0
            window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
            power = 2.
            normalize = False
            return F.spectrogram(tensor, pad, window, n_fft, hop, ws, power, normalize, return_complex=False)

        tensor = common_utils.get_whitenoise()
        self._assert_consistency(func, tensor)

    def test_inverse_spectrogram_complex(self):
        def func(tensor):
            length = 400
            n_fft = 400
            hop = 200
            ws = 400
            pad = 0
            window = torch.hann_window(ws, device=tensor.device, dtype=torch.float64)
            normalize = False
            return F.inverse_spectrogram(tensor, length, pad, window, n_fft, hop, ws, normalize)

        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=0.05)
        tensor = common_utils.get_spectrogram(waveform, n_fft=400, hop_length=200)
        self._assert_consistency_complex(func, tensor)

    def test_inverse_spectrogram_real(self):
        def func(tensor):
            length = 400
            n_fft = 400
            hop = 200
            ws = 400
            pad = 0
            window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
            normalize = False
            return F.inverse_spectrogram(tensor, length, pad, window, n_fft, hop, ws, normalize)

        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=0.05)
        tensor = common_utils.get_spectrogram(waveform, n_fft=400, hop_length=200)
        tensor = torch.view_as_real(tensor)
        self._assert_consistency(func, tensor)

    @skipIfRocm
    def test_griffinlim(self):
        def func(tensor):
            n_fft = 400
            ws = 400
            hop = 200
            window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
            power = 2.
            momentum = 0.99
            n_iter = 32
            length = 1000
            rand_init = False
            return F.griffinlim(tensor, window, n_fft, hop, ws, power, n_iter, momentum, length, rand_init)

        tensor = torch.rand((1, 201, 6))
        self._assert_consistency(func, tensor)

    def test_compute_deltas(self):
        def func(tensor):
            win_length = 2 * 7 + 1
            return F.compute_deltas(tensor, win_length=win_length)

        channel = 13
        n_mfcc = channel * 3
        time = 1021
        tensor = torch.randn(channel, n_mfcc, time)
        self._assert_consistency(func, tensor)

    def test_detect_pitch_frequency(self):
        waveform = common_utils.get_sinusoid(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            return F.detect_pitch_frequency(tensor, sample_rate)

        self._assert_consistency(func, waveform)

    def test_melscale_fbanks(self):
        if self.device != torch.device('cpu'):
            raise unittest.SkipTest('No need to perform test on device other than CPU')

        # ``func`` ignores its argument; a dummy tensor is passed only because
        # ``_assert_consistency`` always supplies one.
        def func(_):
            n_stft = 100
            f_min = 0.0
            f_max = 20.0
            n_mels = 10
            sample_rate = 16000
            norm = "slaney"
            return F.melscale_fbanks(n_stft, f_min, f_max, n_mels, sample_rate, norm)

        dummy = torch.zeros(1, 1)
        self._assert_consistency(func, dummy)

    def test_linear_fbanks(self):
        if self.device != torch.device('cpu'):
            raise unittest.SkipTest('No need to perform test on device other than CPU')

        def func(_):
            n_stft = 100
            f_min = 0.0
            f_max = 20.0
            n_filter = 10
            sample_rate = 16000
            return F.linear_fbanks(n_stft, f_min, f_max, n_filter, sample_rate)

        dummy = torch.zeros(1, 1)
        self._assert_consistency(func, dummy)

    def test_amplitude_to_DB(self):
        def func(tensor):
            multiplier = 10.0
            amin = 1e-10
            db_multiplier = 0.0
            top_db = 80.0
            return F.amplitude_to_DB(tensor, multiplier, amin, db_multiplier, top_db)

        tensor = torch.rand((6, 201))
        self._assert_consistency(func, tensor)

    def test_DB_to_amplitude(self):
        def func(tensor):
            ref = 1.
            power = 1.
            return F.DB_to_amplitude(tensor, ref, power)

        tensor = torch.rand((1, 100))
        self._assert_consistency(func, tensor)

    def test_create_dct(self):
        if self.device != torch.device('cpu'):
            raise unittest.SkipTest('No need to perform test on device other than CPU')

        def func(_):
            n_mfcc = 40
            n_mels = 128
            norm = "ortho"
            return F.create_dct(n_mfcc, n_mels, norm)

        dummy = torch.zeros(1, 1)
        self._assert_consistency(func, dummy)

    def test_mu_law_encoding(self):
        def func(tensor):
            qc = 256
            return F.mu_law_encoding(tensor, qc)

        waveform = common_utils.get_whitenoise()
        self._assert_consistency(func, waveform)

    def test_mu_law_decoding(self):
        def func(tensor):
            qc = 256
            return F.mu_law_decoding(tensor, qc)

        tensor = torch.rand((1, 10))
        self._assert_consistency(func, tensor)

    def test_complex_norm(self):
        def func(tensor):
            power = 2.
            return F.complex_norm(tensor, power)

        tensor = torch.randn(1, 2, 1025, 400, 2)
        self._assert_consistency(func, tensor)

    def test_mask_along_axis(self):
        def func(tensor):
            mask_param = 100
            mask_value = 30.
            axis = 2
            return F.mask_along_axis(tensor, mask_param, mask_value, axis)

        tensor = torch.randn(2, 1025, 400)
        self._assert_consistency(func, tensor)

    def test_mask_along_axis_iid(self):
        def func(tensor):
            mask_param = 100
            mask_value = 30.
            axis = 2
            return F.mask_along_axis_iid(tensor, mask_param, mask_value, axis)

        tensor = torch.randn(4, 2, 1025, 400)
        self._assert_consistency(func, tensor)

    def test_gain(self):
        def func(tensor):
            gainDB = 2.0
            return F.gain(tensor, gainDB)

        tensor = torch.rand((1, 1000))
        self._assert_consistency(func, tensor)

    # The three density-function dither tests below compare output shapes only
    # (``shape_only=True``).
    def test_dither_TPDF(self):
        def func(tensor):
            return F.dither(tensor, 'TPDF')

        tensor = common_utils.get_whitenoise(n_channels=2)
        self._assert_consistency(func, tensor, shape_only=True)

    def test_dither_RPDF(self):
        def func(tensor):
            return F.dither(tensor, 'RPDF')

        tensor = common_utils.get_whitenoise(n_channels=2)
        self._assert_consistency(func, tensor, shape_only=True)

    def test_dither_GPDF(self):
        def func(tensor):
            return F.dither(tensor, 'GPDF')

        tensor = common_utils.get_whitenoise(n_channels=2)
        self._assert_consistency(func, tensor, shape_only=True)

    def test_dither_noise_shaping(self):
        def func(tensor):
            return F.dither(tensor, noise_shaping=True)

        tensor = common_utils.get_whitenoise(n_channels=2)
        self._assert_consistency(func, tensor)

    def test_lfilter(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise()

        def func(tensor):
            # Design an IIR lowpass filter using scipy.signal filter design
            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.iirdesign.html#scipy.signal.iirdesign
            #
            # Example
            #     >>> from scipy.signal import iirdesign
            #     >>> b, a = iirdesign(0.2, 0.3, 1, 60)
            b_coeffs = torch.tensor(
                [
                    0.00299893,
                    -0.0051152,
                    0.00841964,
                    -0.00747802,
                    0.00841964,
                    -0.0051152,
                    0.00299893,
                ],
                device=tensor.device,
                dtype=tensor.dtype,
            )
            a_coeffs = torch.tensor(
                [
                    1.0,
                    -4.8155751,
                    10.2217618,
                    -12.14481273,
                    8.49018171,
                    -3.3066882,
                    0.56088705,
                ],
                device=tensor.device,
                dtype=tensor.dtype,
            )
            return F.lfilter(tensor, a_coeffs, b_coeffs)

        self._assert_consistency(func, waveform)

    def test_filtfilt(self):
        def func(tensor):
            torch.manual_seed(296)
            b_coeffs = torch.rand(4, device=tensor.device, dtype=tensor.dtype)
            a_coeffs = torch.rand(4, device=tensor.device, dtype=tensor.dtype)
            return F.filtfilt(tensor, a_coeffs, b_coeffs)

        waveform = common_utils.get_whitenoise(sample_rate=8000)
        self._assert_consistency(func, waveform)

    def test_lowpass(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            cutoff_freq = 3000.
            return F.lowpass_biquad(tensor, sample_rate, cutoff_freq)

        self._assert_consistency(func, waveform)

    def test_highpass(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            cutoff_freq = 2000.
            return F.highpass_biquad(tensor, sample_rate, cutoff_freq)

        self._assert_consistency(func, waveform)

    def test_allpass(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            central_freq = 1000.
            q = 0.707
            return F.allpass_biquad(tensor, sample_rate, central_freq, q)

        self._assert_consistency(func, waveform)

    def test_bandpass_with_csg(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            central_freq = 1000.
            q = 0.707
            const_skirt_gain = True
            return F.bandpass_biquad(tensor, sample_rate, central_freq, q, const_skirt_gain)

        self._assert_consistency(func, waveform)

    def test_bandpass_without_csg(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            central_freq = 1000.
            q = 0.707
            # Must be False: with True this test merely repeated
            # test_bandpass_with_csg and the non-CSG path was never scripted.
            const_skirt_gain = False
            return F.bandpass_biquad(tensor, sample_rate, central_freq, q, const_skirt_gain)

        self._assert_consistency(func, waveform)

    def test_bandreject(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            central_freq = 1000.
            q = 0.707
            return F.bandreject_biquad(tensor, sample_rate, central_freq, q)

        self._assert_consistency(func, waveform)

    def test_band_with_noise(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            central_freq = 1000.
            q = 0.707
            noise = True
            return F.band_biquad(tensor, sample_rate, central_freq, q, noise)

        self._assert_consistency(func, waveform)

    def test_band_without_noise(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            central_freq = 1000.
            q = 0.707
            noise = False
            return F.band_biquad(tensor, sample_rate, central_freq, q, noise)

        self._assert_consistency(func, waveform)

    def test_treble(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            gain = 40.
            central_freq = 1000.
            q = 0.707
            return F.treble_biquad(tensor, sample_rate, gain, central_freq, q)

        self._assert_consistency(func, waveform)

    def test_bass(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            gain = 40.
            central_freq = 1000.
            q = 0.707
            return F.bass_biquad(tensor, sample_rate, gain, central_freq, q)

        self._assert_consistency(func, waveform)

    def test_deemph(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            return F.deemph_biquad(tensor, sample_rate)

        self._assert_consistency(func, waveform)

    def test_riaa(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            return F.riaa_biquad(tensor, sample_rate)

        self._assert_consistency(func, waveform)

    def test_equalizer(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            center_freq = 300.
            gain = 1.
            q = 0.707
            return F.equalizer_biquad(tensor, sample_rate, center_freq, gain, q)

        self._assert_consistency(func, waveform)

    def test_perf_biquad_filtering(self):
        if self.dtype == torch.float64:
            raise unittest.SkipTest("This test is known to fail for float64")

        waveform = common_utils.get_whitenoise()

        def func(tensor):
            a = torch.tensor([0.7, 0.2, 0.6], device=tensor.device, dtype=tensor.dtype)
            b = torch.tensor([0.4, 0.2, 0.9], device=tensor.device, dtype=tensor.dtype)
            return F.lfilter(tensor, a, b)

        self._assert_consistency(func, waveform)

    def test_sliding_window_cmn(self):
        # ``func`` only takes device and dtype from its argument; the data fed
        # to ``F.sliding_window_cmn`` is the literal tensor ``a`` built inside.
        def func(tensor):
            cmn_window = 600
            min_cmn_window = 100
            center = False
            norm_vars = False
            a = torch.tensor(
                [
                    [
                        -1.915875792503357,
                        1.147700309753418
                    ],
                    [
                        1.8242558240890503,
                        1.3869990110397339
                    ]
                ],
                device=tensor.device,
                dtype=tensor.dtype
            )
            return F.sliding_window_cmn(a, cmn_window, min_cmn_window, center, norm_vars)

        b = torch.tensor(
            [
                [
                    -1.8701,
                    -0.1196
                ],
                [
                    1.8701,
                    0.1196
                ]
            ]
        )
        self._assert_consistency(func, b)

    def test_contrast(self):
        waveform = common_utils.get_whitenoise()

        def func(tensor):
            enhancement_amount = 80.
            return F.contrast(tensor, enhancement_amount)

        self._assert_consistency(func, waveform)

    def test_dcshift(self):
        waveform = common_utils.get_whitenoise()

        def func(tensor):
            shift = 0.5
            limiter_gain = 0.05
            return F.dcshift(tensor, shift, limiter_gain)

        self._assert_consistency(func, waveform)

    def test_overdrive(self):
        waveform = common_utils.get_whitenoise()

        def func(tensor):
            gain = 30.
            colour = 50.
            return F.overdrive(tensor, gain, colour)

        self._assert_consistency(func, waveform)

    def test_phaser(self):
        waveform = common_utils.get_whitenoise(sample_rate=44100)

        def func(tensor):
            gain_in = 0.5
            gain_out = 0.8
            delay_ms = 2.0
            decay = 0.4
            speed = 0.5
            sample_rate = 44100
            return F.phaser(tensor, sample_rate, gain_in, gain_out, delay_ms, decay, speed, sinusoidal=True)

        self._assert_consistency(func, waveform)

    def test_flanger(self):
        torch.random.manual_seed(40)
        waveform = torch.rand(2, 100) - 0.5

        def func(tensor):
            delay = 0.8
            depth = 0.88
            regen = 3.0
            width = 0.23
            speed = 1.3
            phase = 60.
            sample_rate = 44100
            return F.flanger(tensor, sample_rate, delay, depth, regen, width, speed,
                             phase, modulation='sinusoidal', interpolation='linear')

        self._assert_consistency(func, waveform)

    def test_spectral_centroid(self):
        def func(tensor):
            sample_rate = 44100
            n_fft = 400
            ws = 400
            hop = 200
            pad = 0
            window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
            return F.spectral_centroid(tensor, sample_rate, pad, window, n_fft, hop, ws)

        tensor = common_utils.get_whitenoise(sample_rate=44100)
        self._assert_consistency(func, tensor)

    @common_utils.skipIfNoKaldi
    def test_compute_kaldi_pitch(self):
        if self.dtype != torch.float32 or self.device != torch.device('cpu'):
            raise unittest.SkipTest("Only float32, cpu is supported.")

        def func(tensor):
            sample_rate: float = 44100.
            return F.compute_kaldi_pitch(tensor, sample_rate)

        tensor = common_utils.get_whitenoise(sample_rate=44100)
        self._assert_consistency(func, tensor)

    def test_resample_sinc(self):
        def func(tensor):
            sr1, sr2 = 16000, 8000
            return F.resample(tensor, sr1, sr2, resampling_method="sinc_interpolation")

        tensor = common_utils.get_whitenoise(sample_rate=16000)
        self._assert_consistency(func, tensor)

    def test_resample_kaiser(self):
        def func(tensor):
            sr1, sr2 = 16000, 8000
            return F.resample(tensor, sr1, sr2, resampling_method="kaiser_window")

        def func_beta(tensor):
            sr1, sr2 = 16000, 8000
            beta = 6.
            return F.resample(tensor, sr1, sr2, resampling_method="kaiser_window", beta=beta)

        tensor = common_utils.get_whitenoise(sample_rate=16000)
        self._assert_consistency(func, tensor)
        self._assert_consistency(func_beta, tensor)

    @parameterized.expand([(True, ), (False, )])
    def test_phase_vocoder(self, test_paseudo_complex):
        def func(tensor):
            is_complex = tensor.is_complex()
            # The frequency axis is second-to-last for complex input and
            # third-to-last for the real view, which has a trailing axis of 2.
            n_freq = tensor.size(-2 if is_complex else -3)
            rate = 0.5
            hop_length = 256
            phase_advance = torch.linspace(
                0,
                3.14 * hop_length,
                n_freq,
                dtype=(torch.real(tensor) if is_complex else tensor).dtype,
                device=tensor.device,
            )[..., None]
            return F.phase_vocoder(tensor, rate, phase_advance)

        tensor = torch.view_as_complex(torch.randn(2, 1025, 400, 2))
        self._assert_consistency_complex(func, tensor, test_paseudo_complex)
class FunctionalFloat32Only(TestBaseMixin):
    """Consistency tests whose input is always built as float32."""

    def test_rnnt_loss(self):
        """Check ``F.rnnt_loss`` through ``self._assert_consistency``."""
        def func(tensor):
            targets = torch.tensor([[1, 2]], device=tensor.device, dtype=torch.int32)
            logit_lengths = torch.tensor([2], device=tensor.device, dtype=torch.int32)
            target_lengths = torch.tensor([2], device=tensor.device, dtype=torch.int32)
            return F.rnnt_loss(tensor, targets, logit_lengths, target_lengths)

        first_frame = [
            [0.1, 0.6, 0.1, 0.1, 0.1],
            [0.1, 0.1, 0.6, 0.1, 0.1],
            [0.1, 0.1, 0.2, 0.8, 0.1],
        ]
        second_frame = [
            [0.1, 0.6, 0.1, 0.1, 0.1],
            [0.1, 0.1, 0.2, 0.1, 0.1],
            [0.7, 0.1, 0.2, 0.1, 0.1],
        ]
        logits = torch.tensor([[first_frame, second_frame]])
        self._assert_consistency(func, logits.to(device=self.device, dtype=torch.float32))
import torch
import torchaudio.kaldi_io as kio
from torchaudio_unittest import common_utils
class Test_KaldiIO(common_utils.TorchaudioTestCase):
    """Read Kaldi ark assets through `torchaudio.kaldi_io` and compare them
    with the expected values below.
    """
    data1 = [[1, 2, 3], [11, 12, 13], [21, 22, 23]]
    data2 = [[31, 32, 33], [41, 42, 43], [51, 52, 53]]

    def _test_helper(self, file_name, expected_data, fn, expected_dtype):
        """Compare the entries that ``fn`` reads from an asset with ``expected_data``.

        Args:
            file_name: Asset name, resolved with `common_utils.get_asset_path`.
            expected_data: Sequence of values; the i-th item (1-based) is the
                expected content of the entry keyed ``'key<i>'``.
            fn: Reader called with the asset path; must yield ``(key, tensor)`` pairs.
            expected_dtype: dtype every yielded tensor must have.
        """
        test_filepath = common_utils.get_asset_path(file_name)
        expected_output = {
            f'key{idx}': torch.tensor(val, dtype=expected_dtype)
            for idx, val in enumerate(expected_data, start=1)
        }

        for key, vec in fn(test_filepath):
            self.assertIn(key, expected_output)
            self.assertIsInstance(vec, torch.Tensor)
            self.assertEqual(vec.dtype, expected_dtype)
            # torch.equal also requires identical shapes, whereas an
            # element-wise torch.eq would broadcast and could hide a mismatch.
            self.assertTrue(torch.equal(vec, expected_output[key]))

    def test_read_vec_int_ark(self):
        """Integer vectors are read as int32 tensors."""
        self._test_helper("vec_int.ark", self.data1, kio.read_vec_int_ark, torch.int32)

    def test_read_vec_flt_ark(self):
        """Float vectors are read as float32 tensors."""
        self._test_helper("vec_flt.ark", self.data1, kio.read_vec_flt_ark, torch.float32)

    def test_read_mat_ark(self):
        """Matrices are read as float32 tensors."""
        self._test_helper("mat.ark", [self.data1, self.data2], kio.read_mat_ark, torch.float32)
import itertools
from collections import namedtuple
import torch
from parameterized import parameterized
from torchaudio.models import ConvTasNet, DeepSpeech, Wav2Letter, WaveRNN
from torchaudio.models.wavernn import MelResNet, UpsampleNetwork
from torchaudio_unittest import common_utils
from torchaudio_unittest.common_utils import torch_script
class TestWav2Letter(common_utils.TorchaudioTestCase):
    """Output-shape checks for `Wav2Letter`."""

    def test_waveform(self):
        """A (2, 1, 320) input produces a (2, 40, 2) output."""
        batch_size, num_features, num_classes, input_length = 2, 1, 40, 320

        net = Wav2Letter(num_classes=num_classes, num_features=num_features)
        inputs = torch.rand(batch_size, num_features, input_length)
        out = net(inputs)

        expected_size = (batch_size, num_classes, 2)
        assert out.size() == expected_size

    def test_mfcc(self):
        """With ``input_type="mfcc"``, a (2, 13, 2) input produces a (2, 40, 2) output."""
        batch_size, num_features, num_classes, input_length = 2, 13, 40, 2

        net = Wav2Letter(num_classes=num_classes, input_type="mfcc", num_features=num_features)
        inputs = torch.rand(batch_size, num_features, input_length)
        out = net(inputs)

        expected_size = (batch_size, num_classes, 2)
        assert out.size() == expected_size
class TestMelResNet(common_utils.TorchaudioTestCase):
    """Output-shape check for `MelResNet`."""

    def test_waveform(self):
        """The output is (n_batch, n_output, n_time - kernel_size + 1)."""
        n_batch, n_time, n_freq = 2, 200, 100
        n_output, n_res_block, n_hidden, kernel_size = 128, 10, 128, 5

        net = MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size)
        out = net(torch.rand(n_batch, n_freq, n_time))

        expected_time = n_time - kernel_size + 1
        assert out.size() == (n_batch, n_output, expected_time)
class TestUpsampleNetwork(common_utils.TorchaudioTestCase):
    """Output-shape check for `UpsampleNetwork`."""

    def test_waveform(self):
        """Both outputs have a time axis equal to the product of the upsample
        scales times ``n_time - kernel_size + 1``.
        """
        upsample_scales = [5, 5, 8]
        n_batch, n_time, n_freq = 2, 200, 100
        n_output, n_res_block, n_hidden, kernel_size = 256, 10, 128, 5

        # Fold every scale into the expected output length.
        expected_time = n_time - kernel_size + 1
        for scale in upsample_scales:
            expected_time *= scale

        net = UpsampleNetwork(
            upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size)
        out1, out2 = net(torch.rand(n_batch, n_freq, n_time))

        assert out1.size() == (n_batch, n_freq, expected_time)
        assert out2.size() == (n_batch, n_output, expected_time)
class TestWaveRNN(common_utils.TorchaudioTestCase):
    """Shape and TorchScript checks for `WaveRNN`."""

    def test_waveform(self):
        """The forward output is (n_batch, 1, hop_length * (n_time - kernel_size + 1), n_classes)."""
        upsample_scales = [5, 5, 8]
        n_rnn, n_fc, n_classes = 512, 512, 512
        hop_length = 200
        n_batch, n_time, n_freq = 2, 200, 100
        n_output, n_res_block, n_hidden, kernel_size = 256, 10, 128, 5

        net = WaveRNN(upsample_scales, n_classes, hop_length, n_res_block,
                      n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output)

        n_samples = hop_length * (n_time - kernel_size + 1)
        waveform = torch.rand(n_batch, 1, n_samples)
        specgram = torch.rand(n_batch, 1, n_freq, n_time)
        out = net(waveform, specgram)

        assert out.size() == (n_batch, 1, n_samples, n_classes)

    def test_infer_waveform(self):
        """`infer` returns a (n_batch, 1, hop_length * n_time) waveform and
        per-item lengths scaled by ``hop_length``.
        """
        upsample_scales = [5, 5, 8]
        n_rnn, n_fc, n_classes = 128, 128, 128
        hop_length = 200
        n_batch, n_time, n_freq = 2, 50, 25
        n_output, n_res_block, n_hidden, kernel_size = 64, 2, 32, 5

        net = WaveRNN(upsample_scales, n_classes, hop_length, n_res_block,
                      n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output)

        specgram = torch.rand(n_batch, n_freq, n_time)
        # The second item uses only half of the frames.
        lengths = torch.tensor([n_time, n_time // 2])
        out, waveform_lengths = net.infer(specgram, lengths)

        assert out.size() == (n_batch, 1, hop_length * n_time)
        assert waveform_lengths[0] == hop_length * n_time
        assert waveform_lengths[1] == hop_length * n_time // 2

    def test_torchscript_infer(self):
        """Scripted and eager `infer` agree when seeded identically."""
        upsample_scales = [5, 5, 8]
        n_rnn, n_fc, n_classes = 128, 128, 128
        hop_length = 200
        n_batch, n_time, n_freq = 2, 50, 25
        n_output, n_res_block, n_hidden, kernel_size = 64, 2, 32, 5

        net = WaveRNN(upsample_scales, n_classes, hop_length, n_res_block,
                      n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output)
        net.eval()
        specgram = torch.rand(n_batch, n_freq, n_time)

        # Reset the RNG before each run so both start from the same state.
        torch.random.manual_seed(0)
        out_eager = net.infer(specgram)
        torch.random.manual_seed(0)
        out_script = torch_script(net).infer(specgram)

        self.assertEqual(out_eager, out_script)
# Record of the ConvTasNet constructor arguments varied by the tests; the
# field names match the keyword names passed to ``ConvTasNet``.
_ConvTasNetParams = namedtuple(
    '_ConvTasNetParams',
    'enc_num_feats enc_kernel_size '
    'msk_num_feats msk_num_hidden_feats msk_kernel_size msk_num_layers msk_num_stacks',
)
class TestConvTasNet(common_utils.TorchaudioTestCase):
    """Output-shape check for `ConvTasNet` over a grid of configurations."""

    @parameterized.expand(list(itertools.product(
        [2, 3],
        [
            _ConvTasNetParams(128, 40, 128, 256, 3, 7, 2),
            _ConvTasNetParams(256, 40, 128, 256, 3, 7, 2),
            _ConvTasNetParams(512, 40, 128, 256, 3, 7, 2),
            _ConvTasNetParams(512, 40, 128, 256, 3, 7, 2),
            _ConvTasNetParams(512, 40, 128, 512, 3, 7, 2),
            _ConvTasNetParams(512, 40, 128, 512, 3, 7, 2),
            _ConvTasNetParams(512, 40, 256, 256, 3, 7, 2),
            _ConvTasNetParams(512, 40, 256, 512, 3, 7, 2),
            _ConvTasNetParams(512, 40, 256, 512, 3, 7, 2),
            _ConvTasNetParams(512, 40, 128, 512, 3, 6, 4),
            _ConvTasNetParams(512, 40, 128, 512, 3, 4, 6),
            _ConvTasNetParams(512, 40, 128, 512, 3, 8, 3),
            _ConvTasNetParams(512, 32, 128, 512, 3, 8, 3),
            _ConvTasNetParams(512, 16, 128, 512, 3, 8, 3),
        ],
    )))
    def test_paper_configuration(self, num_sources, model_params):
        """ConvTasNet model works on the valid configurations in the paper"""
        batch_size, num_frames = 32, 8000

        # Every field of the params tuple is a ConvTasNet keyword argument.
        net = ConvTasNet(num_sources=num_sources, **model_params._asdict())
        mixture = torch.rand(batch_size, 1, num_frames)
        separated = net(mixture)

        assert separated.shape == (batch_size, num_sources, num_frames)
class TestDeepSpeech(common_utils.TorchaudioTestCase):
    """Output-shape check for `DeepSpeech`."""

    def test_deepspeech(self):
        """A (2, 1, 320, 1) input produces a (2, 320, 40) output."""
        n_batch, n_channel, n_time = 2, 1, 320
        n_feature, n_class = 1, 40

        net = DeepSpeech(n_feature=n_feature, n_class=n_class)
        out = net(torch.rand(n_batch, n_channel, n_time, n_feature))

        assert out.size() == (n_batch, n_time, n_class)
import torch
from torchaudio_unittest.common_utils import PytorchTestCase
from .model_test_impl import (
Tacotron2EncoderTests,
Tacotron2DecoderTests,
Tacotron2Tests,
)
class TestTacotron2EncoderFloat32CPU(Tacotron2EncoderTests, PytorchTestCase):
    """Run `Tacotron2EncoderTests` with float32 tensors on the CPU."""
    device = torch.device("cpu")
    dtype = torch.float32
class TestTacotron2DecoderFloat32CPU(Tacotron2DecoderTests, PytorchTestCase):
    """Run `Tacotron2DecoderTests` with float32 tensors on the CPU."""
    device = torch.device("cpu")
    dtype = torch.float32
class TestTacotron2Float32CPU(Tacotron2Tests, PytorchTestCase):
    """Run `Tacotron2Tests` with float32 tensors on the CPU."""
    device = torch.device("cpu")
    dtype = torch.float32
import torch
from torchaudio_unittest.common_utils import skipIfNoCuda, PytorchTestCase
from .model_test_impl import (
Tacotron2EncoderTests,
Tacotron2DecoderTests,
Tacotron2Tests,
)
@skipIfNoCuda
class TestTacotron2EncoderFloat32CUDA(Tacotron2EncoderTests, PytorchTestCase):
    """Run `Tacotron2EncoderTests` with float32 tensors on a CUDA device."""
    device = torch.device("cuda")
    dtype = torch.float32
@skipIfNoCuda
class TestTacotron2DecoderFloat32CUDA(Tacotron2DecoderTests, PytorchTestCase):
    """Run `Tacotron2DecoderTests` with float32 tensors on a CUDA device."""
    device = torch.device("cuda")
    dtype = torch.float32
@skipIfNoCuda
class TestTacotron2Float32CUDA(Tacotron2Tests, PytorchTestCase):
    """Run `Tacotron2Tests` with float32 tensors on a CUDA device."""
    device = torch.device("cuda")
    dtype = torch.float32
from typing import Tuple
import torch
from torch import Tensor
from torchaudio.models import Tacotron2
from torchaudio.models.tacotron2 import _Encoder, _Decoder
from torchaudio_unittest.common_utils import TestBaseMixin, torch_script
class Tacotron2InferenceWrapper(torch.nn.Module):
    """Module whose ``forward`` delegates to the wrapped model's ``infer``."""

    def __init__(self, model):
        super().__init__()
        # Kept as an attribute so ``forward`` can reach ``model.infer``.
        self.model = model

    def forward(self, text: Tensor, text_lengths: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
        """Return ``self.model.infer(text, text_lengths)`` unchanged."""
        outputs = self.model.infer(text, text_lengths)
        return outputs
class Tacotron2DecoderInferenceWrapper(torch.nn.Module):
    """Module whose ``forward`` delegates to the wrapped decoder's ``infer``."""

    def __init__(self, model):
        super().__init__()
        # Kept as an attribute so ``forward`` can reach ``model.infer``.
        self.model = model

    def forward(self, memory: Tensor, memory_lengths: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Return ``self.model.infer(memory, memory_lengths)`` unchanged."""
        outputs = self.model.infer(memory, memory_lengths)
        return outputs
class TorchscriptConsistencyMixin(TestBaseMixin):
    r"""Mixin offering a helper that compares a model with its `torch_script` version."""

    def _assert_torchscript_consistency(self, model, tensors):
        r"""Call ``model`` and ``torch_script(model)`` on ``tensors`` and
        assert that the outputs are equal.

        The RNG is seeded with the same value before each call.
        """
        scripted = torch_script(model)

        torch.random.manual_seed(40)
        eager_output = model(*tensors)

        torch.random.manual_seed(40)
        scripted_output = scripted(*tensors)

        self.assertEqual(scripted_output, eager_output)
class Tacotron2EncoderTests(TorchscriptConsistencyMixin):
    r"""Checks for the Tacotron2 `_Encoder` module."""

    def _make_encoder_and_inputs(self, n_batch, n_seq, encoder_embedding_dim):
        r"""Build an eval-mode `_Encoder` on ``self.device`` plus a random
        input and full-length ``input_lengths``.
        """
        encoder = _Encoder(
            encoder_embedding_dim=encoder_embedding_dim,
            encoder_n_convolution=3,
            encoder_kernel_size=5,
        )
        encoder = encoder.to(self.device).eval()
        x = torch.rand(
            n_batch, encoder_embedding_dim, n_seq, dtype=self.dtype, device=self.device)
        # Every item spans the whole sequence.
        input_lengths = torch.full(
            (n_batch,), n_seq, dtype=torch.int32, device=self.device)
        return encoder, x, input_lengths

    def test_tacotron2_torchscript_consistency(self):
        r"""Validate the torchscript consistency of an Encoder."""
        n_batch, n_seq, encoder_embedding_dim = 16, 64, 512
        encoder, x, input_lengths = self._make_encoder_and_inputs(
            n_batch, n_seq, encoder_embedding_dim)

        self._assert_torchscript_consistency(encoder, (x, input_lengths))

    def test_encoder_output_shape(self):
        r"""Feed tensors with a specific shape to the Tacotron2 Encoder and
        validate that the output has the expected shape.
        """
        n_batch, n_seq, encoder_embedding_dim = 16, 64, 512
        encoder, x, input_lengths = self._make_encoder_and_inputs(
            n_batch, n_seq, encoder_embedding_dim)

        out = encoder(x, input_lengths)

        assert out.size() == (n_batch, n_seq, encoder_embedding_dim)
def _get_decoder_model(n_mels=80, encoder_embedding_dim=512,
                       decoder_max_step=2000, gate_threshold=0.5):
    """Build a `_Decoder`; only the four parameters vary, the remaining
    constructor arguments are fixed here.
    """
    fixed_kwargs = dict(
        n_frames_per_step=1,
        decoder_rnn_dim=1024,
        decoder_dropout=0.1,
        decoder_early_stopping=True,
        attention_rnn_dim=1024,
        attention_hidden_dim=128,
        attention_location_n_filter=32,
        attention_location_kernel_size=31,
        attention_dropout=0.1,
        prenet_dim=256,
    )
    return _Decoder(
        n_mels=n_mels,
        encoder_embedding_dim=encoder_embedding_dim,
        decoder_max_step=decoder_max_step,
        gate_threshold=gate_threshold,
        **fixed_kwargs,
    )
class Tacotron2DecoderTests(TorchscriptConsistencyMixin):
    r"""Checks for the Tacotron2 `_Decoder` module (forward and `infer`)."""

    def _rand(self, *shape):
        r"""Random tensor of ``shape`` with the test's dtype and device."""
        return torch.rand(*shape, dtype=self.dtype, device=self.device)

    def _unit_lengths(self, n_batch):
        r"""int32 tensor of ``n_batch`` ones on the test's device."""
        return torch.ones(n_batch, dtype=torch.int32, device=self.device)

    def test_decoder_torchscript_consistency(self):
        r"""Validate the torchscript consistency of a Decoder."""
        n_batch, n_mels, n_seq = 16, 80, 200
        encoder_embedding_dim, n_time_steps = 256, 150

        decoder = _get_decoder_model(
            n_mels=n_mels, encoder_embedding_dim=encoder_embedding_dim)
        decoder = decoder.to(self.device).eval()

        memory = self._rand(n_batch, n_seq, encoder_embedding_dim)
        decoder_inputs = self._rand(n_batch, n_mels, n_time_steps)
        memory_lengths = self._unit_lengths(n_batch)

        self._assert_torchscript_consistency(
            decoder, (memory, decoder_inputs, memory_lengths))

    def test_decoder_output_shape(self):
        r"""Feed tensors with a specific shape to the Tacotron2 Decoder and
        validate that the outputs have the expected shapes.
        """
        n_batch, n_mels, n_seq = 16, 80, 200
        encoder_embedding_dim, n_time_steps = 256, 150

        decoder = _get_decoder_model(
            n_mels=n_mels, encoder_embedding_dim=encoder_embedding_dim)
        decoder = decoder.to(self.device).eval()

        memory = self._rand(n_batch, n_seq, encoder_embedding_dim)
        decoder_inputs = self._rand(n_batch, n_mels, n_time_steps)
        memory_lengths = self._unit_lengths(n_batch)

        mel_specgram, gate_outputs, alignments = decoder(
            memory, decoder_inputs, memory_lengths)

        assert mel_specgram.size() == (n_batch, n_mels, n_time_steps)
        assert gate_outputs.size() == (n_batch, n_time_steps)
        assert alignments.size() == (n_batch, n_time_steps, n_seq)

    def test_decoder_inference_torchscript_consistency(self):
        r"""Validate the torchscript consistency of the Decoder's `infer`."""
        n_batch, n_mels, n_seq = 16, 80, 200
        encoder_embedding_dim = 256
        decoder_max_step = 300  # make inference more efficient
        gate_threshold = 0.505  # make inference more efficient

        decoder = _get_decoder_model(
            n_mels=n_mels,
            encoder_embedding_dim=encoder_embedding_dim,
            decoder_max_step=decoder_max_step,
            gate_threshold=gate_threshold,
        )
        decoder = decoder.to(self.device).eval()

        memory = self._rand(n_batch, n_seq, encoder_embedding_dim)
        memory_lengths = self._unit_lengths(n_batch)

        wrapped = Tacotron2DecoderInferenceWrapper(decoder)
        self._assert_torchscript_consistency(wrapped, (memory, memory_lengths))

    def test_decoder_inference_output_shape(self):
        r"""Run the Decoder's `infer` and validate the shapes of its outputs."""
        n_batch, n_mels, n_seq = 16, 80, 200
        encoder_embedding_dim = 256
        decoder_max_step = 300  # make inference more efficient
        gate_threshold = 0.505  # if set to 0.5, the model will only run one step

        decoder = _get_decoder_model(
            n_mels=n_mels,
            encoder_embedding_dim=encoder_embedding_dim,
            decoder_max_step=decoder_max_step,
            gate_threshold=gate_threshold,
        )
        decoder = decoder.to(self.device).eval()

        memory = self._rand(n_batch, n_seq, encoder_embedding_dim)
        memory_lengths = self._unit_lengths(n_batch)

        mel_specgram, mel_specgram_lengths, gate_outputs, alignments = decoder.infer(
            memory, memory_lengths)
        # Longest generated sequence in the batch.
        max_length = mel_specgram_lengths.max().item()

        assert mel_specgram.dim() == 3
        assert mel_specgram.size()[:-1] == (n_batch, n_mels, )
        assert mel_specgram.size(2) == max_length

        assert mel_specgram_lengths.dim() == 1
        assert mel_specgram_lengths.size(0) == n_batch
        assert max_length <= decoder.decoder_max_step

        assert gate_outputs.dim() == 2
        assert gate_outputs.size(0) == n_batch
        assert gate_outputs.size(1) == max_length

        assert alignments.dim() == 2
        assert alignments.size(0) == n_seq
        assert alignments.size(1) == max_length * n_batch
def _get_tacotron2_model(n_mels, decoder_max_step=2000, gate_threshold=0.5):
    """Build a `Tacotron2`; only the three parameters vary, the remaining
    constructor arguments are fixed here.
    """
    fixed_kwargs = dict(
        mask_padding=False,
        n_symbol=148,
        n_frames_per_step=1,
        symbol_embedding_dim=512,
        encoder_embedding_dim=512,
        encoder_n_convolution=3,
        encoder_kernel_size=5,
        decoder_rnn_dim=1024,
        decoder_dropout=0.1,
        decoder_early_stopping=True,
        attention_rnn_dim=1024,
        attention_hidden_dim=128,
        attention_location_n_filter=32,
        attention_location_kernel_size=31,
        attention_dropout=0.1,
        prenet_dim=256,
        postnet_n_convolution=5,
        postnet_kernel_size=5,
        postnet_embedding_dim=512,
    )
    return Tacotron2(
        n_mels=n_mels,
        decoder_max_step=decoder_max_step,
        gate_threshold=gate_threshold,
        **fixed_kwargs,
    )
class Tacotron2Tests(TorchscriptConsistencyMixin):
    r"""Checks for the full `Tacotron2` model (forward, backward and `infer`)."""

    def _get_inputs(
        self, n_mels: int, n_batch: int, max_mel_specgram_length: int, max_text_length: int
    ):
        r"""Return random ``(text, text_lengths, mel_specgram, mel_specgram_lengths)``.

        Both length tensors are filled with the corresponding maximum length.
        """
        int_opts = dict(dtype=torch.int32, device=self.device)

        text = torch.randint(0, 148, (n_batch, max_text_length), **int_opts)
        text_lengths = torch.full((n_batch,), max_text_length, **int_opts)
        mel_specgram = torch.rand(
            n_batch, n_mels, max_mel_specgram_length, dtype=self.dtype, device=self.device)
        mel_specgram_lengths = torch.full((n_batch,), max_mel_specgram_length, **int_opts)
        return text, text_lengths, mel_specgram, mel_specgram_lengths

    def test_tacotron2_torchscript_consistency(self):
        r"""Validate the torchscript consistency of a Tacotron2."""
        n_batch, n_mels = 16, 80
        max_mel_specgram_length, max_text_length = 300, 100

        tacotron2 = _get_tacotron2_model(n_mels).to(self.device).eval()
        inputs = self._get_inputs(
            n_mels, n_batch, max_mel_specgram_length, max_text_length)

        self._assert_torchscript_consistency(tacotron2, inputs)

    def test_tacotron2_output_shape(self):
        r"""Feed tensors with a specific shape to Tacotron2 and validate
        that the outputs have the expected shapes.
        """
        n_batch, n_mels = 16, 80
        max_mel_specgram_length, max_text_length = 300, 100

        tacotron2 = _get_tacotron2_model(n_mels).to(self.device).eval()
        inputs = self._get_inputs(
            n_mels, n_batch, max_mel_specgram_length, max_text_length)
        mel_out, mel_out_postnet, gate_outputs, alignments = tacotron2(*inputs)

        mel_size = (n_batch, n_mels, max_mel_specgram_length)
        assert mel_out.size() == mel_size
        assert mel_out_postnet.size() == mel_size
        assert gate_outputs.size() == (n_batch, max_mel_specgram_length)
        assert alignments.size() == (n_batch, max_mel_specgram_length, max_text_length)

    def test_tacotron2_backward(self):
        r"""Make sure calling the backward function on Tacotron2's outputs does
        not error out. Following:
        https://github.com/pytorch/vision/blob/23b8760374a5aaed53c6e5fc83a7e83dbe3b85df/test/test_models.py#L255
        """
        n_batch, n_mels = 16, 80
        max_mel_specgram_length, max_text_length = 300, 100

        # The model is left in its default mode here (no ``eval()`` call).
        tacotron2 = _get_tacotron2_model(n_mels).to(self.device)
        inputs = self._get_inputs(
            n_mels, n_batch, max_mel_specgram_length, max_text_length)
        mel_out, mel_out_postnet, gate_outputs, _ = tacotron2(*inputs)

        # The graph is retained for all but the last backward pass.
        mel_out.sum().backward(retain_graph=True)
        mel_out_postnet.sum().backward(retain_graph=True)
        gate_outputs.sum().backward()

    def _get_inference_inputs(self, n_batch: int, max_text_length: int):
        r"""Return random ``(text, text_lengths)`` with every length set to
        ``max_text_length``.
        """
        int_opts = dict(dtype=torch.int32, device=self.device)
        text = torch.randint(0, 148, (n_batch, max_text_length), **int_opts)
        text_lengths = torch.full((n_batch,), max_text_length, **int_opts)
        return text, text_lengths

    def test_tacotron2_inference_torchscript_consistency(self):
        r"""Validate the torchscript consistency of Tacotron2 inference function."""
        n_batch, n_mels, max_text_length = 16, 40, 100
        decoder_max_step = 200  # make inference more efficient
        gate_threshold = 0.51  # if set to 0.5, the model will only run one step

        tacotron2 = _get_tacotron2_model(
            n_mels, decoder_max_step=decoder_max_step, gate_threshold=gate_threshold)
        tacotron2 = tacotron2.to(self.device).eval()
        inputs = self._get_inference_inputs(n_batch, max_text_length)

        wrapped = Tacotron2InferenceWrapper(tacotron2)
        self._assert_torchscript_consistency(wrapped, inputs)

    def test_tacotron2_inference_output_shape(self):
        r"""Feed tensors with a specific shape to the Tacotron2 inference
        function and validate that the outputs have the expected shapes.
        """
        n_batch, n_mels, max_text_length = 16, 40, 100
        decoder_max_step = 200  # make inference more efficient
        gate_threshold = 0.51  # if set to 0.5, the model will only run one step

        tacotron2 = _get_tacotron2_model(
            n_mels, decoder_max_step=decoder_max_step, gate_threshold=gate_threshold)
        tacotron2 = tacotron2.to(self.device).eval()
        inputs = self._get_inference_inputs(n_batch, max_text_length)

        mel_out, mel_specgram_lengths, alignments = tacotron2.infer(*inputs)
        # There is no guarantee on exactly what max_mel_specgram_length should be
        # We only know that it should be smaller than model.decoder.decoder_max_step
        max_length = mel_specgram_lengths.max().item()

        assert mel_out.dim() == 3
        assert mel_out.size()[:2] == (n_batch, n_mels, )
        assert mel_out.size(2) == max_length

        assert mel_specgram_lengths.dim() == 1
        assert mel_specgram_lengths.size(0) == n_batch
        assert max_length <= tacotron2.decoder.decoder_max_step

        assert alignments.dim() == 3
        assert alignments.size(0) == n_batch
        assert alignments.size(1) == max_length
        assert alignments.size(2) == max_text_length
import json
import torch
from torchaudio.models.wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
)
from torchaudio.models.wav2vec2.utils import (
import_fairseq_model,
)
from parameterized import parameterized
from torchaudio_unittest.common_utils import (
get_asset_path,
skipIfNoModule,
TorchaudioTestCase,
)
def _load_config(*paths):
    """Load a JSON config from the ``wav2vec2/fairseq`` asset directory.

    Args:
        *paths: Path components below ``wav2vec2/fairseq``; ``.json`` is
            appended to the resolved asset path.

    Returns:
        The parsed JSON content.
    """
    config_path = f'{get_asset_path("wav2vec2", "fairseq", *paths)}.json'
    # Explicit encoding so the read does not depend on the platform's locale.
    with open(config_path, 'r', encoding='utf-8') as file_:
        return json.load(file_)
def _name_func(testcase_func, i, param):
    """Name a parameterized test ``<test name>_<index>_<name of the second argument>``."""
    # param[0] holds the positional arguments; the second one must have a __name__.
    second_arg = param[0][1]
    return '{}_{}_{}'.format(testcase_func.__name__, i, second_arg.__name__)
# Pretraining models
WAV2VEC2_BASE = _load_config("wav2vec_small")
WAV2VEC2_LARGE = _load_config("libri960_big")
WAV2VEC2_LARGE_LV60K = _load_config("wav2vec_vox_new")
WAV2VEC2_XLSR_53_56K = _load_config("xlsr_53_56k")
HUBERT_BASE = _load_config("hubert_base_ls960")
HUBERT_LARGE_LL60K = _load_config("hubert_large_ll60k")
HUBERT_XLARGE_LL60K = _load_config("hubert_xtralarge_ll60k")
# Finetuning models
WAV2VEC2_BASE_960H = _load_config("wav2vec_small_960h")
WAV2VEC2_LARGE_960H = _load_config("wav2vec_large_960h")
WAV2VEC2_LARGE_LV60K_960H = _load_config("wav2vec_large_lv60k_960h")
WAV2VEC2_LARGE_LV60K_SELF_960H = _load_config("wav2vec_large_lv60k_self_960h")
HUBERT_LARGE = _load_config("hubert_large_ll60k_finetune_ls960")
HUBERT_XLARGE = _load_config("hubert_xtralarge_ll60k_finetune_ls960")

# (config, factory function) pairs, shared by the decorators below.
_WAV2VEC2_PRETRAINING_PAIRS = [
    (WAV2VEC2_BASE, wav2vec2_base),
    (WAV2VEC2_LARGE, wav2vec2_large),
    (WAV2VEC2_LARGE_LV60K, wav2vec2_large_lv60k),
    (WAV2VEC2_XLSR_53_56K, wav2vec2_large_lv60k),
]
_HUBERT_PRETRAINING_PAIRS = [
    (HUBERT_BASE, hubert_base),
    (HUBERT_LARGE_LL60K, hubert_large),
    (HUBERT_XLARGE_LL60K, hubert_xlarge),
]
_FINETUNING_PAIRS = [
    (WAV2VEC2_BASE_960H, wav2vec2_base),
    (WAV2VEC2_LARGE_960H, wav2vec2_large),
    (WAV2VEC2_LARGE_LV60K_960H, wav2vec2_large_lv60k),
    (WAV2VEC2_LARGE_LV60K_SELF_960H, wav2vec2_large_lv60k),
    (HUBERT_LARGE, hubert_large),
    (HUBERT_XLARGE, hubert_xlarge),
]

# Config and corresponding factory functions
WAV2VEC2_PRETRAINING_CONFIGS = parameterized.expand(
    _WAV2VEC2_PRETRAINING_PAIRS, name_func=_name_func)
HUBERT_PRETRAINING_CONFIGS = parameterized.expand(
    _HUBERT_PRETRAINING_PAIRS, name_func=_name_func)
# wav2vec2 pairs first, then HuBERT.
ALL_PRETRAINING_CONFIGS = parameterized.expand(
    _WAV2VEC2_PRETRAINING_PAIRS + _HUBERT_PRETRAINING_PAIRS, name_func=_name_func)
FINETUNING_CONFIGS = parameterized.expand(
    _FINETUNING_PAIRS, name_func=_name_func)
@skipIfNoModule('fairseq')
class TestFairseqIntegration(TorchaudioTestCase):
    """Test the process of importing the models from fairseq.

    Test methods in this test suite check the following things
        1. Models loaded with fairseq can be imported.
        2. The same model can be recreated without fairseq.
    """

    def _get_model(self, config, num_out=None):
        """Instantiate the fairseq model selected by ``config['_name']``.

        fairseq and omegaconf are imported here rather than at module scope.

        Raises:
            ValueError: If ``config['_name']`` is not one of the handled names.
        """
        import copy
        from omegaconf import OmegaConf
        from fairseq.models.wav2vec.wav2vec2 import (
            Wav2Vec2Config,
            Wav2Vec2Model,
        )
        from fairseq.models.wav2vec.wav2vec2_asr import (
            Wav2VecEncoder,
            Wav2Vec2CtcConfig,
        )
        from fairseq.models.hubert.hubert_asr import (
            HubertCtcConfig,
            HubertEncoder,
        )
        from fairseq.models.hubert.hubert import (
            HubertModel,
            HubertConfig,
        )
        from fairseq.tasks.hubert_pretraining import HubertPretrainingConfig

        name = config['_name']

        if name in ('wav2vec_ctc', 'hubert_ctc'):
            # Work on a copy so the module-level config is not modified.
            config = copy.deepcopy(config)
            config['w2v_args'] = OmegaConf.create(config['w2v_args'])
            if name == 'wav2vec_ctc':
                return Wav2VecEncoder(Wav2Vec2CtcConfig(**config), num_out)
            return HubertEncoder(HubertCtcConfig(**config), tgt_dict=range(num_out))

        if name == 'wav2vec2':
            return Wav2Vec2Model(Wav2Vec2Config(**config))

        if name == 'hubert':
            dictionaries = [list(range(size)) for size in config['num_classes']]
            model_cfg = HubertConfig(**config['model'])
            task_cfg = HubertPretrainingConfig(**config['task'])
            return HubertModel(model_cfg, task_cfg, dictionaries)

        raise ValueError(f'Unexpected configuration: {name}')

    @WAV2VEC2_PRETRAINING_CONFIGS
    def test_import_wave2vec2_pretraining_model(self, config, _):
        """Wav2vec2 pretraining models from fairseq can be imported and yields the same results"""
        batch_size, num_frames = 3, 1024

        torch.manual_seed(0)
        original = self._get_model(config).eval()
        imported = import_fairseq_model(original).eval()

        x = torch.randn(batch_size, num_frames)
        hyp, _ = imported.extract_features(x)
        refs = original.extract_features(x, padding_mask=torch.zeros_like(x), layer=-1)

        # The reference is transposed on its first two axes before comparing.
        for layer_idx, (ref, _) in enumerate(refs['layer_results']):
            self.assertEqual(hyp[layer_idx], ref.transpose(0, 1))

    @HUBERT_PRETRAINING_CONFIGS
    def test_import_hubert_pretraining_model(self, config, factory_func):
        """HuBERT pretraining models from fairseq can be imported and yields the same results"""
        batch_size, num_frames = 3, 1024

        torch.manual_seed(0)
        original = self._get_model(config).eval()
        imported = import_fairseq_model(original).eval()

        x = torch.randn(batch_size, num_frames)
        mask = torch.zeros_like(x)
        hyp, _ = imported.extract_features(x)

        # check the last layer (a looser atol is used for the xlarge model)
        num_layers = len(original.encoder.layers)
        last_ref, _ = original.extract_features(x, padding_mask=mask, output_layer=num_layers)
        if factory_func is hubert_xlarge:
            atol = 3.0e-05
        else:
            atol = 1.0e-5
        self.assertEqual(hyp[-1], last_ref, atol=atol, rtol=1.3e-6)

        # check the first layer
        first_ref, _ = original.extract_features(x, padding_mask=mask, output_layer=1)
        self.assertEqual(hyp[0], first_ref)

    @ALL_PRETRAINING_CONFIGS
    def test_recreate_pretraining_model(self, config, factory_func):
        """Imported pretraining models can be recreated via a factory function without fairseq."""
        batch_size, num_frames = 3, 1024

        original = self._get_model(config).eval()
        imported = import_fairseq_model(original).eval()

        reloaded = factory_func()
        reloaded.load_state_dict(imported.state_dict())
        reloaded.eval()

        x = torch.randn(batch_size, num_frames)
        lengths = torch.randint(low=0, high=num_frames, size=[batch_size, ])

        # Without mask
        ref, _ = imported(x)
        hyp, _ = reloaded(x)
        self.assertEqual(ref, hyp)

        # With mask
        ref, ref_lengths = imported(x, lengths)
        hyp, hyp_lengths = reloaded(x, lengths)
        self.assertEqual(ref, hyp)
        self.assertEqual(ref_lengths, hyp_lengths)

    @FINETUNING_CONFIGS
    def test_import_finetuning_model(self, config, _):
        """Finetuned wav2vec2 models from fairseq can be imported and yields the same results"""
        num_out = 28
        batch_size, num_frames = 3, 1024

        original = self._get_model(config, num_out).eval()
        imported = import_fairseq_model(original).eval()

        # Without mask
        x = torch.randn(batch_size, num_frames)
        ref = original(x, torch.zeros_like(x))['encoder_out'].transpose(0, 1)
        hyp, _ = imported(x)
        self.assertEqual(ref, hyp)

        # With mask
        lengths = torch.randint(low=0, high=num_frames, size=[batch_size, ])
        # True at frame indices greater than or equal to each item's length.
        frame_index = torch.arange(num_frames).expand(batch_size, num_frames)
        mask = frame_index >= lengths[:, None]
        ref = original(x, mask)['encoder_out'].transpose(0, 1)
        hyp, output_lengths = imported(x, lengths)
        # Compare only the first ``valid`` frames of each item.
        for item, valid in enumerate(output_lengths):
            self.assertEqual(ref[item, :valid, ...], hyp[item, :valid, ...])

    @FINETUNING_CONFIGS
    def test_recreate_finetuning_model(self, config, factory_func):
        """Imported finetuning models can be recreated via a factory function without fairseq."""
        num_out = 28
        batch_size, num_frames = 3, 1024

        original = self._get_model(config, num_out).eval()
        imported = import_fairseq_model(original).eval()

        reloaded = factory_func(aux_num_out=num_out)
        reloaded.load_state_dict(imported.state_dict())
        reloaded.eval()

        # Without mask
        torch.manual_seed(0)
        x = torch.randn(batch_size, num_frames)
        ref, _ = imported(x)
        hyp, _ = reloaded(x)
        self.assertEqual(ref, hyp)

        # With mask
        lengths = torch.randint(low=0, high=num_frames, size=[batch_size, ])
        ref, ref_lengths = imported(x, lengths)
        hyp, hyp_lengths = reloaded(x, lengths)
        self.assertEqual(ref, hyp)
        self.assertEqual(ref_lengths, hyp_lengths)
import json
import torch
from torchaudio.models.wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
)
from torchaudio.models.wav2vec2.utils import import_huggingface_model
from parameterized import parameterized
from torchaudio_unittest.common_utils import (
get_asset_path,
skipIfNoModule,
TorchaudioTestCase,
)
def _load_config(*paths):
    """Load a JSON config from the ``wav2vec2/huggingface`` asset directory.

    Args:
        *paths: Path components below ``wav2vec2/huggingface``; ``.json`` is
            appended to the resolved asset path.

    Returns:
        The parsed JSON content.
    """
    config_path = f'{get_asset_path("wav2vec2", "huggingface", *paths)}.json'
    # Explicit encoding so the read does not depend on the platform's locale.
    with open(config_path, 'r', encoding='utf-8') as file_:
        return json.load(file_)
def _name_func(testcase_func, i, param):
    """Name a parameterized test ``<test name>_<index>_<name of the second argument>``."""
    # param[0] holds the positional arguments; the second one must have a __name__.
    second_arg = param[0][1]
    return "{}_{}_{}".format(testcase_func.__name__, i, second_arg.__name__)
def _facebook(model_name):
    """Load the config stored under the ``facebook`` asset sub-directory."""
    return _load_config('facebook', model_name)


# Pretrained
HF_BASE = _facebook('wav2vec2-base')
HF_LARGE = _facebook('wav2vec2-large')
HF_LARGE_LV60 = _facebook('wav2vec2-large-lv60')
HF_LARGE_XLSR_53 = _facebook('wav2vec2-large-xlsr-53')
HF_BASE_10K_VOXPOPULI = _facebook('wav2vec2-base-10k-voxpopuli')
# Finetuned
HF_BASE_960H = _facebook('wav2vec2-base-960h')
HF_LARGE_960H = _facebook('wav2vec2-large-960h')
HF_LARGE_LV60_960H = _facebook('wav2vec2-large-960h-lv60')
HF_LARGE_LV60_SELF_960H = _facebook('wav2vec2-large-960h-lv60-self')
HF_LARGE_XLSR_DE = _facebook('wav2vec2-large-xlsr-53-german')

# Config and corresponding factory functions
_PRETRAIN_PAIRS = [
    (HF_BASE, wav2vec2_base),
    (HF_LARGE, wav2vec2_large),
    (HF_LARGE_LV60, wav2vec2_large_lv60k),
    (HF_LARGE_XLSR_53, wav2vec2_large_lv60k),
    (HF_BASE_10K_VOXPOPULI, wav2vec2_base),
]
_FINETUNE_PAIRS = [
    (HF_BASE_960H, wav2vec2_base),
    (HF_LARGE_960H, wav2vec2_large),
    (HF_LARGE_LV60_960H, wav2vec2_large_lv60k),
    (HF_LARGE_LV60_SELF_960H, wav2vec2_large_lv60k),
    (HF_LARGE_XLSR_DE, wav2vec2_large_lv60k),
]
PRETRAIN_CONFIGS = parameterized.expand(_PRETRAIN_PAIRS, name_func=_name_func)
FINETUNE_CONFIGS = parameterized.expand(_FINETUNE_PAIRS, name_func=_name_func)
@skipIfNoModule('transformers')
class TestHFIntegration(TorchaudioTestCase):
"""Test the process of importing the models from Hugging Face Transformers
Test methods in this test suite check the following things
1. Models loaded with Hugging Face Transformers can be imported.
2. The same model can be recreated without Hugging Face Transformers.
"""
def _get_model(self, config):
    """Instantiate the Hugging Face model named by ``config['architectures']``.

    Raises:
        ValueError: If the architecture list is neither ``['Wav2Vec2Model']``
            nor ``['Wav2Vec2ForCTC']``.
    """
    # Helper function to avoid importing transformers on module scope.
    # Normally, we use `is_module_available` helper function to check if
    # the library is available, and import it on module scope if available.
    # However, somehow, once "transformers" is imported, `is_module_available`
    # starts to fail. Therefore, we defer importing "transformers" until
    # the actual tests are started.
    from transformers.models.wav2vec2 import (
        Wav2Vec2Config,
        Wav2Vec2Model,
        Wav2Vec2ForCTC,
    )
    architectures = config['architectures']
    if architectures == ['Wav2Vec2Model']:
        model_cls = Wav2Vec2Model
    elif architectures == ['Wav2Vec2ForCTC']:
        model_cls = Wav2Vec2ForCTC
    else:
        raise ValueError(f'Unexpected arch: {architectures}')
    return model_cls(Wav2Vec2Config(**config))
def _test_import_pretrain(self, original, imported, config):
    """Compare ``original`` and ``imported`` component by component on
    random inputs, after seeding the RNG with 0.
    """
    torch.manual_seed(0)
    hidden_size_key = "hidden_size"

    # FeatureExtractor
    waveform = torch.randn(3, 1024)
    ref = original.feature_extractor(waveform).transpose(1, 2)
    hyp, _ = imported.feature_extractor(waveform, None)
    self.assertEqual(ref, hyp)

    # Feature projection
    features = torch.randn(3, 10, config['conv_dim'][-1])
    ref = original.feature_projection(features)[0]
    hyp = imported.encoder.feature_projection(features)
    self.assertEqual(ref, hyp)

    # Convolutional Positional Encoder
    hidden = torch.randn(3, 256, config[hidden_size_key])
    ref = original.encoder.pos_conv_embed(hidden)
    hyp = imported.encoder.transformer.pos_conv_embed(hidden)
    self.assertEqual(ref, hyp)

    # Encoder Transformer Layer
    layer_pairs = zip(original.encoder.layers, imported.encoder.transformer.layers)
    for original_layer, imported_layer in layer_pairs:
        b, l, e = 16, 3, config[hidden_size_key]
        layer_input = torch.randn(b, l, e)
        mask = torch.randn(b, 1, l, l)
        (ref,) = original_layer(layer_input, attention_mask=mask, output_attentions=False)
        hyp = imported_layer(layer_input, mask)
        self.assertEqual(ref, hyp)

    # The whole Encoder Transformer
    b, l, e = 16, 3, config[hidden_size_key]
    encoder_input = torch.randn(b, l, e)
    ref = original.encoder(encoder_input).last_hidden_state
    hyp = imported.encoder.transformer(encoder_input)
    self.assertEqual(ref, hyp)
def _test_import_finetune(self, original, imported, config):
    """Compare the fine-tuning head and full forward pass of both models.

    Args:
        original: Source model exposing ``lm_head`` and returning an object
            with ``.logits`` when called.
        imported: Converted model exposing ``aux`` and returning a pair
            whose first item is compared against the logits.
        config (dict): Read for ``'hidden_size'``.
    """
    # Aux head against the original's lm_head.
    head_input = torch.randn(3, 10, config["hidden_size"])
    expected = original.lm_head(head_input)
    actual = imported.aux(head_input)
    self.assertEqual(expected, actual)

    batch_size, num_frames = 3, 1024

    # The whole model without mask. This is checked twice, each time with a
    # freshly drawn batch.
    for _ in range(2):
        waveform = torch.randn(batch_size, num_frames)
        expected = original(waveform).logits
        actual, _ = imported(waveform)
        self.assertEqual(expected, actual)

    # The whole model with mask: the original takes a boolean attention mask,
    # the imported model takes the per-sample lengths it was derived from.
    waveform = torch.randn(batch_size, num_frames)
    lengths = torch.randint(low=0, high=num_frames, size=[batch_size, ])
    frame_index = torch.arange(num_frames).expand(batch_size, num_frames)
    mask = frame_index < lengths[:, None]
    expected = original(waveform, attention_mask=mask).logits
    actual, output_lengths = imported(waveform, lengths)
    # Only the first `valid` output frames of each sample are compared.
    for row, valid in enumerate(output_lengths):
        self.assertEqual(expected[row, :valid, ...], actual[row, :valid, ...])
@PRETRAIN_CONFIGS
def test_import_pretrain(self, config, _):
    """wav2vec2 models from HF transformers can be imported and yield the same results"""
    hf_model = self._get_model(config)
    hf_model = hf_model.eval()
    converted = import_huggingface_model(hf_model)
    converted = converted.eval()
    self._test_import_pretrain(hf_model, converted, config)
@FINETUNE_CONFIGS
def test_import_finetune(self, config, _):
    """wav2vec2 models from HF transformers can be imported and yield the same results"""
    hf_model = self._get_model(config)
    hf_model = hf_model.eval()
    converted = import_huggingface_model(hf_model)
    converted = converted.eval()
    # The pre-training checks run against the wrapped `wav2vec2` sub-model,
    # then the fine-tuning checks run against the full model.
    self._test_import_pretrain(hf_model.wav2vec2, converted, config)
    self._test_import_finetune(hf_model, converted, config)
def _test_recreate(self, imported, reloaded, config):
    """Check that ``reloaded`` reproduces ``imported`` component by component.

    Both models are driven with the same random inputs (RNG seeded once at
    the start) and every pair of outputs is compared with ``assertEqual``.

    Args:
        imported: Reference model.
        reloaded: Model expected to behave identically to ``imported``.
        config (dict): Read for ``'conv_dim'`` and ``'hidden_size'``.
    """
    torch.manual_seed(0)

    # Feature extractor.
    waveform = torch.randn(3, 1024)
    expected, _ = imported.feature_extractor(waveform, None)
    actual, _ = reloaded.feature_extractor(waveform, None)
    self.assertEqual(expected, actual)

    # Feature projection.
    conv_features = torch.randn(3, 10, config['conv_dim'][-1])
    expected = imported.encoder.feature_projection(conv_features)
    actual = reloaded.encoder.feature_projection(conv_features)
    self.assertEqual(expected, actual)

    # Convolutional positional encoder.
    hidden_size = config['hidden_size']
    projected = torch.randn(3, 256, hidden_size)
    expected = imported.encoder.transformer.pos_conv_embed(projected)
    actual = reloaded.encoder.transformer.pos_conv_embed(projected)
    self.assertEqual(expected, actual)

    # Each encoder transformer layer, paired up in order.
    batch, length = 16, 3
    layer_pairs = zip(
        imported.encoder.transformer.layers,
        reloaded.encoder.transformer.layers,
    )
    for ref_layer, new_layer in layer_pairs:
        layer_input = torch.randn(batch, length, hidden_size)
        attn_mask = torch.randn(batch, 1, length, length)
        expected = ref_layer(layer_input, attn_mask)
        actual = new_layer(layer_input, attn_mask)
        self.assertEqual(expected, actual)

    # The whole encoder transformer.
    # TODO: Add mask pattern. Expected mask shapes and values are different.
    encoder_input = torch.randn(batch, length, hidden_size)
    # This mask is drawn but not passed to either transformer call below
    # (see the TODO); the draw still advances the RNG for later inputs.
    mask = torch.randn(batch, 1, length, length)
    expected = imported.encoder.transformer(encoder_input)
    actual = reloaded.encoder.transformer(encoder_input)
    self.assertEqual(expected, actual)

    # Aux head, only when the imported model has one.
    if imported.aux is not None:
        head_input = torch.randn(3, 10, hidden_size)
        expected = imported.aux(head_input)
        actual = reloaded.aux(head_input)
        self.assertEqual(expected, actual)

    # The whole model.
    waveform = torch.randn(3, 1024)
    expected, _ = imported(waveform)
    actual, _ = reloaded(waveform)
    self.assertEqual(expected, actual)
@PRETRAIN_CONFIGS
def test_recreate_pretrain(self, config, factory_func):
    """Imported models can be recreated via a factory function without Hugging Face transformers."""
    hf_model = self._get_model(config)
    imported = import_huggingface_model(hf_model).eval()
    # Build a fresh model from the factory and copy the imported weights in.
    reloaded = factory_func()
    weights = imported.state_dict()
    reloaded.load_state_dict(weights)
    reloaded.eval()
    self._test_recreate(imported, reloaded, config)
@FINETUNE_CONFIGS
def test_recreate_finetune(self, config, factory_func):
    """Imported models can be recreated via a factory function without Hugging Face transformers."""
    hf_model = self._get_model(config)
    imported = import_huggingface_model(hf_model).eval()
    # The factory's aux head is sized to match the imported model's aux layer.
    aux_size = imported.aux.out_features
    reloaded = factory_func(aux_num_out=aux_size)
    weights = imported.state_dict()
    reloaded.load_state_dict(weights)
    reloaded.eval()
    self._test_recreate(imported, reloaded, config)
import os
import torch
import torch.nn.functional as F
from torchaudio.models.wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
)
from torchaudio_unittest.common_utils import (
TorchaudioTestCase,
skipIfNoQengine,
skipIfNoCuda,
torch_script,
)
from parameterized import parameterized
def _name_func(testcase_func, i, param):
return f"{testcase_func.__name__}_{i}_{param[0][0].__name__}"
# Decorator that expands a test into one case per model factory function;
# each case receives the factory as its only argument and is named by
# `_name_func`.
factory_funcs = parameterized.expand(
    [
        (factory, )
        for factory in (
            wav2vec2_base,
            wav2vec2_large,
            wav2vec2_large_lv60k,
            hubert_base,
            hubert_large,
            hubert_xlarge,
        )
    ],
    name_func=_name_func,
)
class TestWav2Vec2Model(TorchaudioTestCase):
    """Smoke, feature-extraction, batch-consistency, zero-length, TorchScript
    and dynamic-quantization tests for models built by the factory functions.
    """

    def _smoke_test(self, model, device, dtype):
        """Run a single forward pass of ``model`` on random input.

        The model is moved to ``device``/``dtype`` and switched to eval mode;
        the output is not inspected, the call only has to succeed.
        """
        model = model.to(device=device, dtype=dtype).eval()

        torch.manual_seed(0)
        batch_size, num_frames = 3, 1024
        waveforms = torch.randn(
            batch_size, num_frames, device=device, dtype=dtype)
        lengths = torch.randint(
            low=0, high=num_frames, size=[batch_size, ], device=device)

        model(waveforms, lengths)

    @parameterized.expand([(torch.float32, ), (torch.float64, )])
    def test_cpu_smoke_test(self, dtype):
        """Forward pass works on CPU, without and with an aux head."""
        for factory_kwargs in ({}, {'aux_num_out': 32}):
            model = wav2vec2_base(**factory_kwargs)
            self._smoke_test(model, torch.device('cpu'), dtype)

    @parameterized.expand([(torch.float32, ), (torch.float64, )])
    @skipIfNoCuda
    def test_cuda_smoke_test(self, dtype):
        """Forward pass works on CUDA, without and with an aux head."""
        for factory_kwargs in ({}, {'aux_num_out': 32}):
            model = wav2vec2_base(**factory_kwargs)
            self._smoke_test(model, torch.device('cuda'), dtype)

    def _feature_extractor_test(self, model):
        """Check the number and shapes of features from ``extract_features``."""
        batch_size, num_frames = 3, 1024

        model.eval()
        num_layers = len(model.encoder.transformer.layers)

        torch.manual_seed(0)
        waveforms = torch.randn(batch_size, num_frames)
        lengths = torch.randint(low=0, high=num_frames, size=[batch_size, ])

        # Not providing num_layers returns all the intermediate features from
        # transformer layers
        all_features, out_lengths = model.extract_features(waveforms, lengths, num_layers=None)
        assert len(all_features) == num_layers
        for layer_features in all_features:
            assert layer_features.ndim == 3
            assert layer_features.shape[0] == batch_size
        assert out_lengths.shape == torch.Size([batch_size])

        # Limiting the number of layers to `depth` must give the first
        # `depth` entries of the full result.
        for depth in range(1, num_layers + 1):
            partial, out_lengths = model.extract_features(waveforms, lengths, num_layers=depth)
            assert len(partial) == depth
            for full_item, partial_item in zip(all_features, partial):
                self.assertEqual(full_item, partial_item)
            assert out_lengths.shape == torch.Size([batch_size])

    @factory_funcs
    def test_extract_feature(self, factory_func):
        """`extract_features` method does not fail"""
        self._feature_extractor_test(factory_func(aux_num_out=32))

    def _test_batch_consistency(self, model):
        """Compare batched outputs with per-sample outputs as probabilities."""
        model.eval()
        batch_size, max_frames = 5, 5 * 1024
        torch.manual_seed(0)
        waveforms = torch.randn(batch_size, max_frames)
        # NOTE(review): these lengths are 3200, 6400, ..., 16000 while the
        # waveforms only hold 5 * 1024 = 5120 frames, so every length but the
        # first exceeds the data and the slices below are clamped. Confirm
        # this is intended.
        input_lengths = torch.tensor([3200 * k for k in range(1, 6)])

        # Batch process with lengths
        batch_logits, output_lengths = model(waveforms, input_lengths)
        for idx in range(batch_size):
            row = slice(idx, idx + 1)
            # Per-sample process without feeding length
            single_logit, _ = model(waveforms[row, :input_lengths[idx]], None)
            batch_logit = batch_logits[row, :output_lengths[idx]]

            # Convert to probability so that it's easier to interpret the diff
            single_prob = F.softmax(single_logit, dim=2)
            batch_prob = F.softmax(batch_logit, dim=2)
            # We allow max atol=0.005 -> 0.5%
            self.assertEqual(single_prob, batch_prob, atol=0.005, rtol=0)

    @factory_funcs
    def test_pretrain_batch_consistency(self, factory_func):
        """Results from single process and batched process should be reasonably close
        """
        self._test_batch_consistency(factory_func())

    @factory_funcs
    def test_finetune_batch_consistency(self, factory_func):
        """Results from single process and batched process should be reasonably close
        """
        self._test_batch_consistency(factory_func(aux_num_out=32))

    def _test_zero_length(self, model):
        """Both ``model(...)`` and ``extract_features`` report all-zero output
        lengths when every input length is zero."""
        model.eval()
        torch.manual_seed(0)
        batch_size = 3
        waveforms = torch.randn(batch_size, 1024)
        input_lengths = torch.zeros(batch_size)
        for forward in (model, model.extract_features):
            _, output_lengths = forward(waveforms, input_lengths)
            self.assertEqual(torch.zeros_like(output_lengths), output_lengths)

    @factory_funcs
    def test_pretrain_zero_length(self, factory_func):
        """Passing zero length should not fail"""
        self._test_zero_length(factory_func())

    @factory_funcs
    def test_finetune_zero_length(self, factory_func):
        """Passing zero length should not fail"""
        self._test_zero_length(factory_func(aux_num_out=32))

    def _test_torchscript(self, model):
        """Scripted model must reproduce the eager model's outputs and lengths."""
        model.eval()

        batch_size, num_frames = 3, 1024

        torch.manual_seed(0)
        waveforms = torch.randn(batch_size, num_frames)
        lengths = torch.randint(low=0, high=num_frames, size=[batch_size, ])

        eager_out, eager_len = model(waveforms, lengths)

        scripted = torch_script(model)
        scripted_out, scripted_len = scripted(waveforms, lengths)

        self.assertEqual(scripted_out, eager_out)
        self.assertEqual(scripted_len, eager_len)

    def _skip_hubert_xlarge_on_windows_ci(self, factory_func):
        """Skip the current test for ``hubert_xlarge`` when running on Windows CI."""
        on_windows_ci = os.name == 'nt' and os.environ.get('CI') == 'true'
        if factory_func is hubert_xlarge and on_windows_ci:
            self.skipTest(
                'hubert_xlarge is known to fail on Windows CI. '
                'See https://github.com/pytorch/pytorch/issues/65776')

    @factory_funcs
    def test_pretrain_torchscript(self, factory_func):
        """Wav2Vec2Model should be scriptable"""
        self._skip_hubert_xlarge_on_windows_ci(factory_func)
        self._test_torchscript(factory_func())

    @factory_funcs
    def test_finetune_torchscript(self, factory_func):
        """Wav2Vec2Model should be scriptable"""
        self._skip_hubert_xlarge_on_windows_ci(factory_func)
        self._test_torchscript(factory_func(aux_num_out=32))

    def _quantize_linear_layers(self, model):
        """Return a dynamically quantized (qint8, ``torch.nn.Linear``) copy of ``model``."""
        # Remove the weight normalization forward hook
        model.encoder.transformer.pos_conv_embed.__prepare_scriptable__()
        quantized = torch.quantization.quantize_dynamic(
            model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)

        # A lazy way to check that Modules are different
        assert str(quantized) != str(model), "Dynamic quantization did not modify the module."
        return quantized

    def _test_quantize_smoke_test(self, model):
        """A dynamically quantized model can run a forward pass."""
        model.eval()
        batch_size, num_frames = 3, 1024

        quantized = self._quantize_linear_layers(model)

        torch.manual_seed(0)
        waveforms = torch.randn(batch_size, num_frames)
        lengths = torch.randint(low=0, high=num_frames, size=[batch_size, ])

        _, _ = quantized(waveforms, lengths)

    @factory_funcs
    @skipIfNoQengine
    def test_quantize(self, factory_func):
        """Wav2Vec2Model should support basic quantization"""
        self._test_quantize_smoke_test(factory_func(aux_num_out=32))

    def _test_quantize_torchscript(self, model):
        """A scripted quantized model must reproduce the quantized model's outputs."""
        model.eval()

        batch_size, num_frames = 3, 1024

        quantized = self._quantize_linear_layers(model)

        torch.manual_seed(0)
        waveforms = torch.randn(batch_size, num_frames)
        lengths = torch.randint(low=0, high=num_frames, size=[batch_size, ])

        eager_out, eager_len = quantized(waveforms, lengths)

        # Script
        scripted = torch_script(quantized)
        scripted_out, scripted_len = scripted(waveforms, lengths)

        self.assertEqual(scripted_out, eager_out)
        self.assertEqual(scripted_len, eager_len)

    @factory_funcs
    @skipIfNoQengine
    def test_quantize_torchscript(self, factory_func):
        """Quantized Wav2Vec2Model should be scriptable"""
        self._test_quantize_torchscript(factory_func(aux_num_out=32))
import json
from parameterized import param
from torchaudio_unittest.common_utils import get_asset_path
def name_func(func, _, params):
    """Build a test name from the function name and its parameters.

    If the first positional parameter is a string, all positional parameters
    are joined; otherwise the items of the first positional parameter are
    joined. Items are converted with ``str`` and separated by ``"_"``.

    Args:
        func: The test function being expanded.
        _: Unused (the parameter-set index).
        params: Object whose ``args`` attribute holds the positional
            parameters.

    Returns:
        str: ``"<func name>_<joined parameters>"``.
    """
    first = params.args[0]
    pieces = params.args if isinstance(first, str) else first
    suffix = "_".join(map(str, pieces))
    return "{}_{}".format(func.__name__, suffix)
def load_params(*paths):
    """Load test parameters from a file with one JSON document per line.

    Every argument of every entry in a document's ``'effects'`` list that
    starts with ``"<ASSET_DIR>"`` has that placeholder replaced by the value
    of ``get_asset_path()``.

    Args:
        *paths: Path components forwarded to ``get_asset_path`` to locate
            the file.

    Returns:
        list: One ``param`` per line, each wrapping the parsed document.
    """
    placeholder = "<ASSET_DIR>"
    loaded = []
    with open(get_asset_path(*paths), 'r') as file:
        for record in map(json.loads, file):
            for effect in record['effects']:
                # Rewrite the list in place so `record` sees the change.
                effect[:] = [
                    arg.replace(placeholder, get_asset_path())
                    if arg.startswith(placeholder) else arg
                    for arg in effect
                ]
            loaded.append(param(record))
    return loaded
import sys
import platform
from unittest import skipIf
from typing import List, Tuple
from concurrent.futures import ProcessPoolExecutor
import numpy as np
import torch
import torchaudio
from torchaudio_unittest.common_utils import (
TempDirMixin,
PytorchTestCase,
skipIfNoSox,
get_whitenoise,
save_wav,
)
class RandomPerturbationFile(torch.utils.data.Dataset):
    """Dataset that applies a random speed perturbation to each listed file.

    ``rng`` is ``None`` after construction and must be set to an object with
    a ``uniform`` method before items are fetched (``init_random_seed`` does
    this per DataLoader worker).
    """

    def __init__(self, flist: List[str], sample_rate: int):
        super().__init__()
        # Paths of the audio files to load.
        self.flist = flist
        # Target rate passed to the sox `rate` effect.
        self.sample_rate = sample_rate
        # Random source for the speed factor; assigned later.
        self.rng = None

    def __getitem__(self, index):
        """Return the audio of file ``index`` after the sox effect chain."""
        factor = self.rng.uniform(0.5, 2.0)
        chain = [
            ['gain', '-n', '-10'],
            # duration of data is 0.5 ~ 2.0 seconds.
            ['speed', f'{factor:.5f}'],
            ['rate', f'{self.sample_rate}'],
            # add 1.5 seconds silence at the end
            ['pad', '0', '1.5'],
            # get the first 2 seconds
            ['trim', '0', '2'],
        ]
        path = self.flist[index]
        waveform, _ = torchaudio.sox_effects.apply_effects_file(path, chain)
        return waveform

    def __len__(self):
        """Return the number of files."""
        return len(self.flist)
class RandomPerturbationTensor(torch.utils.data.Dataset):
    """Dataset that applies a random speed perturbation to (synthetic) Tensor data.

    ``rng`` is ``None`` after construction and must be set to an object with
    a ``uniform`` method before items are fetched (``init_random_seed`` does
    this per DataLoader worker).
    """

    def __init__(self, signals: List[Tuple[torch.Tensor, int]], sample_rate: int):
        super().__init__()
        # (tensor, sample rate) pairs to process.
        self.signals = signals
        # Target rate passed to the sox `rate` effect.
        self.sample_rate = sample_rate
        # Random source for the speed factor; assigned later.
        self.rng = None

    def __getitem__(self, index):
        """Return signal ``index`` after the sox effect chain."""
        factor = self.rng.uniform(0.5, 2.0)
        chain = [
            ['gain', '-n', '-10'],
            # duration of data is 0.5 ~ 2.0 seconds.
            ['speed', f'{factor:.5f}'],
            ['rate', f'{self.sample_rate}'],
            # add 1.5 seconds silence at the end
            ['pad', '0', '1.5'],
            # get the first 2 seconds
            ['trim', '0', '2'],
        ]
        waveform, source_rate = self.signals[index]
        processed, _ = torchaudio.sox_effects.apply_effects_tensor(waveform, source_rate, chain)
        return processed

    def __len__(self):
        """Return the number of signals."""
        return len(self.signals)
def init_random_seed(worker_id):
    """Give the current DataLoader worker's dataset its own random generator.

    Looks up the dataset through ``torch.utils.data.get_worker_info()`` and
    stores a ``numpy.random.RandomState`` seeded with ``worker_id`` in its
    ``rng`` attribute.

    Args:
        worker_id (int): Worker index, used as the seed.
    """
    worker_info = torch.utils.data.get_worker_info()
    worker_dataset = worker_info.dataset
    worker_dataset.rng = np.random.RandomState(worker_id)
@skipIfNoSox
@skipIf(
    platform.system() == 'Darwin' and sys.version_info[:2] in ((3, 6), (3, 7)),
    'This test is known to get stuck for macOS with Python < 3.8. '
    'See https://github.com/pytorch/pytorch/issues/46409'
)
class TestSoxEffectsDataset(TempDirMixin, PytorchTestCase):
    """Test `apply_effects_file` and `apply_effects_tensor` in multi-process
    dataloader setting"""

    def _generate_dataset(self, num_samples=128):
        """Write ``num_samples`` white-noise wav files and return their paths.

        Sample rate and dtype are picked at random for each file.
        """
        paths = []
        for idx in range(num_samples):
            rate = np.random.choice([8000, 16000, 44100])
            dtype = np.random.choice(['float32', 'int32', 'int16', 'uint8'])
            noise = get_whitenoise(n_channels=2, sample_rate=rate, duration=1, dtype=dtype)
            wav_path = self.get_temp_path(f'{idx:03d}_{dtype}_{rate}.wav')
            save_wav(wav_path, noise, rate)
            paths.append(wav_path)
        return paths

    def _generate_signals(self, num_samples=128):
        """Return ``num_samples`` (white noise, sample rate) pairs.

        The sample rate is picked at random for each signal.
        """
        pairs = []
        for _ in range(num_samples):
            rate = np.random.choice([8000, 16000, 44100])
            noise = get_whitenoise(
                n_channels=2, sample_rate=rate, duration=1, dtype='float32')
            pairs.append((noise, rate))
        return pairs

    def _assert_batch_shapes(self, dataset, sample_rate):
        """Iterate ``dataset`` through a 16-worker DataLoader and check that
        every batch is 32 items of 2 channels by ``2 * sample_rate`` frames."""
        loader = torch.utils.data.DataLoader(
            dataset, batch_size=32, num_workers=16,
            worker_init_fn=init_random_seed,
        )
        for batch in loader:
            assert batch.shape == (32, 2, 2 * sample_rate)

    def test_apply_effects_file(self):
        """File-based dataset yields batches of the expected shape."""
        sample_rate = 12000
        paths = self._generate_dataset()
        dataset = RandomPerturbationFile(paths, sample_rate)
        self._assert_batch_shapes(dataset, sample_rate)

    def test_apply_effects_tensor(self):
        """Tensor-based dataset yields batches of the expected shape."""
        sample_rate = 12000
        pairs = self._generate_signals()
        dataset = RandomPerturbationTensor(pairs, sample_rate)
        self._assert_batch_shapes(dataset, sample_rate)
def speed(path):
    """Load ``path`` with the sox_io backend and apply speed + rate effects.

    Kept as a module-level function: it is submitted to a
    ``ProcessPoolExecutor`` by the test that uses it.

    Args:
        path: Path of the audio file to load.

    Returns:
        The first item returned by ``apply_effects_tensor``.
    """
    waveform, sample_rate = torchaudio.backend.sox_io_backend.load(path)
    chain = [
        ['speed', '1.03756523535464655'],
        # resample back to the file's own rate
        ['rate', f'{sample_rate}'],
    ]
    processed = torchaudio.sox_effects.apply_effects_tensor(waveform, sample_rate, chain)
    return processed[0]
@skipIfNoSox
class TestProcessPoolExecutor(TempDirMixin, PytorchTestCase):
    """Run `speed` on several files inside a worker process."""

    backend = "sox_io"

    def setUp(self):
        """Create ten one-second, single-channel white-noise wav files."""
        # NOTE(review): this override does not call `super().setUp()`, so any
        # base-class fixture setup (presumably whatever consumes `backend`)
        # is skipped. Confirm this is intended.
        sample_rate = 16000
        self.flist = []
        for i in range(10):
            path = self.get_temp_path(f'{i}.wav')
            data = get_whitenoise(n_channels=1, sample_rate=sample_rate, duration=1, dtype='float')
            save_wav(path, data, sample_rate)
            self.flist.append(path)

    def test_executor(self):
        """Test that apply_effects_tensor with speed + rate does not crash
        https://github.com/pytorch/audio/issues/1021
        """
        # The context manager shuts the pool down when the block exits, even
        # if a result raises, so the worker process is not leaked.
        with ProcessPoolExecutor(1) as executor:
            futures = [executor.submit(speed, path) for path in self.flist]
            for future in futures:
                # `result()` re-raises any exception from the worker.
                future.result()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment