test_batch_consistency.py 10.7 KB
Newer Older
1
2
3
"""Test numerical consistency among single input and batched input."""
import unittest

4
import platform
5
6
7
8
9
10
11
import torch
import torchaudio
import torchaudio.functional as F

import common_utils


12
13
def _test_batch_consistency(functional, tensor, *args, batch_size=1, atol=1e-8, rtol=1e-5, **kwargs):
    # run then batch the result
14
15
    torch.random.manual_seed(42)
    expected = functional(tensor.clone(), *args, **kwargs)
16
    expected = expected.repeat([batch_size] + [1] * expected.dim())
17

18
    # batch the input and run
19
    torch.random.manual_seed(42)
20
21
    pattern = [batch_size] + [1] * tensor.dim()
    computed = functional(tensor.repeat(pattern), *args, **kwargs)
22

23
    torch.testing.assert_allclose(computed, expected, rtol=rtol, atol=atol)
24
25


moto's avatar
moto committed
26
def _test_batch(functional, tensor, *args, atol=1e-8, rtol=1e-5, **kwargs):
    """Run the batch-consistency check on ``functional`` for batch sizes 1 and 3.

    Args:
        functional: Callable under test, forwarded to ``_test_batch_consistency``.
        tensor: Un-batched input tensor.
        *args: Extra positional arguments forwarded to ``functional``.
        atol: Absolute tolerance used for the comparison.
        rtol: Relative tolerance used for the comparison.
        **kwargs: Extra keyword arguments forwarded to ``functional``.
    """
    for batch_size in (1, 3):
        _test_batch_consistency(functional, tensor, *args, batch_size=batch_size, atol=atol, rtol=rtol, **kwargs)
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55


class TestFunctional(unittest.TestCase):
    """Test functions defined in `functional` module"""
    def test_griffinlim(self):
        # Batch consistency of F.griffinlim on a random (1, 201, 6) input.
        n_fft = 400
        ws = 400
        hop = 200
        window = torch.hann_window(ws)
        power = 2
        normalize = False
        momentum = 0.99
        n_iter = 32
        length = 1000
        tensor = torch.rand((1, 201, 6))
        # NOTE(review): the trailing positional ``0`` is presumably ``rand_init``
        # -- confirm against the F.griffinlim signature in use.
        _test_batch(
            F.griffinlim, tensor, window, n_fft, hop, ws, power, normalize, n_iter, momentum, length, 0, atol=5e-5
        )

    def test_detect_pitch_frequency(self):
        # Batch consistency of F.detect_pitch_frequency on several audio assets.
        filenames = [
            'steam-train-whistle-daniel_simon.wav',  # 2ch 44100Hz
            # Files from https://www.mediacollege.com/audio/tone/download/
            '100Hz_44100Hz_16bit_05sec.wav',  # 1ch
            '440Hz_44100Hz_16bit_05sec.wav',  # 1ch
        ]
        for filename in filenames:
            # subTest makes a failure report name the offending file.
            with self.subTest(filename=filename):
                filepath = common_utils.get_asset_path(filename)
                waveform, sample_rate = torchaudio.load(filepath)
                _test_batch(F.detect_pitch_frequency, waveform, sample_rate)

    def test_istft(self):
        # Batch consistency of F.istft on a small hand-written spectrogram.
        stft = torch.tensor([
            [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]],
            [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]],
            [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]]
        ])
        _test_batch(F.istft, stft, n_fft=4, length=4)

    def test_contrast(self):
        # Random waveform with values in [-0.5, 0.5).
        waveform = torch.rand(2, 100) - 0.5
        _test_batch(F.contrast, waveform, enhancement_amount=80.)

    def test_dcshift(self):
        # Random waveform with values in [-0.5, 0.5).
        waveform = torch.rand(2, 100) - 0.5
        _test_batch(F.dcshift, waveform, shift=0.5, limiter_gain=0.05)

    def test_overdrive(self):
        # Random waveform with values in [-0.5, 0.5).
        waveform = torch.rand(2, 100) - 0.5
        _test_batch(F.overdrive, waveform, gain=45, colour=30)

    def test_phaser(self):
        # Batch consistency of F.phaser on the white-noise asset.
        filepath = common_utils.get_asset_path("whitenoise.wav")
        waveform, sample_rate = torchaudio.load(filepath)
        _test_batch(F.phaser, waveform, sample_rate)

    def test_sliding_window_cmn(self):
        # Batch consistency of F.sliding_window_cmn for every combination of
        # the ``center`` and ``norm_vars`` flags.
        waveform = torch.randn(2, 1024) - 0.5
        for center in (True, False):
            for norm_vars in (True, False):
                # subTest makes a failure report name the failing combination.
                with self.subTest(center=center, norm_vars=norm_vars):
                    _test_batch(F.sliding_window_cmn, waveform, center=center, norm_vars=norm_vars)

    def test_vad(self):
        # Batch consistency of F.vad on a speech asset.
        filepath = common_utils.get_asset_path("vad-go-mono-32000.wav")
        waveform, sample_rate = torchaudio.load(filepath)
        _test_batch(F.vad, waveform, sample_rate=sample_rate)
96

97
98
99
100
101
102
103
104
105
106
107
108

class TestTransforms(unittest.TestCase):
    """Test suite for classes defined in `transforms` module"""
    # Every test follows the same recipe: apply the transform to a single input
    # and tile the result three times along a new leading dimension, then tile
    # the input the same way and apply the transform, and finally compare the
    # two results.

    def test_batch_AmplitudeToDB(self):
        spec = torch.rand((6, 201))

        # Single then transform then batch
        expected = torchaudio.transforms.AmplitudeToDB()(spec).repeat(3, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.AmplitudeToDB()(spec.repeat(3, 1, 1))

        torch.testing.assert_allclose(computed, expected)

    def test_batch_Resample(self):
        waveform = torch.randn(2, 2786)

        # Single then transform then batch
        expected = torchaudio.transforms.Resample()(waveform).repeat(3, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.Resample()(waveform.repeat(3, 1, 1))

        torch.testing.assert_allclose(computed, expected)

    def test_batch_MelScale(self):
        specgram = torch.randn(2, 31, 2786)

        # Single then transform then batch
        expected = torchaudio.transforms.MelScale()(specgram).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.MelScale()(specgram.repeat(3, 1, 1, 1))

        # shape = (3, 2, n_mels, 2786), with n_mels being the MelScale default
        # -- TODO confirm against the torchaudio version in use
        torch.testing.assert_allclose(computed, expected)

    def test_batch_InverseMelScale(self):
        n_mels = 32
        n_stft = 5
        mel_spec = torch.randn(2, n_mels, 32) ** 2

        # Single then transform then batch
        expected = torchaudio.transforms.InverseMelScale(n_stft, n_mels)(mel_spec).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.InverseMelScale(n_stft, n_mels)(mel_spec.repeat(3, 1, 1, 1))

        # shape = (3, 2, n_stft, 32)

        # InverseMelScale runs SGD on randomly initialized values, so the two runs
        # do not yield exactly the same result. For this reason, the tolerance is
        # very relaxed here.
        torch.testing.assert_allclose(computed, expected, atol=1.0, rtol=1e-5)

    def test_batch_compute_deltas(self):
        specgram = torch.randn(2, 31, 2786)

        # Single then transform then batch
        expected = torchaudio.transforms.ComputeDeltas()(specgram).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.ComputeDeltas()(specgram.repeat(3, 1, 1, 1))

        # shape = (3, 2, 31, 2786)
        torch.testing.assert_allclose(computed, expected)

    def test_batch_mulaw(self):
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Single then transform then batch
        waveform_encoded = torchaudio.transforms.MuLawEncoding()(waveform)
        expected = waveform_encoded.unsqueeze(0).repeat(3, 1, 1)

        # Batch then transform
        waveform_batched = waveform.unsqueeze(0).repeat(3, 1, 1)
        computed = torchaudio.transforms.MuLawEncoding()(waveform_batched)

        # shape = (3, 2, 278756)
        torch.testing.assert_allclose(computed, expected)

        # Single then transform then batch
        waveform_decoded = torchaudio.transforms.MuLawDecoding()(waveform_encoded)
        expected = waveform_decoded.unsqueeze(0).repeat(3, 1, 1)

        # Batch then transform (decodes the batched encoding computed above)
        computed = torchaudio.transforms.MuLawDecoding()(computed)

        # shape = (3, 2, 278756)
        torch.testing.assert_allclose(computed, expected)

    def test_batch_spectrogram(self):
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Single then transform then batch
        expected = torchaudio.transforms.Spectrogram()(waveform).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.Spectrogram()(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected)

    def test_batch_melspectrogram(self):
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Single then transform then batch
        expected = torchaudio.transforms.MelSpectrogram()(waveform).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.MelSpectrogram()(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected)

    @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
    def test_batch_mfcc(self):
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)

        # Single then transform then batch
        expected = torchaudio.transforms.MFCC()(waveform).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.MFCC()(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected, atol=1e-5, rtol=1e-5)

    def test_batch_TimeStretch(self):
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        kwargs = {
            'n_fft': 2048,
            'hop_length': 512,
            'win_length': 2048,
            'window': torch.hann_window(2048),
            'center': True,
            'pad_mode': 'reflect',
            'normalized': True,
            'onesided': True,
        }
        rate = 2

        # NOTE(review): newer torch.stft versions expect an explicit
        # ``return_complex`` argument -- confirm against the pinned torch version.
        complex_specgrams = torch.stft(waveform, **kwargs)

        # Single then transform then batch
        expected = torchaudio.transforms.TimeStretch(
            fixed_rate=rate,
            n_freq=1025,
            hop_length=512,
        )(complex_specgrams).repeat(3, 1, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.TimeStretch(
            fixed_rate=rate,
            n_freq=1025,
            hop_length=512,
        )(complex_specgrams.repeat(3, 1, 1, 1, 1))

        torch.testing.assert_allclose(computed, expected, atol=1e-5, rtol=1e-5)

    def test_batch_Fade(self):
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100
        fade_in_len = 3000
        fade_out_len = 3000

        # Single then transform then batch
        expected = torchaudio.transforms.Fade(fade_in_len, fade_out_len)(waveform).repeat(3, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.Fade(fade_in_len, fade_out_len)(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected)

    def test_batch_Vol(self):
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Single then transform then batch
        expected = torchaudio.transforms.Vol(gain=1.1)(waveform).repeat(3, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.Vol(gain=1.1)(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected)
Vincent QB's avatar
Vincent QB committed
279
280
281
282


# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()