test_batch_consistency.py 10 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
"""Test numerical consistency among single input and batched input."""
import unittest

import torch
import torchaudio
import torchaudio.functional as F

import common_utils


11
12
def _test_batch_consistency(functional, tensor, *args, batch_size=1, atol=1e-8, rtol=1e-5, **kwargs):
    # run then batch the result
13
14
    torch.random.manual_seed(42)
    expected = functional(tensor.clone(), *args, **kwargs)
15
    expected = expected.repeat([batch_size] + [1] * expected.dim())
16

17
    # batch the input and run
18
    torch.random.manual_seed(42)
19
20
    pattern = [batch_size] + [1] * tensor.dim()
    computed = functional(tensor.repeat(pattern), *args, **kwargs)
21

22
    torch.testing.assert_allclose(computed, expected, rtol=rtol, atol=atol)
23
24


moto's avatar
moto committed
25
def _test_batch(functional, tensor, *args, atol=1e-8, rtol=1e-5, **kwargs):
    """Run the batch-consistency check with batch sizes 1 and 3, in that order.

    All positional and keyword arguments, including the tolerances, are
    forwarded unchanged to ``_test_batch_consistency``.
    """
    for batch_size in (1, 3):
        _test_batch_consistency(
            functional, tensor, *args, batch_size=batch_size, atol=atol, rtol=rtol, **kwargs)
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54


class TestFunctional(unittest.TestCase):
    """Batch-consistency tests for functions defined in the `functional` module."""

    def test_griffinlim(self):
        n_fft = 400
        win_length = 400
        hop_length = 200
        power = 2
        normalized = False
        n_iter = 32
        momentum = 0.99
        length = 1000
        window = torch.hann_window(win_length)
        specgram = torch.rand((1, 201, 6))
        # Arguments are passed positionally, in exactly this order; the
        # trailing 0 is forwarded as the last positional argument.
        positional = (window, n_fft, hop_length, win_length, power, normalized, n_iter, momentum, length, 0)
        _test_batch(F.griffinlim, specgram, *positional, atol=5e-5)

    def test_detect_pitch_frequency(self):
        asset_names = (
            'steam-train-whistle-daniel_simon.wav',  # 2ch 44100Hz
            # Files from https://www.mediacollege.com/audio/tone/download/
            '100Hz_44100Hz_16bit_05sec.wav',  # 1ch
            '440Hz_44100Hz_16bit_05sec.wav',  # 1ch
        )
        for asset_name in asset_names:
            waveform, sample_rate = torchaudio.load(common_utils.get_asset_path(asset_name))
            _test_batch(F.detect_pitch_frequency, waveform, sample_rate)

    def test_istft(self):
        # Three rows of five (real, imag) pairs: the first row is all
        # (4, 0), the other two rows are all zeros.
        dc_row = [[4., 0.]] * 5
        zero_row = [[0., 0.]] * 5
        stft = torch.tensor([dc_row, zero_row, zero_row])
        _test_batch(F.istft, stft, n_fft=4, length=4)

    def test_contrast(self):
        # Uniform samples shifted into [-0.5, 0.5).
        signal = torch.rand(2, 100) - 0.5
        _test_batch(F.contrast, signal, enhancement_amount=80.)

    def test_dcshift(self):
        # Uniform samples shifted into [-0.5, 0.5).
        signal = torch.rand(2, 100) - 0.5
        _test_batch(F.dcshift, signal, shift=0.5, limiter_gain=0.05)

    def test_sliding_window_cmn(self):
        signal = torch.randn(2, 1024) - 0.5
        # Cover every combination of the two boolean options.
        for center in (True, False):
            for norm_vars in (True, False):
                _test_batch(F.sliding_window_cmn, signal, center=center, norm_vars=norm_vars)
81

82
83
84
85
86
87
88
89
90
91
92
93

class TestTransforms(unittest.TestCase):
    """Batch-consistency tests for classes defined in the `transforms` module.

    Every test follows the same recipe: apply a transform to a single input
    and tile the output three times (the reference), then apply a freshly
    constructed instance of the same transform to the input tiled three
    times (the candidate), and compare the two results.
    """

    def test_batch_AmplitudeToDB(self):
        spec = torch.rand((6, 201))

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.AmplitudeToDB()(spec)
        expected = single_out.repeat(3, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = spec.repeat(3, 1, 1)
        computed = torchaudio.transforms.AmplitudeToDB()(batched_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_Resample(self):
        waveform = torch.randn(2, 2786)

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.Resample()(waveform)
        expected = single_out.repeat(3, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = waveform.repeat(3, 1, 1)
        computed = torchaudio.transforms.Resample()(batched_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_MelScale(self):
        specgram = torch.randn(2, 31, 2786)

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.MelScale()(specgram)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = specgram.repeat(3, 1, 1, 1)
        computed = torchaudio.transforms.MelScale()(batched_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_InverseMelScale(self):
        n_mels = 32
        n_stft = 5
        mel_spec = torch.randn(2, n_mels, 32) ** 2

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.InverseMelScale(n_stft, n_mels)(mel_spec)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: tile the input to (3, 2, n_mels, 32), then transform.
        batched_in = mel_spec.repeat(3, 1, 1, 1)
        computed = torchaudio.transforms.InverseMelScale(n_stft, n_mels)(batched_in)

        # InverseMelScale runs SGD starting from randomly initialized values,
        # so the two runs do not produce exactly the same result. The
        # absolute tolerance is therefore deliberately very loose.
        torch.testing.assert_allclose(computed, expected, atol=1.0, rtol=1e-5)

    def test_batch_compute_deltas(self):
        specgram = torch.randn(2, 31, 2786)

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.ComputeDeltas()(specgram)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = specgram.repeat(3, 1, 1, 1)
        computed = torchaudio.transforms.ComputeDeltas()(batched_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_mulaw(self):
        asset_path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(asset_path)  # (2, 278756), 44100

        # --- Encoding ---
        # Reference: encode the single waveform, then add a batch dimension
        # and tile it.
        encoded_single = torchaudio.transforms.MuLawEncoding()(waveform)
        expected = encoded_single.unsqueeze(0).repeat(3, 1, 1)

        # Candidate: add a batch dimension and tile first, then encode.
        batched_waveform = waveform.unsqueeze(0).repeat(3, 1, 1)
        computed = torchaudio.transforms.MuLawEncoding()(batched_waveform)

        torch.testing.assert_allclose(computed, expected)

        # --- Decoding ---
        # Reference: decode the single encoded waveform, then batch it.
        decoded_single = torchaudio.transforms.MuLawDecoding()(encoded_single)
        expected = decoded_single.unsqueeze(0).repeat(3, 1, 1)

        # Candidate: decode the batched encoding computed above.
        computed = torchaudio.transforms.MuLawDecoding()(computed)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_spectrogram(self):
        asset_path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(asset_path)  # (2, 278756), 44100

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.Spectrogram()(waveform)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = waveform.repeat(3, 1, 1)
        computed = torchaudio.transforms.Spectrogram()(batched_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_melspectrogram(self):
        asset_path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(asset_path)  # (2, 278756), 44100

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.MelSpectrogram()(waveform)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = waveform.repeat(3, 1, 1)
        computed = torchaudio.transforms.MelSpectrogram()(batched_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_mfcc(self):
        asset_path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(asset_path)

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.MFCC()(waveform)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = waveform.repeat(3, 1, 1)
        computed = torchaudio.transforms.MFCC()(batched_in)

        torch.testing.assert_allclose(computed, expected, atol=1e-5, rtol=1e-5)

    def test_batch_TimeStretch(self):
        asset_path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(asset_path)  # (2, 278756), 44100

        stft_options = dict(
            n_fft=2048,
            hop_length=512,
            win_length=2048,
            window=torch.hann_window(2048),
            center=True,
            pad_mode='reflect',
            normalized=True,
            onesided=True,
        )
        rate = 2

        complex_specgrams = torch.stft(waveform, **stft_options)

        def make_stretch():
            # A fresh transform instance is built for each of the two paths.
            return torchaudio.transforms.TimeStretch(
                fixed_rate=rate,
                n_freq=1025,
                hop_length=512,
            )

        # Reference: transform the single input, then tile the output.
        single_out = make_stretch()(complex_specgrams)
        expected = single_out.repeat(3, 1, 1, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = complex_specgrams.repeat(3, 1, 1, 1, 1)
        computed = make_stretch()(batched_in)

        torch.testing.assert_allclose(computed, expected, atol=1e-5, rtol=1e-5)

    def test_batch_Fade(self):
        asset_path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(asset_path)  # (2, 278756), 44100
        fade_in_len = 3000
        fade_out_len = 3000

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.Fade(fade_in_len, fade_out_len)(waveform)
        expected = single_out.repeat(3, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = waveform.repeat(3, 1, 1)
        computed = torchaudio.transforms.Fade(fade_in_len, fade_out_len)(batched_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_Vol(self):
        asset_path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(asset_path)  # (2, 278756), 44100

        # Reference: transform the single input, then tile the output.
        single_out = torchaudio.transforms.Vol(gain=1.1)(waveform)
        expected = single_out.repeat(3, 1, 1)

        # Candidate: tile the input, then transform.
        batched_in = waveform.repeat(3, 1, 1)
        computed = torchaudio.transforms.Vol(gain=1.1)(batched_in)

        torch.testing.assert_allclose(computed, expected)
Vincent QB's avatar
Vincent QB committed
263
264
265
266


# Run the unittest runner when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()