"""Test numerical consistency among single input and batched input."""
import unittest

import torch
import torchaudio
import torchaudio.functional as F

import common_utils
from common_utils import AudioBackendScope, BACKENDS


def _test_batch_consistency(functional, tensor, *args, batch_size=1, atol=1e-8, rtol=1e-5, **kwargs):
    """Assert that ``functional`` gives the same result on single and batched input.

    ``functional`` is first evaluated on ``tensor`` and its result is tiled
    ``batch_size`` times along a new leading dimension. It is then evaluated
    on ``tensor`` tiled the same way, and the two results are compared.

    Args:
        functional: Callable invoked as ``functional(tensor, *args, **kwargs)``;
            its return value must support ``.repeat`` and ``.dim``.
        tensor: Un-batched input tensor.
        *args: Extra positional arguments forwarded to ``functional``.
        batch_size: Number of copies stacked along the new leading dimension.
        atol: Absolute tolerance used for the comparison.
        rtol: Relative tolerance used for the comparison.
        **kwargs: Extra keyword arguments forwarded to ``functional``.

    Raises:
        AssertionError: If the batched result differs from the tiled
            single-input result beyond the given tolerances.
    """
    # Run on the single input, then batch the result.
    # The clone presumably protects ``tensor`` from in-place changes made by
    # ``functional`` so that the second run sees the same data.
    torch.random.manual_seed(42)
    expected = functional(tensor.clone(), *args, **kwargs)
    expected = expected.repeat([batch_size] + [1] * expected.dim())

    # Batch the input, then run. The RNG is re-seeded with the same value so
    # both evaluations start from the same random state.
    torch.random.manual_seed(42)
    pattern = [batch_size] + [1] * tensor.dim()
    computed = functional(tensor.repeat(pattern), *args, **kwargs)

    torch.testing.assert_allclose(computed, expected, rtol=rtol, atol=atol)


def _test_batch(functional, tensor, *args, atol=1e-8, rtol=1e-5, **kwargs):
    """Check batch consistency of ``functional`` for batch sizes 1 and 3.

    Args:
        functional: Callable under test, invoked as
            ``functional(tensor, *args, **kwargs)``.
        tensor: Un-batched input tensor.
        *args: Extra positional arguments forwarded to ``functional``.
        atol: Absolute tolerance forwarded to the comparison.
        rtol: Relative tolerance forwarded to the comparison.
        **kwargs: Extra keyword arguments forwarded to ``functional``.
    """
    _test_batch_consistency(functional, tensor, *args, batch_size=1, atol=atol, rtol=rtol, **kwargs)
    _test_batch_consistency(functional, tensor, *args, batch_size=3, atol=atol, rtol=rtol, **kwargs)


class TestFunctional(unittest.TestCase):
    """Test functions defined in `functional` module"""

    def test_griffinlim(self):
        """Batched ``F.griffinlim`` agrees with the single-input result."""
        n_fft = 400
        ws = 400
        hop = 200
        window = torch.hann_window(ws)
        power = 2
        normalize = False
        momentum = 0.99
        n_iter = 32
        length = 1000
        tensor = torch.rand((1, 201, 6))
        # The trailing positional ``0`` is presumably the random-init flag of
        # ``griffinlim`` -- TODO confirm against the torchaudio version in use.
        # ``atol`` is loosened relative to the helper's default of 1e-8.
        _test_batch(
            F.griffinlim, tensor, window, n_fft, hop, ws, power, normalize, n_iter, momentum, length, 0, atol=5e-5
        )

    def test_detect_pitch_frequency(self):
        """Batched ``F.detect_pitch_frequency`` agrees with the single-input result."""
        filenames = [
            'steam-train-whistle-daniel_simon.wav',  # 2ch 44100Hz
            # Files from https://www.mediacollege.com/audio/tone/download/
            '100Hz_44100Hz_16bit_05sec.wav',  # 1ch
            '440Hz_44100Hz_16bit_05sec.wav',  # 1ch
        ]
        for filename in filenames:
            # subTest reports which asset failed and lets the remaining
            # assets run instead of aborting on the first failure.
            with self.subTest(filename=filename):
                filepath = common_utils.get_asset_path(filename)
                waveform, sample_rate = torchaudio.load(filepath)
                _test_batch(F.detect_pitch_frequency, waveform, sample_rate)

    def test_istft(self):
        """Batched ``F.istft`` agrees with the single-input result."""
        # Literal of shape (3, 5, 2); the last dimension presumably holds the
        # (real, imaginary) parts of each STFT bin.
        stft = torch.tensor([
            [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]],
            [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]],
            [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]]
        ])
        _test_batch(F.istft, stft, n_fft=4, length=4)


class TestTransforms(unittest.TestCase):
    """Test suite for classes defined in `transforms` module

    Every test applies a transform to a single input and tiles the result
    three times along a new leading dimension, then applies the transform to
    the input tiled the same way, and asserts that both results agree.
    """

    def test_batch_AmplitudeToDB(self):
        """``AmplitudeToDB`` is consistent between single and batched input."""
        spec = torch.rand((6, 201))

        # Single then transform then batch
        expected = torchaudio.transforms.AmplitudeToDB()(spec).repeat(3, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.AmplitudeToDB()(spec.repeat(3, 1, 1))

        torch.testing.assert_allclose(computed, expected)

    def test_batch_Resample(self):
        """``Resample`` is consistent between single and batched input."""
        waveform = torch.randn(2, 2786)

        # Single then transform then batch
        expected = torchaudio.transforms.Resample()(waveform).repeat(3, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.Resample()(waveform.repeat(3, 1, 1))

        torch.testing.assert_allclose(computed, expected)

    def test_batch_MelScale(self):
        """``MelScale`` is consistent between single and batched input."""
        specgram = torch.randn(2, 31, 2786)

        # Single then transform then batch
        expected = torchaudio.transforms.MelScale()(specgram).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.MelScale()(specgram.repeat(3, 1, 1, 1))

        torch.testing.assert_allclose(computed, expected)

    def test_batch_InverseMelScale(self):
        """``InverseMelScale`` is approximately consistent between single and batched input."""
        n_mels = 32
        n_stft = 5
        mel_spec = torch.randn(2, n_mels, 32) ** 2

        # Single then transform then batch
        expected = torchaudio.transforms.InverseMelScale(n_stft, n_mels)(mel_spec).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.InverseMelScale(n_stft, n_mels)(mel_spec.repeat(3, 1, 1, 1))

        # InverseMelScale runs SGD on randomly initialized values, so the two
        # runs do not yield exactly the same result. For this reason, the
        # tolerance is very relaxed here.
        torch.testing.assert_allclose(computed, expected, atol=1.0, rtol=1e-5)

    def test_batch_compute_deltas(self):
        """``ComputeDeltas`` is consistent between single and batched input."""
        specgram = torch.randn(2, 31, 2786)

        # Single then transform then batch
        expected = torchaudio.transforms.ComputeDeltas()(specgram).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.ComputeDeltas()(specgram.repeat(3, 1, 1, 1))

        torch.testing.assert_allclose(computed, expected)

    def test_batch_mulaw(self):
        """``MuLawEncoding`` and ``MuLawDecoding`` are consistent between single and batched input."""
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # --- Encoding ---
        # Single then transform then batch
        waveform_encoded = torchaudio.transforms.MuLawEncoding()(waveform)
        expected = waveform_encoded.unsqueeze(0).repeat(3, 1, 1)

        # Batch then transform
        waveform_batched = waveform.unsqueeze(0).repeat(3, 1, 1)
        computed = torchaudio.transforms.MuLawEncoding()(waveform_batched)

        torch.testing.assert_allclose(computed, expected)

        # --- Decoding ---
        # Single then transform then batch
        waveform_decoded = torchaudio.transforms.MuLawDecoding()(waveform_encoded)
        expected = waveform_decoded.unsqueeze(0).repeat(3, 1, 1)

        # Batch then transform: decode the batched encoding computed above.
        computed = torchaudio.transforms.MuLawDecoding()(computed)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_spectrogram(self):
        """``Spectrogram`` is consistent between single and batched input."""
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Single then transform then batch
        expected = torchaudio.transforms.Spectrogram()(waveform).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.Spectrogram()(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected)

    def test_batch_melspectrogram(self):
        """``MelSpectrogram`` is consistent between single and batched input."""
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Single then transform then batch
        expected = torchaudio.transforms.MelSpectrogram()(waveform).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.MelSpectrogram()(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected)

    @unittest.skipIf("sox" not in BACKENDS, "sox not available")
    @AudioBackendScope("sox")
    def test_batch_mfcc(self):
        """``MFCC`` is consistent between single and batched input (mp3 asset, sox backend)."""
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.mp3')
        waveform, _ = torchaudio.load(test_filepath)

        # Single then transform then batch
        expected = torchaudio.transforms.MFCC()(waveform).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.MFCC()(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected, atol=1e-5, rtol=1e-5)

    def test_batch_TimeStretch(self):
        """``TimeStretch`` is consistent between single and batched input."""
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        kwargs = {
            'n_fft': 2048,
            'hop_length': 512,
            'win_length': 2048,
            'window': torch.hann_window(2048),
            'center': True,
            'pad_mode': 'reflect',
            'normalized': True,
            'onesided': True,
        }
        rate = 2

        complex_specgrams = torch.stft(waveform, **kwargs)

        # Single then transform then batch
        expected = torchaudio.transforms.TimeStretch(
            fixed_rate=rate,
            n_freq=1025,
            hop_length=512,
        )(complex_specgrams).repeat(3, 1, 1, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.TimeStretch(
            fixed_rate=rate,
            n_freq=1025,
            hop_length=512,
        )(complex_specgrams.repeat(3, 1, 1, 1, 1))

        torch.testing.assert_allclose(computed, expected, atol=1e-5, rtol=1e-5)

    def test_batch_Fade(self):
        """``Fade`` is consistent between single and batched input."""
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100
        fade_in_len = 3000
        fade_out_len = 3000

        # Single then transform then batch
        expected = torchaudio.transforms.Fade(fade_in_len, fade_out_len)(waveform).repeat(3, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.Fade(fade_in_len, fade_out_len)(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected)

    def test_batch_Vol(self):
        """``Vol`` is consistent between single and batched input."""
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Single then transform then batch
        expected = torchaudio.transforms.Vol(gain=1.1)(waveform).repeat(3, 1, 1)

        # Batch then transform
        computed = torchaudio.transforms.Vol(gain=1.1)(waveform.repeat(3, 1, 1))
        torch.testing.assert_allclose(computed, expected)


# Run every test case in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()