test_batch_consistency.py 9.62 KB
Newer Older
1
2
3
4
5
6
7
8
"""Test numerical consistency among single input and batched input."""
import unittest

import torch
import torchaudio
import torchaudio.functional as F

import common_utils
9
from common_utils import AudioBackendScope, BACKENDS
10
11


12
13
def _test_batch_consistency(functional, tensor, *args, batch_size=1, atol=1e-8, rtol=1e-5, **kwargs):
    """Check that ``functional`` gives the same result on single and batched input.

    ``functional`` is first applied to a copy of ``tensor`` and its output is
    tiled ``batch_size`` times along a new leading dimension. It is then
    applied to ``tensor`` tiled the same way, and the two results are compared.

    Args:
        functional: Callable invoked as ``functional(tensor, *args, **kwargs)``.
        tensor: Input tensor for the un-batched call.
        *args: Extra positional arguments forwarded to ``functional``.
        batch_size: Number of copies stacked along the new leading dimension.
        atol: Absolute tolerance of the comparison.
        rtol: Relative tolerance of the comparison.
        **kwargs: Extra keyword arguments forwarded to ``functional``.

    Raises:
        AssertionError: If the batched output differs from the tiled
            un-batched output beyond the given tolerances.
    """
    # Run on the single input, then batch the result.
    # The seed is reset before each call so both start from the same RNG state.
    torch.random.manual_seed(42)
    expected = functional(tensor.clone(), *args, **kwargs)
    expected = expected.repeat([batch_size] + [1] * expected.dim())

    # Batch the input, then run.
    torch.random.manual_seed(42)
    pattern = [batch_size] + [1] * tensor.dim()
    computed = functional(tensor.repeat(pattern), *args, **kwargs)

    # `torch.testing.assert_allclose` is deprecated since PyTorch 1.12 in
    # favour of `assert_close`. Prefer the replacement when it exists, passing
    # the same flags the deprecated wrapper forwards so the check is
    # unchanged; fall back to the legacy function on older PyTorch.
    assert_close = getattr(torch.testing, 'assert_close', None)
    if assert_close is None:
        torch.testing.assert_allclose(computed, expected, rtol=rtol, atol=atol)
    else:
        assert_close(
            computed,
            expected,
            rtol=rtol,
            atol=atol,
            equal_nan=True,
            check_device=True,
            check_dtype=False,
            check_stride=False,
        )
24
25


moto's avatar
moto committed
26
def _test_batch(functional, tensor, *args, atol=1e-8, rtol=1e-5, **kwargs):
    """Run the batch-consistency check with a batch of one, then a batch of three.

    Every argument is forwarded unchanged to ``_test_batch_consistency``;
    only ``batch_size`` varies between the two runs.
    """
    for n_copies in (1, 3):
        _test_batch_consistency(functional, tensor, *args, batch_size=n_copies, atol=atol, rtol=rtol, **kwargs)
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55


class TestFunctional(unittest.TestCase):
    """Batch-consistency tests for functions defined in `functional` module"""

    def test_griffinlim(self):
        """`F.griffinlim` output is consistent between single and batched input."""
        win_length = 400
        spectrogram = torch.rand((1, 201, 6))
        # Positional arguments passed to `F.griffinlim` after the input tensor.
        # NOTE(review): the trailing 0 is presumably the random-init flag -
        # confirm against the `F.griffinlim` signature.
        extra_args = (
            torch.hann_window(win_length),  # window
            400,  # n_fft
            200,  # hop
            win_length,
            2,  # power
            False,  # normalize
            32,  # n_iter
            0.99,  # momentum
            1000,  # length
            0,
        )
        _test_batch(F.griffinlim, spectrogram, *extra_args, atol=5e-5)

    def test_detect_pitch_frequency(self):
        """`F.detect_pitch_frequency` is batch consistent on each audio asset."""
        asset_names = (
            'steam-train-whistle-daniel_simon.wav',  # 2ch 44100Hz
            # Files from https://www.mediacollege.com/audio/tone/download/
            '100Hz_44100Hz_16bit_05sec.wav',  # 1ch
            '440Hz_44100Hz_16bit_05sec.wav',  # 1ch
        )
        for asset_name in asset_names:
            asset_path = common_utils.get_asset_path(asset_name)
            waveform, sample_rate = torchaudio.load(asset_path)
            _test_batch(F.detect_pitch_frequency, waveform, sample_rate)

    def test_istft(self):
        """`F.istft` is batch consistent on a small hand-built input."""
        # (3, 5, 2) tensor: the [0, :, 0] entries are 4, everything else is 0.
        stft = torch.zeros(3, 5, 2)
        stft[0, :, 0] = 4.
        _test_batch(F.istft, stft, n_fft=4, length=4)

    def test_contrast(self):
        """`F.contrast` is batch consistent on random values in [-0.5, 0.5)."""
        centered_noise = torch.rand(2, 100) - 0.5
        _test_batch(F.contrast, centered_noise, enhancement_amount=80.)

72
73
74
75
76
77
78
79
80
81
82
83

class TestTransforms(unittest.TestCase):
    """Batch-consistency tests for classes defined in `transforms` module"""

    def test_batch_AmplitudeToDB(self):
        """AmplitudeToDB: transform-then-stack equals stack-then-transform."""
        transforms = torchaudio.transforms
        spec = torch.rand((6, 201))

        # Reference: transform the single input, then stack three copies.
        single_out = transforms.AmplitudeToDB()(spec)
        expected = single_out.repeat(3, 1, 1)

        # Candidate: stack three copies into (3, 6, 201), then transform.
        batch_in = spec.repeat(3, 1, 1)
        computed = transforms.AmplitudeToDB()(batch_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_Resample(self):
        """Resample: transform-then-stack equals stack-then-transform."""
        transforms = torchaudio.transforms
        waveform = torch.randn(2, 2786)

        # Reference: transform the single input, then stack three copies.
        single_out = transforms.Resample()(waveform)
        expected = single_out.repeat(3, 1, 1)

        # Candidate: stack three copies into (3, 2, 2786), then transform.
        batch_in = waveform.repeat(3, 1, 1)
        computed = transforms.Resample()(batch_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_MelScale(self):
        """MelScale: transform-then-stack equals stack-then-transform."""
        transforms = torchaudio.transforms
        specgram = torch.randn(2, 31, 2786)

        # Reference: transform the single input, then stack three copies.
        single_out = transforms.MelScale()(specgram)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: stack three copies into (3, 2, 31, 2786), then transform.
        batch_in = specgram.repeat(3, 1, 1, 1)
        computed = transforms.MelScale()(batch_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_InverseMelScale(self):
        """InverseMelScale: transform-then-stack roughly equals stack-then-transform."""
        transforms = torchaudio.transforms
        n_mels, n_stft = 32, 5
        mel_spec = torch.randn(2, n_mels, 32) ** 2

        # Reference: transform the single input, then stack three copies.
        single_out = transforms.InverseMelScale(n_stft, n_mels)(mel_spec)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: stack three copies into (3, 2, n_mels, 32), then transform.
        batch_in = mel_spec.repeat(3, 1, 1, 1)
        computed = transforms.InverseMelScale(n_stft, n_mels)(batch_in)

        # InverseMelScale runs SGD on randomly initialized values, so the two
        # runs do not yield exactly the same result. For this reason the
        # absolute tolerance is very relaxed here.
        torch.testing.assert_allclose(computed, expected, atol=1.0, rtol=1e-5)

    def test_batch_compute_deltas(self):
        """ComputeDeltas: transform-then-stack equals stack-then-transform."""
        transforms = torchaudio.transforms
        specgram = torch.randn(2, 31, 2786)

        # Reference: transform the single input, then stack three copies.
        single_out = transforms.ComputeDeltas()(specgram)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: stack three copies into (3, 2, 31, 2786), then transform.
        batch_in = specgram.repeat(3, 1, 1, 1)
        computed = transforms.ComputeDeltas()(batch_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_mulaw(self):
        """MuLawEncoding and MuLawDecoding are each consistent on batched input."""
        transforms = torchaudio.transforms
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # --- Encoding ---
        # Reference: encode the single waveform, then add a batch of three.
        encoded_single = transforms.MuLawEncoding()(waveform)
        expected = encoded_single.unsqueeze(0).repeat(3, 1, 1)

        # Candidate: build the (3, 2, 278756) batch first, then encode.
        batched_waveform = waveform.unsqueeze(0).repeat(3, 1, 1)
        encoded_batch = transforms.MuLawEncoding()(batched_waveform)

        torch.testing.assert_allclose(encoded_batch, expected)

        # --- Decoding ---
        # Reference: decode the single encoded waveform, then add a batch of three.
        decoded_single = transforms.MuLawDecoding()(encoded_single)
        expected = decoded_single.unsqueeze(0).repeat(3, 1, 1)

        # Candidate: decode the batch produced by the encoding step above.
        decoded_batch = transforms.MuLawDecoding()(encoded_batch)

        torch.testing.assert_allclose(decoded_batch, expected)

    def test_batch_spectrogram(self):
        """Spectrogram: transform-then-stack equals stack-then-transform."""
        transforms = torchaudio.transforms
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Reference: transform the single waveform, then stack three copies.
        single_out = transforms.Spectrogram()(waveform)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: stack three copies of the waveform, then transform.
        batch_in = waveform.repeat(3, 1, 1)
        computed = transforms.Spectrogram()(batch_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_melspectrogram(self):
        """MelSpectrogram: transform-then-stack equals stack-then-transform."""
        transforms = torchaudio.transforms
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Reference: transform the single waveform, then stack three copies.
        single_out = transforms.MelSpectrogram()(waveform)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: stack three copies of the waveform, then transform.
        batch_in = waveform.repeat(3, 1, 1)
        computed = transforms.MelSpectrogram()(batch_in)

        torch.testing.assert_allclose(computed, expected)

    @unittest.skipIf("sox" not in BACKENDS, "sox not available")
    @AudioBackendScope("sox")
    def test_batch_mfcc(self):
        """MFCC: transform-then-stack equals stack-then-transform (mp3 asset, sox backend)."""
        transforms = torchaudio.transforms
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.mp3')
        waveform, _ = torchaudio.load(test_filepath)

        # Reference: transform the single waveform, then stack three copies.
        single_out = transforms.MFCC()(waveform)
        expected = single_out.repeat(3, 1, 1, 1)

        # Candidate: stack three copies of the waveform, then transform.
        batch_in = waveform.repeat(3, 1, 1)
        computed = transforms.MFCC()(batch_in)

        torch.testing.assert_allclose(computed, expected, atol=1e-5, rtol=1e-5)

    def test_batch_TimeStretch(self):
        """TimeStretch: transform-then-stack equals stack-then-transform."""
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Arguments for the STFT that produces the input spectrograms.
        stft_kwargs = dict(
            n_fft=2048,
            hop_length=512,
            win_length=2048,
            window=torch.hann_window(2048),
            center=True,
            pad_mode='reflect',
            normalized=True,
            onesided=True,
        )
        complex_specgrams = torch.stft(waveform, **stft_kwargs)

        # Both transform instances are built with the same settings.
        stretch_kwargs = dict(fixed_rate=2, n_freq=1025, hop_length=512)

        # Reference: stretch the single input, then stack three copies.
        single_out = torchaudio.transforms.TimeStretch(**stretch_kwargs)(complex_specgrams)
        expected = single_out.repeat(3, 1, 1, 1, 1)

        # Candidate: stack three copies of the input, then stretch.
        batch_in = complex_specgrams.repeat(3, 1, 1, 1, 1)
        computed = torchaudio.transforms.TimeStretch(**stretch_kwargs)(batch_in)

        torch.testing.assert_allclose(computed, expected, atol=1e-5, rtol=1e-5)

    def test_batch_Fade(self):
        """Fade: transform-then-stack equals stack-then-transform."""
        transforms = torchaudio.transforms
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100
        fade_in_len = fade_out_len = 3000

        # Reference: fade the single waveform, then stack three copies.
        single_out = transforms.Fade(fade_in_len, fade_out_len)(waveform)
        expected = single_out.repeat(3, 1, 1)

        # Candidate: stack three copies of the waveform, then fade.
        batch_in = waveform.repeat(3, 1, 1)
        computed = transforms.Fade(fade_in_len, fade_out_len)(batch_in)

        torch.testing.assert_allclose(computed, expected)

    def test_batch_Vol(self):
        """Vol: transform-then-stack equals stack-then-transform."""
        transforms = torchaudio.transforms
        test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform, _ = torchaudio.load(test_filepath)  # (2, 278756), 44100

        # Reference: apply the gain to the single waveform, then stack three copies.
        single_out = transforms.Vol(gain=1.1)(waveform)
        expected = single_out.repeat(3, 1, 1)

        # Candidate: stack three copies of the waveform, then apply the gain.
        batch_in = waveform.repeat(3, 1, 1)
        computed = transforms.Vol(gain=1.1)(batch_in)

        torch.testing.assert_allclose(computed, expected)
Vincent QB's avatar
Vincent QB committed
255
256
257
258


if __name__ == '__main__':
    unittest.main()