"""Test suites for numerical compatibility with librosa"""
import os
import unittest

import torch
import torchaudio
import torchaudio.functional as F
from torchaudio._internal.module_utils import is_module_available
from parameterized import parameterized, param

LIBROSA_AVAILABLE = is_module_available('librosa')

if LIBROSA_AVAILABLE:
    import librosa
    import scipy

from torchaudio_unittest import common_utils


def _load_audio_asset(*asset_paths, **kwargs):
    file_path = common_utils.get_asset_path(*asset_paths)
    sound, sample_rate = torchaudio.load(file_path, **kwargs)
    return sound, sample_rate


@unittest.skipIf(not LIBROSA_AVAILABLE, "Librosa not available")
class TestTransforms(common_utils.TorchaudioTestCase):
    """Test suite for functions in `transforms` module."""

    @parameterized.expand([
        param(n_fft=400, hop_length=200, power=2.0),
        param(n_fft=600, hop_length=100, power=2.0),
        param(n_fft=400, hop_length=200, power=3.0),
        param(n_fft=200, hop_length=50, power=2.0),
    ])
    def test_spectrogram(self, n_fft, hop_length, power):
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
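        # librosa.core.spectrum._spectrogram is a private helper, but it is the
        # backend librosa's STFT-based features use to compute |STFT|**power, so
        # it is the most direct point of comparison for transforms.Spectrogram.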
        spect_transform = torchaudio.transforms.Spectrogram(
            n_fft=n_fft, hop_length=hop_length, power=power)
        out_librosa, _ = librosa.core.spectrum._spectrogram(
            y=sound_librosa, n_fft=n_fft, hop_length=hop_length, power=power)

        out_torch = spect_transform(sound).squeeze().cpu()
        self.assertEqual(out_torch, torch.from_numpy(out_librosa), atol=1e-5, rtol=1e-5)

    @parameterized.expand([
        param(norm=norm, mel_scale=mel_scale, **p.kwargs)
        for p in [
            param(n_fft=400, hop_length=200, n_mels=128),
            param(n_fft=600, hop_length=100, n_mels=128),
            param(n_fft=200, hop_length=50, n_mels=128),
        ]
        for norm in [None, 'slaney']
        for mel_scale in ['htk', 'slaney']
    ])
    def test_mel_spectrogram(self, n_fft, hop_length, n_mels, norm, mel_scale):
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
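        # torchaudio's mel_scale ('htk' or 'slaney') maps onto librosa's boolean
        # `htk` flag below, and `norm` (None or 'slaney') has the same meaning
        # in both libraries.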
        melspect_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate, window_fn=torch.hann_window,
            hop_length=hop_length, n_mels=n_mels, n_fft=n_fft, norm=norm, mel_scale=mel_scale)
        librosa_mel = librosa.feature.melspectrogram(
            y=sound_librosa, sr=sample_rate, n_fft=n_fft,
            hop_length=hop_length, n_mels=n_mels, htk=mel_scale == "htk", norm=norm)
        librosa_mel_tensor = torch.from_numpy(librosa_mel)
        torch_mel = melspect_transform(sound).squeeze().cpu()
        self.assertEqual(
            torch_mel.type(librosa_mel_tensor.dtype), librosa_mel_tensor, atol=5e-3, rtol=1e-5)

    @parameterized.expand([
        param(norm=norm, mel_scale=mel_scale, **p.kwargs)
        for p in [
            param(n_fft=400, hop_length=200, power=2.0, n_mels=128),
            param(n_fft=600, hop_length=100, power=2.0, n_mels=128),
            param(n_fft=400, hop_length=200, power=3.0, n_mels=128),
            # NOTE: Test passes offline, but fails on TravisCI (and CircleCI), see #372.
            param(n_fft=200, hop_length=50, power=2.0, n_mels=128, skip_ci=True),
        ]
        for norm in [None, 'slaney']
        for mel_scale in ['htk', 'slaney']
    ])
    def test_s2db(self, n_fft, hop_length, power, n_mels, norm, mel_scale, skip_ci=False):
        if skip_ci and 'CI' in os.environ:
            self.skipTest('Test is known to fail on CI')
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
        spect_transform = torchaudio.transforms.Spectrogram(
            n_fft=n_fft, hop_length=hop_length, power=power)
        out_librosa, _ = librosa.core.spectrum._spectrogram(
            y=sound_librosa, n_fft=n_fft, hop_length=hop_length, power=power)
        melspect_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate, window_fn=torch.hann_window,
            hop_length=hop_length, n_mels=n_mels, n_fft=n_fft, norm=norm, mel_scale=mel_scale)
        librosa_mel = librosa.feature.melspectrogram(
            y=sound_librosa, sr=sample_rate, n_fft=n_fft,
            hop_length=hop_length, n_mels=n_mels, htk=mel_scale == "htk", norm=norm)

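        # AmplitudeToDB('power', 80.) computes 10 * log10(x) and clamps the result
        # to be within 80 dB of the peak, mirroring librosa's power_to_db defaults
        # (ref=1.0, top_db=80.0).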
        power_to_db_transform = torchaudio.transforms.AmplitudeToDB('power', 80.)
        power_to_db_torch = power_to_db_transform(spect_transform(sound)).squeeze().cpu()
        power_to_db_librosa = librosa.core.spectrum.power_to_db(out_librosa)
        self.assertEqual(power_to_db_torch, torch.from_numpy(power_to_db_librosa), atol=5e-3, rtol=1e-5)

        mag_to_db_transform = torchaudio.transforms.AmplitudeToDB('magnitude', 80.)
        mag_to_db_torch = mag_to_db_transform(torch.abs(sound)).squeeze().cpu()
        mag_to_db_librosa = librosa.core.spectrum.amplitude_to_db(sound_librosa)
        self.assertEqual(mag_to_db_torch, torch.from_numpy(mag_to_db_librosa), atol=5e-3, rtol=1e-5)

        power_to_db_torch = power_to_db_transform(melspect_transform(sound)).squeeze().cpu()
        db_librosa = librosa.core.spectrum.power_to_db(librosa_mel)
        db_librosa_tensor = torch.from_numpy(db_librosa)
        self.assertEqual(
            power_to_db_torch.type(db_librosa_tensor.dtype), db_librosa_tensor, atol=5e-3, rtol=1e-5)

    @parameterized.expand([
        param(n_fft=400, hop_length=200, n_mels=128, n_mfcc=40),
        param(n_fft=600, hop_length=100, n_mels=128, n_mfcc=20),
        param(n_fft=200, hop_length=50, n_mels=128, n_mfcc=50),
    ])
    def test_mfcc(self, n_fft, hop_length, n_mels, n_mfcc):
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
        librosa_mel = librosa.feature.melspectrogram(
            y=sound_librosa, sr=sample_rate, n_fft=n_fft,
            hop_length=hop_length, n_mels=n_mels, htk=True, norm=None)
        db_librosa = librosa.core.spectrum.power_to_db(librosa_mel)

        # librosa.feature.mfcc doesn't pass these kwargs through properly, since some
        # of the kwargs for melspectrogram and mfcc share the same names. We instead
        # follow the function body in
        # https://librosa.github.io/librosa/_modules/librosa/feature/spectral.html#melspectrogram
        # to mirror this function call with the correct args:
        #
        # librosa_mfcc = librosa.feature.mfcc(
        #     y=sound_librosa, sr=sample_rate, n_mfcc = n_mfcc,
        #     hop_length=hop_length, n_fft=n_fft, htk=True, norm=None, n_mels=n_mels)
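        #
        # i.e. apply the ortho-normalized type-II DCT to the dB-scaled mel
        # spectrogram and keep the first n_mfcc coefficients.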

        librosa_mfcc = scipy.fftpack.dct(db_librosa, axis=0, type=2, norm='ortho')[:n_mfcc]
        librosa_mfcc_tensor = torch.from_numpy(librosa_mfcc)

        melkwargs = {'hop_length': hop_length, 'n_fft': n_fft}
        mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=sample_rate, n_mfcc=n_mfcc, norm='ortho', melkwargs=melkwargs)
        torch_mfcc = mfcc_transform(sound).squeeze().cpu()

        self.assertEqual(
            torch_mfcc.type(librosa_mfcc_tensor.dtype), librosa_mfcc_tensor, atol=5e-3, rtol=1e-5)

    @parameterized.expand([
        param(n_fft=400, hop_length=200),
        param(n_fft=600, hop_length=100),
        param(n_fft=200, hop_length=50),
    ])
    def test_spectral_centroid(self, n_fft, hop_length):
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
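        # librosa.feature.spectral_centroid returns an array of shape
        # (1, n_frames); the [0] indexing below aligns it with torchaudio's
        # single-channel output.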
        spect_centroid = torchaudio.transforms.SpectralCentroid(
            sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length)
        out_torch = spect_centroid(sound).squeeze().cpu()

        out_librosa = librosa.feature.spectral_centroid(
            y=sound_librosa, sr=sample_rate, n_fft=n_fft, hop_length=hop_length)
        out_librosa = torch.from_numpy(out_librosa)[0]

        self.assertEqual(out_torch.type(out_librosa.dtype), out_librosa, atol=1e-5, rtol=1e-5)

    def test_MelScale(self):
        """MelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        hop_length = n_fft // 4
        sample_rate = 44100
        sound = common_utils.get_whitenoise(sample_rate=sample_rate, duration=60)
        sound = sound.mean(dim=0, keepdim=True)
        spec_ta = F.spectrogram(
            sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
            hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
        spec_lr = spec_ta.cpu().numpy().squeeze()
        # Perform MelScale with torchaudio and librosa
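        # Passing the precomputed spectrogram via `S=` makes librosa skip its own
        # STFT and apply only the mel filterbank, so both implementations start
        # from the same spectrogram.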
        melspec_ta = torchaudio.transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_ta)
        melspec_lr = librosa.feature.melspectrogram(
            S=spec_lr, sr=sample_rate, n_fft=n_fft, hop_length=hop_length,
            win_length=n_fft, center=True, window='hann', n_mels=n_mels, htk=True, norm=None)
        # Note: Using relaxed rtol instead of atol
        self.assertEqual(melspec_ta, torch.from_numpy(melspec_lr[None, ...]), atol=1e-8, rtol=1e-3)

    def test_InverseMelScale(self):
        """InverseMelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        n_stft = n_fft // 2 + 1
        hop_length = n_fft // 4

        # Prepare mel spectrogram input. We use torchaudio to compute one.
        path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        sound, sample_rate = common_utils.load_wav(path)
        sound = sound[:, 2**10:2**10 + 2**14]
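        # Use a short (2**14-sample) excerpt: both torchaudio's InverseMelScale and
        # librosa's mel_to_stft recover the linear spectrogram by iterative
        # approximation, which would be slow on the full clip.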
        sound = sound.mean(dim=0, keepdim=True)
        spec_orig = F.spectrogram(
            sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
            hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
        melspec_ta = torchaudio.transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_orig)
        melspec_lr = melspec_ta.cpu().numpy().squeeze()
        # Perform InverseMelScale with torchaudio and librosa
        spec_ta = torchaudio.transforms.InverseMelScale(
            n_stft, n_mels=n_mels, sample_rate=sample_rate)(melspec_ta)
        spec_lr = librosa.feature.inverse.mel_to_stft(
            melspec_lr, sr=sample_rate, n_fft=n_fft, power=2.0, htk=True, norm=None)
        spec_lr = torch.from_numpy(spec_lr[None, ...])

        # Align dimensions:
        # librosa's mel_to_stft returns a magnitude spectrogram while torchaudio's
        # InverseMelScale returns a power spectrogram, so take the square root on
        # the torchaudio side before comparing.
        spec_orig = spec_orig.sqrt()
        spec_ta = spec_ta.sqrt()

        threshold = 2.0
        # This threshold was chosen empirically, based on the following observation:
        #
        # torch.dist(spec_lr, spec_ta, p=float('inf'))
        # >>> tensor(1.9666)
        #
        # The spectrograms reconstructed by librosa and torchaudio are not comparable
        # elementwise because the two libraries use different approximation algorithms,
        # so the resulting values can differ in magnitude (although most of them are
        # very close).
        # See
        # https://github.com/pytorch/audio/pull/366 for the discussion of the choice of algorithm
        # https://github.com/pytorch/audio/pull/448/files#r385747021 for the distribution of P-inf
        # distance over frequencies.
        self.assertEqual(spec_ta, spec_lr, atol=threshold, rtol=1e-5)

        threshold = 1700.0
        # This threshold was chosen empirically, based on the following observations:
        #
        # torch.dist(spec_orig, spec_ta, p=1)
        # >>> tensor(1644.3516)
        # torch.dist(spec_orig, spec_lr, p=1)
        # >>> tensor(1420.7103)
        # torch.dist(spec_lr, spec_ta, p=1)
        # >>> tensor(943.2759)
        assert torch.dist(spec_orig, spec_ta, p=1) < threshold