add slaney normalization (#589)

* add slaney normalization.
* add torchscript.
* convert to string for torchscript compatibility.
* flake8.
* use string as default.

add slaney normalization (#589)
* add slaney normalization.
* add torchscript.
* convert to string for torchscript compatibility.
* flake8.
* use string as default.
995b75f8 · Vincent QB · GitHub · adb3d3da · 995b75f8 · 995b75f8
Unverified commit 995b75f8, authored May 14, 2020 by Vincent QB; committed by GitHub on May 14, 2020.
Showing with 22 additions and 5 deletions

test/test_librosa_compatibility.py test/test_librosa_compatibility.py +10 -3

test/test_torchscript_consistency.py test/test_torchscript_consistency.py +2 -1

torchaudio/functional.py torchaudio/functional.py +10 -1

No files found.
--- a/test/test_librosa_compatibility.py
+++ b/test/test_librosa_compatibility.py
@@ -53,19 +53,20 @@ class TestFunctional(_LibrosaMixin, unittest.TestCase):
        torch.testing.assert_allclose(ta_out, lr_out, atol=5e-5, rtol=1e-5)
-    def _test_create_fb(self, n_mels=40, sample_rate=22050, n_fft=2048, fmin=0.0, fmax=8000.0):
+    def _test_create_fb(self, n_mels=40, sample_rate=22050, n_fft=2048, fmin=0.0, fmax=8000.0, norm=None):
        librosa_fb = librosa.filters.mel(sr=sample_rate,
                                         n_fft=n_fft,
                                         n_mels=n_mels,
                                         fmax=fmax,
                                         fmin=fmin,
                                         htk=True,
-                                         norm=None)
+                                         norm=norm)
        fb = F.create_fb_matrix(sample_rate=sample_rate,
                                n_mels=n_mels,
                                f_max=fmax,
                                f_min=fmin,
-                                n_freqs=(n_fft // 2 + 1))
+                                n_freqs=(n_fft // 2 + 1),
+                                norm=norm)
        for i_mel_bank in range(n_mels):
            torch.testing.assert_allclose(fb[:, i_mel_bank], torch.tensor(librosa_fb[i_mel_bank]),
@@ -79,6 +80,12 @@ class TestFunctional(_LibrosaMixin, unittest.TestCase):
        self._test_create_fb(n_mels=56, fmin=800.0, fmax=900.0)
        self._test_create_fb(n_mels=56, fmin=1900.0, fmax=900.0)
        self._test_create_fb(n_mels=10, fmin=1900.0, fmax=900.0)
+        self._test_create_fb(n_mels=128, sample_rate=44100, norm="slaney")
+        self._test_create_fb(n_mels=128, fmin=2000.0, fmax=5000.0, norm="slaney")
+        self._test_create_fb(n_mels=56, fmin=100.0, fmax=9000.0, norm="slaney")
+        self._test_create_fb(n_mels=56, fmin=800.0, fmax=900.0, norm="slaney")
+        self._test_create_fb(n_mels=56, fmin=1900.0, fmax=900.0, norm="slaney")
+        self._test_create_fb(n_mels=10, fmin=1900.0, fmax=900.0, norm="slaney")
    def test_amplitude_to_DB(self):
        spec = torch.rand((6, 201))

--- a/test/test_torchscript_consistency.py
+++ b/test/test_torchscript_consistency.py
@@ -96,7 +96,8 @@ class Functional(common_utils.TestBaseMixin):
            f_max = 20.0
            n_mels = 10
            sample_rate = 16000
-            return F.create_fb_matrix(n_stft, f_min, f_max, n_mels, sample_rate)
+            norm = ""
+            return F.create_fb_matrix(n_stft, f_min, f_max, n_mels, sample_rate, norm)
        dummy = torch.zeros(1, 1)
        self._assert_consistency(func, dummy)

--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -335,7 +335,8 @@ def create_fb_matrix(
        f_min: float,
        f_max: float,
        n_mels: int,
-        sample_rate: int
+        sample_rate: int,
+        norm: str = "",
 ) -> Tensor:
    r"""Create a frequency bin conversion matrix.
@@ -345,6 +346,8 @@ def create_fb_matrix(
        f_max (float): Maximum frequency (Hz)
        n_mels (int): Number of mel filterbanks
        sample_rate (int): Sample rate of the audio waveform
+        norm (str): If 'slaney', divide the triangular mel weights by the width of the mel band
+        (area normalization). (Default: '')
    Returns:
        Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``)
@@ -372,6 +375,12 @@ def create_fb_matrix(
    down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1]  # (n_freqs, n_mels)
    up_slopes = slopes[:, 2:] / f_diff[1:]  # (n_freqs, n_mels)
    fb = torch.max(zero, torch.min(down_slopes, up_slopes))
+    if norm == "slaney":
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        enorm = 2.0 / (f_pts[2:n_mels + 2] - f_pts[:n_mels])
+        fb *= enorm.unsqueeze(0)
    return fb