Commit 7819f3f6 authored by Caroline Chen, committed by Facebook GitHub Bot

Move bark spectrogram to prototype (#2843)

Summary:
follow up to https://github.com/pytorch/audio/issues/2823
- move bark spectrogram to prototype
- decrease autograd test tolerance (passing on CircleCI)
- add diagram for bark fbanks

cc jdariasl

Pull Request resolved: https://github.com/pytorch/audio/pull/2843

Reviewed By: nateanl

Differential Revision: D41199522

Pulled By: carolineechen

fbshipit-source-id: 8e6c2e20fb7b14f39477683b3c6ed8356359a213
parent 6e334a46
@@ -17,7 +17,6 @@ Utility
amplitude_to_DB
DB_to_amplitude
melscale_fbanks
barkscale_fbanks
linear_fbanks
create_dct
mask_along_axis
......
@@ -9,6 +9,11 @@ add_noise
.. autofunction:: add_noise
barkscale_fbanks
~~~~~~~~~~~~~~~~
.. autofunction:: barkscale_fbanks
convolve
~~~~~~~~
......
@@ -11,3 +11,6 @@ torchaudio.prototype.transforms
Convolve
FFTConvolve
BarkScale
InverseBarkScale
BarkSpectrogram
@@ -84,9 +84,6 @@ Utility
AmplitudeToDB
MelScale
InverseMelScale
BarkScale
InverseBarkScale
MuLawEncoding
MuLawDecoding
Resample
@@ -104,7 +101,6 @@ Feature Extractions
Spectrogram
InverseSpectrogram
MelSpectrogram
BarkSpectrogram
GriffinLim
MFCC
LFCC
......
@@ -140,17 +140,6 @@ class Functional(TempDirMixin, TestBaseMixin):
norm = "slaney"
self._assert_consistency(F.melscale_fbanks, (n_stft, f_min, f_max, n_mels, sample_rate, norm, "htk"))
def test_barkscale_fbanks(self):
if self.device != torch.device("cpu"):
raise unittest.SkipTest("No need to perform test on device other than CPU")
n_stft = 100
f_min = 0.0
f_max = 20.0
n_barks = 10
sample_rate = 16000
self._assert_consistency(F.barkscale_fbanks, (n_stft, f_min, f_max, n_barks, sample_rate, "traunmuller"))
def test_linear_fbanks(self):
if self.device != torch.device("cpu"):
raise unittest.SkipTest("No need to perform test on device other than CPU")
......
import unittest
import torch
import torchaudio.prototype.functional as F
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin, torch_script
@@ -45,3 +47,14 @@ class TorchScriptConsistencyTestImpl(TestBaseMixin):
snr = torch.rand(*leading_dims, dtype=self.dtype, device=self.device, requires_grad=True) * 10
self._assert_consistency(F.add_noise, (waveform, noise, lengths, snr))
def test_barkscale_fbanks(self):
if self.device != torch.device("cpu"):
raise unittest.SkipTest("No need to perform test on device other than CPU")
n_stft = 100
f_min = 0.0
f_max = 20.0
n_barks = 10
sample_rate = 16000
self._assert_consistency(F.barkscale_fbanks, (n_stft, f_min, f_max, n_barks, sample_rate, "traunmuller"))
@@ -3,7 +3,7 @@ from typing import List
import torch
import torchaudio.prototype.transforms as T
from torch.autograd import gradcheck, gradgradcheck
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin
from torchaudio_unittest.common_utils import get_spectrogram, get_whitenoise, nested_params, TestBaseMixin
class Autograd(TestBaseMixin):
@@ -38,3 +38,21 @@ class Autograd(TestBaseMixin):
y = torch.rand(*leading_dims, L_y, dtype=self.dtype, device=self.device)
convolve = cls(mode=mode).to(dtype=self.dtype, device=self.device)
self.assert_grad(convolve, [x, y])
def test_barkspectrogram(self):
# replication_pad1d_backward_cuda is not deterministic and
# gives a very small (~1e-16) difference.
sample_rate = 8000
transform = T.BarkSpectrogram(sample_rate=sample_rate)
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
self.assert_grad(transform, [waveform], nondet_tol=1e-10)
def test_barkscale(self):
sample_rate = 8000
n_fft = 400
n_barks = n_fft // 2 + 1
transform = T.BarkScale(sample_rate=sample_rate, n_barks=n_barks)
spec = get_spectrogram(
get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2), n_fft=n_fft, power=1
)
self.assert_grad(transform, [spec])
import os
import torch
import torchaudio.prototype.transforms as T
import torchaudio.transforms as transforms
from torchaudio_unittest.common_utils import nested_params, TorchaudioTestCase
class BatchConsistencyTest(TorchaudioTestCase):
def assert_batch_consistency(self, transform, batch, *args, atol=1e-8, rtol=1e-5, seed=42, **kwargs):
n = batch.size(0)
# Compute items separately, then batch the result
torch.random.manual_seed(seed)
items_input = batch.clone()
items_result = torch.stack([transform(items_input[i], *args, **kwargs) for i in range(n)])
# Batch the input and run
torch.random.manual_seed(seed)
batch_input = batch.clone()
batch_result = transform(batch_input, *args, **kwargs)
self.assertEqual(items_input, batch_input, rtol=rtol, atol=atol)
self.assertEqual(items_result, batch_result, rtol=rtol, atol=atol)
@nested_params(
[T.Convolve, T.FFTConvolve],
["full", "valid", "same"],
@@ -26,3 +45,21 @@ class BatchConsistencyTest(TorchaudioTestCase):
)
self.assertEqual(expected, actual)
def test_batch_BarkScale(self):
specgram = torch.randn(3, 2, 201, 256)
atol = 1e-6 if os.name == "nt" else 1e-8
transform = T.BarkScale()
self.assert_batch_consistency(transform, specgram, atol=atol)
def test_batch_InverseBarkScale(self):
n_barks = 32
n_stft = 5
bark_spec = torch.randn(3, 2, n_barks, 32) ** 2
transform = T.InverseBarkScale(n_stft, n_barks)
# Because InverseBarkScale runs SGD on randomly initialized values, the results do not
# match exactly. For this reason, the tolerance is very relaxed here.
self.assert_batch_consistency(transform, bark_spec, atol=1.0, rtol=1e-5)
@@ -2,7 +2,11 @@ import numpy as np
import torch
import torchaudio.prototype.transforms as T
from scipy import signal
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin
from torchaudio_unittest.common_utils import get_spectrogram, get_whitenoise, nested_params, TestBaseMixin
def _get_ratio(mat):
return (mat.sum() / mat.numel()).item()
class TransformsTestImpl(TestBaseMixin):
@@ -52,3 +56,46 @@ class TransformsTestImpl(TestBaseMixin):
expected = torch.tensor(expected)
self.assertEqual(expected, actual)
def test_InverseBarkScale(self):
"""Gauge the quality of InverseBarkScale transform.
As InverseBarkScale is currently implemented with
random initialization + iterative optimization,
it is not practically possible to assert the difference between
the estimated spectrogram and the original spectrogram as a whole.
Estimated spectrogram has very huge descrepency locally.
Thus in this test we gauge what percentage of elements are bellow
certain tolerance.
At the moment, the quality of estimated spectrogram is worse than the
one obtained for Inverse MelScale.
When implementation is changed in a way it makes the quality even worse,
this test will fail.
"""
n_fft = 400
power = 1
n_barks = 64
sample_rate = 8000
n_stft = n_fft // 2 + 1
# Generate reference spectrogram and input bark-scaled spectrogram
expected = get_spectrogram(
get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2), n_fft=n_fft, power=power
).to(self.device, self.dtype)
input = T.BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_stft).to(self.device, self.dtype)(
expected
)
# Run transform
transform = T.InverseBarkScale(n_stft, n_barks=n_barks, sample_rate=sample_rate).to(self.device, self.dtype)
result = transform(input)
# Compare
epsilon = 1e-60
relative_diff = torch.abs((result - expected) / (expected + epsilon))
for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
print(f"Ratio of relative diff smaller than {tol:e} is " f"{_get_ratio(relative_diff < tol)}")
assert _get_ratio(relative_diff < 1e-1) > 0.2
assert _get_ratio(relative_diff < 1e-3) > 2e-3
@@ -90,14 +90,6 @@ class AutogradTestMixin(TestBaseMixin):
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
self.assert_grad(transform, [waveform], nondet_tol=1e-10)
def test_barkspectrogram(self):
# replication_pad1d_backward_cuda is not deterministic and
# gives a very small (~1e-16) difference.
sample_rate = 8000
transform = T.BarkSpectrogram(sample_rate=sample_rate)
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
self.assert_grad(transform, [waveform], nondet_tol=1e-10)
@nested_params(
[0, 0.99],
[False, True],
@@ -202,16 +194,6 @@ class AutogradTestMixin(TestBaseMixin):
)
self.assert_grad(transform, [spec])
def test_barkscale(self):
sample_rate = 8000
n_fft = 400
n_barks = n_fft // 2 + 1
transform = T.BarkScale(sample_rate=sample_rate, n_barks=n_barks)
spec = get_spectrogram(
get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2), n_fft=n_fft, power=1
)
self.assert_grad(transform, [spec])
@parameterized.expand([(1.5, "amplitude"), (2, "power"), (10, "db")])
def test_vol(self, gain, gain_type):
sample_rate = 8000
......
@@ -58,24 +58,6 @@ class TestTransforms(common_utils.TorchaudioTestCase):
# exactly same result. For this reason, tolerance is very relaxed here.
self.assert_batch_consistency(transform, mel_spec, atol=1.0, rtol=1e-5)
def test_batch_BarkScale(self):
specgram = torch.randn(3, 2, 201, 256)
atol = 1e-4 if os.name == "nt" else 1e-6
transform = T.BarkScale()
self.assert_batch_consistency(transform, specgram, atol=atol)
def test_batch_InverseBarkScale(self):
n_barks = 32
n_stft = 5
bark_spec = torch.randn(3, 2, n_barks, 32) ** 2
transform = T.InverseMelScale(n_stft, n_barks)
# Because InverseBarkScale runs SGD on randomly initialized values, the results do not
# match exactly. For this reason, the tolerance is very relaxed here.
self.assert_batch_consistency(transform, bark_spec, atol=1.0, rtol=1e-5)
def test_batch_compute_deltas(self):
specgram = torch.randn(3, 2, 31, 2786)
transform = T.ComputeDeltas()
......
@@ -131,84 +131,6 @@ class Tester(common_utils.TorchaudioTestCase):
self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.0).all())
self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_barkscale_load_save(self):
specgram = torch.ones(1, 201, 100)
barkscale_transform = transforms.BarkScale()
barkscale_transform(specgram)
barkscale_transform_copy = transforms.BarkScale()
barkscale_transform_copy.load_state_dict(barkscale_transform.state_dict())
fb = barkscale_transform.fb
fb_copy = barkscale_transform_copy.fb
self.assertEqual(fb_copy.size(), (201, 128))
self.assertEqual(fb, fb_copy)
def test_barkspectrogram_load_save(self):
waveform = self.waveform.float()
bark_spectrogram_transform = transforms.BarkSpectrogram()
bark_spectrogram_transform(waveform)
bark_spectrogram_transform_copy = transforms.BarkSpectrogram()
bark_spectrogram_transform_copy.load_state_dict(bark_spectrogram_transform.state_dict())
window = bark_spectrogram_transform.spectrogram.window
window_copy = bark_spectrogram_transform_copy.spectrogram.window
fb = bark_spectrogram_transform.bark_scale.fb
fb_copy = bark_spectrogram_transform_copy.bark_scale.fb
self.assertEqual(window, window_copy)
# the defaults are n_fft = 400 and n_barks = 128
self.assertEqual(fb_copy.size(), (201, 128))
self.assertEqual(fb, fb_copy)
def test_bark2(self):
top_db = 80.0
s2db = transforms.AmplitudeToDB("power", top_db)
waveform = self.waveform.clone() # (1, 16000)
waveform_scaled = self.scale(waveform) # (1, 16000)
bark_transform = transforms.BarkSpectrogram()
# check defaults
spectrogram_torch = s2db(bark_transform(waveform_scaled)) # (1, 128, 321)
self.assertTrue(spectrogram_torch.dim() == 3)
self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
self.assertEqual(spectrogram_torch.size(1), bark_transform.n_barks)
# check correctness of filterbank conversion matrix
self.assertTrue(bark_transform.bark_scale.fb.sum(1).le(1.0).all())
self.assertTrue(bark_transform.bark_scale.fb.sum(1).ge(0.0).all())
# check options
kwargs = {
"window_fn": torch.hamming_window,
"pad": 10,
"win_length": 500,
"hop_length": 125,
"n_fft": 800,
"n_barks": 50,
}
bark_transform2 = transforms.BarkSpectrogram(**kwargs)
spectrogram2_torch = s2db(bark_transform2(waveform_scaled)) # (1, 50, 513)
self.assertTrue(spectrogram2_torch.dim() == 3)
self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
self.assertEqual(spectrogram2_torch.size(1), bark_transform2.n_barks)
self.assertTrue(bark_transform2.bark_scale.fb.sum(1).le(1.0).all())
self.assertTrue(bark_transform2.bark_scale.fb.sum(1).ge(0.0).all())
# check on multi-channel audio
filepath = common_utils.get_asset_path("steam-train-whistle-daniel_simon.wav")
x_stereo = common_utils.load_wav(filepath)[0] # (2, 278756), 44100
spectrogram_stereo = s2db(bark_transform(x_stereo)) # (2, 128, 1394)
self.assertTrue(spectrogram_stereo.dim() == 3)
self.assertTrue(spectrogram_stereo.size(0) == 2)
self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
self.assertEqual(spectrogram_stereo.size(1), bark_transform.n_barks)
# check filterbank matrix creation
fb_matrix_transform = transforms.BarkScale(n_barks=100, sample_rate=16000, f_min=0.0, f_max=None, n_stft=400)
self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.0).all())
self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.0).all())
self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_mfcc_defaults(self):
"""Check the default configuration of the MFCC transform."""
sample_rate = 16000
@@ -374,9 +296,3 @@ class SmokeTest(common_utils.TorchaudioTestCase):
specgram = melspecgram.spectrogram
self.assertEqual(specgram.center, True)
self.assertEqual(specgram.pad_mode, "reflect")
def test_barkspectrogram(self):
barkspecgram = transforms.BarkSpectrogram(center=True, pad_mode="reflect")
specgram = barkspecgram.spectrogram
self.assertEqual(specgram.center, True)
self.assertEqual(specgram.pad_mode, "reflect")
@@ -52,49 +52,6 @@ class TransformsTestBase(TestBaseMixin):
assert _get_ratio(relative_diff < 1e-3) > 5e-3
assert _get_ratio(relative_diff < 1e-5) > 1e-5
def test_InverseBarkScale(self):
"""Gauge the quality of InverseBarkScale transform.
As InverseBarkScale is currently implemented with
random initialization + iterative optimization,
it is not practically possible to assert the difference between
the estimated spectrogram and the original spectrogram as a whole.
Estimated spectrogram has very huge descrepency locally.
Thus in this test we gauge what percentage of elements are bellow
certain tolerance.
At the moment, the quality of estimated spectrogram is worse than the
one obtained for Inverse MelScale.
When implementation is changed in a way it makes the quality even worse,
this test will fail.
"""
n_fft = 400
power = 1
n_barks = 64
sample_rate = 8000
n_stft = n_fft // 2 + 1
# Generate reference spectrogram and input bark-scaled spectrogram
expected = get_spectrogram(
get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2), n_fft=n_fft, power=power
).to(self.device, self.dtype)
input = T.BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_stft).to(self.device, self.dtype)(
expected
)
# Run transform
transform = T.InverseBarkScale(n_stft, n_barks=n_barks, sample_rate=sample_rate).to(self.device, self.dtype)
result = transform(input)
# Compare
epsilon = 1e-60
relative_diff = torch.abs((result - expected) / (expected + epsilon))
for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
print(f"Ratio of relative diff smaller than {tol:e} is " f"{_get_ratio(relative_diff < tol)}")
assert _get_ratio(relative_diff < 1e-1) > 0.2
assert _get_ratio(relative_diff < 1e-3) > 2e-3
@nested_params(
["sinc_interpolation", "kaiser_window"],
[16000, 44100],
......
@@ -26,7 +26,6 @@ from .functional import (
amplitude_to_DB,
apply_beamforming,
apply_codec,
barkscale_fbanks,
compute_deltas,
compute_kaldi_pitch,
create_dct,
@@ -62,7 +61,6 @@ __all__ = [
"compute_kaldi_pitch",
"create_dct",
"melscale_fbanks",
"barkscale_fbanks",
"linear_fbanks",
"DB_to_amplitude",
"loudness",
......
@@ -22,7 +22,6 @@ __all__ = [
"compute_deltas",
"compute_kaldi_pitch",
"melscale_fbanks",
"barkscale_fbanks",
"linear_fbanks",
"create_dct",
"compute_deltas",
@@ -480,121 +479,6 @@ def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor:
return freqs
def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
r"""Convert Hz to Barks.
Args:
freqs (float): Frequencies in Hz
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
barks (float): Frequency in Barks
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
if bark_scale == "wang":
return 6.0 * math.asinh(freqs / 600.0)
elif bark_scale == "schroeder":
return 7.0 * math.asinh(freqs / 650.0)
# Traunmuller Bark scale
barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
# Bark value correction
if barks < 2:
barks += 0.15 * (2 - barks)
elif barks > 20.1:
barks += 0.22 * (barks - 20.1)
return barks
def _bark_to_hz(barks: Tensor, bark_scale: str = "traunmuller") -> Tensor:
"""Convert bark bin numbers to frequencies.
Args:
barks (Tensor): Bark frequencies
bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
freqs (Tensor): Barks converted in Hz
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
if bark_scale == "wang":
return 600.0 * torch.sinh(barks / 6.0)
elif bark_scale == "schroeder":
return 650.0 * torch.sinh(barks / 7.0)
# Bark value correction
if any(barks < 2):
idx = barks < 2
barks[idx] = (barks[idx] - 0.3) / 0.85
elif any(barks > 20.1):
idx = barks > 20.1
barks[idx] = (barks[idx] + 4.422) / 1.22
# Traunmuller Bark scale
freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
return freqs
def barkscale_fbanks(
n_freqs: int,
f_min: float,
f_max: float,
n_barks: int,
sample_rate: int,
bark_scale: str = "traunmuller",
) -> Tensor:
r"""Create a frequency bin conversion matrix.
.. devices:: CPU
.. properties:: TorchScript
Args:
n_freqs (int): Number of frequencies to highlight/apply
f_min (float): Minimum frequency (Hz)
f_max (float): Maximum frequency (Hz)
n_barks (int): Number of bark filterbanks
sample_rate (int): Sample rate of the audio waveform
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``),
i.e., the number of frequencies to highlight/apply by the number of filterbanks.
Each column is a filterbank so that assuming there is a matrix A of
size (..., ``n_freqs``), the applied result would be
``A * barkscale_fbanks(A.size(-1), ...)``.
"""
# freq bins
all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
# calculate bark freq bins
m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
m_pts = torch.linspace(m_min, m_max, n_barks + 2)
f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
# create filterbank
fb = _create_triangular_filterbank(all_freqs, f_pts)
if (fb.max(dim=0).values == 0.0).any():
warnings.warn(
"At least one bark filterbank has all zero values. "
f"The value for `n_barks` ({n_barks}) may be set too high. "
f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
)
return fb
def _create_triangular_filterbank(
all_freqs: Tensor,
f_pts: Tensor,
......
from .functional import add_noise, convolve, fftconvolve
from .functional import add_noise, barkscale_fbanks, convolve, fftconvolve
__all__ = ["add_noise", "convolve", "fftconvolve"]
__all__ = ["add_noise", "barkscale_fbanks", "convolve", "fftconvolve"]
import math
import warnings
import torch
from torchaudio.functional.functional import _create_triangular_filterbank
def _check_convolve_mode(mode: str) -> None:
valid_convolve_modes = ["full", "valid", "same"]
@@ -172,3 +177,121 @@ def add_noise(waveform: torch.Tensor, noise: torch.Tensor, lengths: torch.Tensor
scaled_noise = scale.unsqueeze(-1) * noise # (*, 1) * (*, L) = (*, L)
return waveform + scaled_noise # (*, L)
def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
r"""Convert Hz to Barks.
Args:
freqs (float): Frequencies in Hz
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
barks (float): Frequency in Barks
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
if bark_scale == "wang":
return 6.0 * math.asinh(freqs / 600.0)
elif bark_scale == "schroeder":
return 7.0 * math.asinh(freqs / 650.0)
# Traunmuller Bark scale
barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
# Bark value correction
if barks < 2:
barks += 0.15 * (2 - barks)
elif barks > 20.1:
barks += 0.22 * (barks - 20.1)
return barks
def _bark_to_hz(barks: torch.Tensor, bark_scale: str = "traunmuller") -> torch.Tensor:
"""Convert bark bin numbers to frequencies.
Args:
barks (torch.Tensor): Bark frequencies
bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
freqs (torch.Tensor): Barks converted in Hz
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
if bark_scale == "wang":
return 600.0 * torch.sinh(barks / 6.0)
elif bark_scale == "schroeder":
return 650.0 * torch.sinh(barks / 7.0)
# Bark value correction
if any(barks < 2):
idx = barks < 2
barks[idx] = (barks[idx] - 0.3) / 0.85
elif any(barks > 20.1):
idx = barks > 20.1
barks[idx] = (barks[idx] + 4.422) / 1.22
# Traunmuller Bark scale
freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
return freqs
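As a quick illustrative check of the Traunmüller pair above (editorial sketch, not part of this commit), round-tripping a mid-range frequency where neither correction branch applies should recover the input closely. The snippet assumes the private helpers stay importable from torchaudio.prototype.functional.functional, which is an implementation detail rather than a public API.
import torch
from torchaudio.prototype.functional.functional import _bark_to_hz, _hz_to_bark
freq_hz = 1000.0
bark = _hz_to_bark(freq_hz, bark_scale="traunmuller")  # scalar Bark value, no correction branch taken
roundtrip = _bark_to_hz(torch.tensor([bark]), bark_scale="traunmuller")
print(bark, roundtrip)  # roundtrip should be close to 1000.0 Hz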
def barkscale_fbanks(
n_freqs: int,
f_min: float,
f_max: float,
n_barks: int,
sample_rate: int,
bark_scale: str = "traunmuller",
) -> torch.Tensor:
r"""Create a frequency bin conversion matrix.
.. devices:: CPU
.. properties:: TorchScript
.. image:: https://download.pytorch.org/torchaudio/doc-assets/bark_fbanks.png
:alt: Visualization of generated filter bank
Args:
n_freqs (int): Number of frequencies to highlight/apply
f_min (float): Minimum frequency (Hz)
f_max (float): Maximum frequency (Hz)
n_barks (int): Number of bark filterbanks
sample_rate (int): Sample rate of the audio waveform
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
torch.Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``),
i.e., the number of frequencies to highlight/apply by the number of filterbanks.
Each column is a filterbank so that assuming there is a matrix A of
size (..., ``n_freqs``), the applied result would be
``A * barkscale_fbanks(A.size(-1), ...)``.
"""
# freq bins
all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
# calculate bark freq bins
m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
m_pts = torch.linspace(m_min, m_max, n_barks + 2)
f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
# create filterbank
fb = _create_triangular_filterbank(all_freqs, f_pts)
if (fb.max(dim=0).values == 0.0).any():
warnings.warn(
"At least one bark filterbank has all zero values. "
f"The value for `n_barks` ({n_barks}) may be set too high. "
f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
)
return fb
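For orientation, here is a short editorial sketch (not part of the diff) of how the returned matrix is applied, exactly as the docstring describes: a spectrogram with ``n_freqs`` bins is matrix-multiplied with the (``n_freqs``, ``n_barks``) filter bank. It assumes the function is exposed as torchaudio.prototype.functional.barkscale_fbanks, matching the __init__ change above.
import torch
import torchaudio
from torchaudio.prototype.functional import barkscale_fbanks
sample_rate, n_fft = 16000, 400
n_freqs = n_fft // 2 + 1  # 201 bins from a one-sided STFT
waveform = torch.randn(1, sample_rate)  # stand-in one-second signal
spec = torchaudio.transforms.Spectrogram(n_fft=n_fft)(waveform)  # (1, 201, time)
fb = barkscale_fbanks(n_freqs, 0.0, sample_rate / 2, 32, sample_rate, "traunmuller")  # (201, 32)
# (1, time, 201) @ (201, 32) -> (1, time, 32) -> (1, 32, time)
bark_spec = torch.matmul(spec.transpose(-1, -2), fb).transpose(-1, -2)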
from ._transforms import Convolve, FFTConvolve
from ._transforms import BarkScale, BarkSpectrogram, Convolve, FFTConvolve, InverseBarkScale
__all__ = ["Convolve", "FFTConvolve"]
__all__ = [
"BarkScale",
"BarkSpectrogram",
"Convolve",
"FFTConvolve",
"InverseBarkScale",
]
from typing import Callable, Optional
import torch
from torchaudio.prototype.functional import convolve, fftconvolve
from torchaudio.prototype.functional import barkscale_fbanks, convolve, fftconvolve
from torchaudio.prototype.functional.functional import _check_convolve_mode
from torchaudio.transforms import Spectrogram
class Convolve(torch.nn.Module):
@@ -89,3 +92,295 @@ class FFTConvolve(torch.nn.Module):
the leading dimensions match those of ``x`` and `L` is dictated by ``mode``.
"""
return fftconvolve(x, y, mode=self.mode)
class BarkScale(torch.nn.Module):
r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
f_min (float, optional): Minimum frequency. (Default: ``0.``)
f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
>>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
>>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
>>> spectrogram = spectrogram_transform(waveform)
>>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
>>> barkscale_spectrogram = barkscale_transform(spectrogram)
See also:
:py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
generate the filter banks.
"""
__constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]
def __init__(
self,
n_barks: int = 128,
sample_rate: int = 16000,
f_min: float = 0.0,
f_max: Optional[float] = None,
n_stft: int = 201,
bark_scale: str = "traunmuller",
) -> None:
super(BarkScale, self).__init__()
self.n_barks = n_barks
self.sample_rate = sample_rate
self.f_max = f_max if f_max is not None else float(sample_rate // 2)
self.f_min = f_min
self.bark_scale = bark_scale
if f_min > self.f_max:
raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale)
self.register_buffer("fb", fb)
def forward(self, specgram: torch.Tensor) -> torch.Tensor:
r"""
Args:
specgram (torch.Tensor): A spectrogram STFT of dimension (..., freq, time).
Returns:
torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
"""
# (..., time, freq) dot (freq, n_barks) -> (..., n_barks, time)
bark_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
return bark_specgram
class InverseBarkScale(torch.nn.Module):
r"""Estimate a STFT in normal frequency domain from bark frequency domain.
.. devices:: CPU CUDA
It minimizes the Euclidean norm between the input bark-spectrogram and the product of
the estimated spectrogram and the filter banks using SGD.
Args:
n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
f_min (float, optional): Minimum frequency. (Default: ``0.``)
f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
>>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
>>> bark_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
>>> bark_spectrogram = bark_spectrogram_transform(waveform)
>>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
>>> spectrogram = inverse_barkscale_transform(bark_spectrogram)
"""
__constants__ = [
"n_stft",
"n_barks",
"sample_rate",
"f_min",
"f_max",
"max_iter",
"tolerance_loss",
"tolerance_change",
"sgdargs",
]
def __init__(
self,
n_stft: int,
n_barks: int = 128,
sample_rate: int = 16000,
f_min: float = 0.0,
f_max: Optional[float] = None,
max_iter: int = 100000,
tolerance_loss: float = 1e-5,
tolerance_change: float = 1e-8,
sgdargs: Optional[dict] = None,
bark_scale: str = "traunmuller",
) -> None:
super(InverseBarkScale, self).__init__()
self.n_barks = n_barks
self.sample_rate = sample_rate
self.f_max = f_max or float(sample_rate // 2)
self.f_min = f_min
self.max_iter = max_iter
self.tolerance_loss = tolerance_loss
self.tolerance_change = tolerance_change
self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
if f_min > self.f_max:
raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
self.register_buffer("fb", fb)
def forward(self, barkspec: torch.Tensor) -> torch.Tensor:
r"""
Args:
barkspec (torch.Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)
Returns:
torch.Tensor: Linear scale spectrogram of size (..., freq, time)
"""
# pack batch
shape = barkspec.size()
barkspec = barkspec.view(-1, shape[-2], shape[-1])
n_barks, time = shape[-2], shape[-1]
freq, _ = self.fb.size()  # (freq, n_barks)
barkspec = barkspec.transpose(-1, -2)
if self.n_barks != n_barks:
raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))
specgram = torch.rand(
barkspec.size()[0], time, freq, requires_grad=True, dtype=barkspec.dtype, device=barkspec.device
)
optim = torch.optim.SGD([specgram], **self.sgdargs)
loss = float("inf")
for _ in range(self.max_iter):
optim.zero_grad()
diff = barkspec - specgram.matmul(self.fb)
new_loss = diff.pow(2).sum(axis=-1).mean()
# take the sum over the bark-frequency axis, then average over the other dimensions
# so that the loss threshold is applied per unit time frame
new_loss.backward()
optim.step()
specgram.data = specgram.data.clamp(min=0)
new_loss = new_loss.item()
if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
break
loss = new_loss
specgram.requires_grad_(False)
specgram = specgram.clamp(min=0).transpose(-1, -2)
# unpack batch
specgram = specgram.view(shape[:-2] + (freq, time))
return specgram
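A minimal round-trip sketch (editorial, mirroring the quality test earlier in this diff) of how the estimate can be evaluated; max_iter is lowered here only to keep the example fast, so the recovered quality will be rougher than with the defaults, and the exact numbers depend on the random initialization.
import torch
from torchaudio.prototype.transforms import BarkScale, InverseBarkScale
n_fft, n_barks, sample_rate = 400, 64, 8000
n_stft = n_fft // 2 + 1
spec = torch.rand(2, n_stft, 50)  # stand-in magnitude spectrogram, (channel, freq, time)
bark_spec = BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_stft)(spec)
recovered = InverseBarkScale(n_stft, n_barks=n_barks, sample_rate=sample_rate, max_iter=1000)(bark_spec)
rel_diff = ((recovered - spec) / (spec + 1e-60)).abs()
print((rel_diff < 1e-1).float().mean().item())  # fraction of entries within 10% relative error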
class BarkSpectrogram(torch.nn.Module):
r"""Create BarkSpectrogram for a raw audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and
:py:func:`torchaudio.prototype.transforms.BarkScale`.
Sources
* https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
* Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical Society of America, Vol. 88, Issue 1, 1990, pp. 97–100.
* https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html
Args:
sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
win_length (int or None, optional): Window size. (Default: ``n_fft``)
hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
f_min (float, optional): Minimum frequency. (Default: ``0.``)
f_max (float or None, optional): Maximum frequency. (Default: ``None``)
pad (int, optional): Two sided padding of signal. (Default: ``0``)
n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
power (float, optional): Exponent for the magnitude spectrogram,
(must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
center (bool, optional): whether to pad :attr:`waveform` on both sides so
that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
(Default: ``True``)
pad_mode (string, optional): controls the padding method used when
:attr:`center` is ``True``. (Default: ``"reflect"``)
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
>>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
>>> transform = transforms.BarkSpectrogram(sample_rate)
>>> bark_specgram = transform(waveform) # (channel, n_barks, time)
See also:
:py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
generate the filter banks.
"""
__constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]
def __init__(
self,
sample_rate: int = 16000,
n_fft: int = 400,
win_length: Optional[int] = None,
hop_length: Optional[int] = None,
f_min: float = 0.0,
f_max: Optional[float] = None,
pad: int = 0,
n_barks: int = 128,
window_fn: Callable[..., torch.Tensor] = torch.hann_window,
power: float = 2.0,
normalized: bool = False,
wkwargs: Optional[dict] = None,
center: bool = True,
pad_mode: str = "reflect",
bark_scale: str = "traunmuller",
) -> None:
super(BarkSpectrogram, self).__init__()
self.sample_rate = sample_rate
self.n_fft = n_fft
self.win_length = win_length if win_length is not None else n_fft
self.hop_length = hop_length if hop_length is not None else self.win_length // 2
self.pad = pad
self.power = power
self.normalized = normalized
self.n_barks = n_barks # number of bark frequency bins
self.f_max = f_max
self.f_min = f_min
self.spectrogram = Spectrogram(
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length,
pad=self.pad,
window_fn=window_fn,
power=self.power,
normalized=self.normalized,
wkwargs=wkwargs,
center=center,
pad_mode=pad_mode,
onesided=True,
)
self.bark_scale = BarkScale(
self.n_barks, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, bark_scale
)
def forward(self, waveform: torch.Tensor) -> torch.Tensor:
r"""
Args:
waveform (torch.Tensor): torch.Tensor of audio of dimension (..., time).
Returns:
torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
"""
specgram = self.spectrogram(waveform)
bark_specgram = self.bark_scale(specgram)
return bark_specgram
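Since the docstring states that BarkSpectrogram composes Spectrogram and BarkScale, a short editorial sketch (not part of the diff) can make the equivalence concrete; with matching parameters the fused transform and the explicit two-step pipeline should agree.
import torch
from torchaudio.transforms import Spectrogram
from torchaudio.prototype.transforms import BarkScale, BarkSpectrogram
sample_rate, n_fft, n_barks = 16000, 400, 64
waveform = torch.randn(1, sample_rate)  # stand-in one-second signal
fused = BarkSpectrogram(sample_rate=sample_rate, n_fft=n_fft, n_barks=n_barks)(waveform)
spec = Spectrogram(n_fft=n_fft)(waveform)
composed = BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_fft // 2 + 1)(spec)
print(torch.allclose(fused, composed))  # expected: True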
from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
from ._transforms import (
AmplitudeToDB,
BarkScale,
BarkSpectrogram,
ComputeDeltas,
Fade,
FrequencyMasking,
GriffinLim,
InverseBarkScale,
InverseMelScale,
InverseSpectrogram,
LFCC,
@@ -37,16 +34,13 @@ __all__ = [
"FrequencyMasking",
"GriffinLim",
"InverseMelScale",
"InverseBarkScale",
"InverseSpectrogram",
"LFCC",
"Loudness",
"MFCC",
"MVDR",
"MelScale",
"BarkScale",
"MelSpectrogram",
"BarkSpectrogram",
"MuLawDecoding",
"MuLawEncoding",
"PSD",
......