Commit b326bc49 authored by Julián D. Arias-Londoño's avatar Julián D. Arias-Londoño Committed by Facebook GitHub Bot
Browse files

BarkSpectrogram (#2823)

Summary:
I have added the BarkScale transform, which can convert a regular Spectrogram into a Bark-scale spectrogram, similar to MelScale. ahmed-fau requested this feature in December 2021 (https://github.com/pytorch/audio/issues/2103). The new functionality includes three different well-known approximations of the Bark scale.

Pull Request resolved: https://github.com/pytorch/audio/pull/2823

Reviewed By: nateanl

Differential Revision: D41162100

Pulled By: carolineechen

fbshipit-source-id: b2670c4972e49c9ef424da5d5982576f7a4df831
parent 74f9a894
......@@ -17,6 +17,7 @@ Utility
amplitude_to_DB
DB_to_amplitude
melscale_fbanks
barkscale_fbanks
linear_fbanks
create_dct
mask_along_axis
......
......@@ -85,6 +85,8 @@ Utility
AmplitudeToDB
MelScale
InverseMelScale
BarkScale
InverseBarkScale
MuLawEncoding
MuLawDecoding
Resample
......@@ -102,6 +104,7 @@ Feature Extractions
Spectrogram
InverseSpectrogram
MelSpectrogram
BarkSpectrogram
GriffinLim
MFCC
LFCC
......
......@@ -140,6 +140,17 @@ class Functional(TempDirMixin, TestBaseMixin):
norm = "slaney"
self._assert_consistency(F.melscale_fbanks, (n_stft, f_min, f_max, n_mels, sample_rate, norm, "htk"))
def test_barkscale_fbanks(self):
    """TorchScript consistency check for ``F.barkscale_fbanks`` (CPU only)."""
    if self.device != torch.device("cpu"):
        raise unittest.SkipTest("No need to perform test on device other than CPU")
    # Positional args: (n_freqs, f_min, f_max, n_barks, sample_rate, bark_scale)
    args = (100, 0.0, 20.0, 10, 16000, "traunmuller")
    self._assert_consistency(F.barkscale_fbanks, args)
def test_linear_fbanks(self):
if self.device != torch.device("cpu"):
raise unittest.SkipTest("No need to perform test on device other than CPU")
......
......@@ -90,6 +90,14 @@ class AutogradTestMixin(TestBaseMixin):
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
self.assert_grad(transform, [waveform], nondet_tol=1e-10)
def test_barkspectrogram(self):
    """Autograd check for T.BarkSpectrogram on multi-channel white noise."""
    # replication_pad1d_backward_cuda is not deterministic and
    # gives a very small (~e-16) difference, hence the nondet tolerance.
    sample_rate = 8000
    transform = T.BarkSpectrogram(sample_rate=sample_rate)
    waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
    self.assert_grad(transform, [waveform], nondet_tol=1e-10)
@nested_params(
[0, 0.99],
[False, True],
......@@ -194,6 +202,16 @@ class AutogradTestMixin(TestBaseMixin):
)
self.assert_grad(transform, [spec])
def test_barkscale(self):
    """Autograd check for T.BarkScale applied to a magnitude spectrogram."""
    rate = 8000
    fft_size = 400
    num_barks = fft_size // 2 + 1
    noise = get_whitenoise(sample_rate=rate, duration=0.05, n_channels=2)
    spec = get_spectrogram(noise, n_fft=fft_size, power=1)
    transform = T.BarkScale(sample_rate=rate, n_barks=num_barks)
    self.assert_grad(transform, [spec])
@parameterized.expand([(1.5, "amplitude"), (2, "power"), (10, "db")])
def test_vol(self, gain, gain_type):
sample_rate = 8000
......
......@@ -58,6 +58,24 @@ class TestTransforms(common_utils.TorchaudioTestCase):
# exactly same result. For this reason, tolerance is very relaxed here.
self.assert_batch_consistency(transform, mel_spec, atol=1.0, rtol=1e-5)
def test_batch_BarkScale(self):
    """Batched and per-item BarkScale outputs must agree."""
    spec = torch.randn(3, 2, 201, 256)
    # Windows builds need a looser tolerance.
    tolerance = 1e-4 if os.name == "nt" else 1e-6
    self.assert_batch_consistency(T.BarkScale(), spec, atol=tolerance)
def test_batch_InverseBarkScale(self):
    """Batched and per-item InverseBarkScale outputs must roughly agree."""
    n_barks = 32
    n_stft = 5
    bark_spec = torch.randn(3, 2, n_barks, 32) ** 2
    # The transform under test is InverseBarkScale (the original code
    # instantiated InverseMelScale, so the Bark variant was never exercised).
    transform = T.InverseBarkScale(n_stft, n_barks)
    # InverseBarkScale runs SGD from random initialization, so repeated runs
    # do not yield exactly the same result; tolerance is very relaxed here.
    self.assert_batch_consistency(transform, bark_spec, atol=1.0, rtol=1e-5)
def test_batch_compute_deltas(self):
specgram = torch.randn(3, 2, 31, 2786)
transform = T.ComputeDeltas()
......
......@@ -131,6 +131,84 @@ class Tester(common_utils.TorchaudioTestCase):
self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.0).all())
self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_barkscale_load_save(self):
    """The BarkScale filterbank buffer must survive a state_dict round trip."""
    original = transforms.BarkScale()
    original(torch.ones(1, 201, 100))
    restored = transforms.BarkScale()
    restored.load_state_dict(original.state_dict())
    # Defaults: 201 STFT bins x 128 bark bins.
    self.assertEqual(restored.fb.size(), (201, 128))
    self.assertEqual(original.fb, restored.fb)
def test_barkspectrogram_load_save(self):
    """Window and filterbank buffers must survive a state_dict round trip."""
    waveform = self.waveform.float()
    bark_spectrogram_transform = transforms.BarkSpectrogram()
    bark_spectrogram_transform(waveform)
    bark_spectrogram_transform_copy = transforms.BarkSpectrogram()
    bark_spectrogram_transform_copy.load_state_dict(bark_spectrogram_transform.state_dict())

    window = bark_spectrogram_transform.spectrogram.window
    window_copy = bark_spectrogram_transform_copy.spectrogram.window

    fb = bark_spectrogram_transform.bark_scale.fb
    fb_copy = bark_spectrogram_transform_copy.bark_scale.fb
    self.assertEqual(window, window_copy)
    # fb shape for the defaults: n_fft = 400 -> 201 freq bins, n_barks = 128
    self.assertEqual(fb_copy.size(), (201, 128))
    self.assertEqual(fb, fb_copy)
def test_bark2(self):
    """End-to-end checks of BarkSpectrogram: defaults, custom options,
    multi-channel input, and filterbank construction."""
    top_db = 80.0
    s2db = transforms.AmplitudeToDB("power", top_db)

    waveform = self.waveform.clone()  # (1, 16000)
    waveform_scaled = self.scale(waveform)  # (1, 16000)

    bark_transform = transforms.BarkSpectrogram()
    # check defaults
    spectrogram_torch = s2db(bark_transform(waveform_scaled))  # (1, 128, 321)
    self.assertTrue(spectrogram_torch.dim() == 3)
    self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
    self.assertEqual(spectrogram_torch.size(1), bark_transform.n_barks)

    # check correctness of filterbank conversion matrix
    self.assertTrue(bark_transform.bark_scale.fb.sum(1).le(1.0).all())
    self.assertTrue(bark_transform.bark_scale.fb.sum(1).ge(0.0).all())

    # check options
    kwargs = {
        "window_fn": torch.hamming_window,
        "pad": 10,
        "win_length": 500,
        "hop_length": 125,
        "n_fft": 800,
        "n_barks": 50,
    }
    bark_transform2 = transforms.BarkSpectrogram(**kwargs)
    spectrogram2_torch = s2db(bark_transform2(waveform_scaled))  # (1, 50, 513)
    self.assertTrue(spectrogram2_torch.dim() == 3)
    # assert on the spectrogram under test (the original re-checked spectrogram_torch)
    self.assertTrue(spectrogram2_torch.ge(spectrogram2_torch.max() - top_db).all())
    self.assertEqual(spectrogram2_torch.size(1), bark_transform2.n_barks)
    self.assertTrue(bark_transform2.bark_scale.fb.sum(1).le(1.0).all())
    self.assertTrue(bark_transform2.bark_scale.fb.sum(1).ge(0.0).all())

    # check on multi-channel audio
    filepath = common_utils.get_asset_path("steam-train-whistle-daniel_simon.wav")
    x_stereo = common_utils.load_wav(filepath)[0]  # (2, 278756), 44100
    spectrogram_stereo = s2db(bark_transform(x_stereo))  # (2, 128, 1394)
    self.assertTrue(spectrogram_stereo.dim() == 3)
    self.assertTrue(spectrogram_stereo.size(0) == 2)
    # assert on the stereo spectrogram (the original re-checked spectrogram_torch)
    self.assertTrue(spectrogram_stereo.ge(spectrogram_stereo.max() - top_db).all())
    self.assertEqual(spectrogram_stereo.size(1), bark_transform.n_barks)

    # check filterbank matrix creation
    fb_matrix_transform = transforms.BarkScale(n_barks=100, sample_rate=16000, f_min=0.0, f_max=None, n_stft=400)
    self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.0).all())
    self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.0).all())
    self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_mfcc_defaults(self):
"""Check the default configuration of the MFCC transform."""
sample_rate = 16000
......@@ -296,3 +374,9 @@ class SmokeTest(common_utils.TorchaudioTestCase):
specgram = melspecgram.spectrogram
self.assertEqual(specgram.center, True)
self.assertEqual(specgram.pad_mode, "reflect")
def test_barkspectrogram(self):
    """Spectrogram kwargs must be forwarded through BarkSpectrogram."""
    inner = transforms.BarkSpectrogram(center=True, pad_mode="reflect").spectrogram
    self.assertEqual(inner.center, True)
    self.assertEqual(inner.pad_mode, "reflect")
......@@ -52,6 +52,49 @@ class TransformsTestBase(TestBaseMixin):
assert _get_ratio(relative_diff < 1e-3) > 5e-3
assert _get_ratio(relative_diff < 1e-5) > 1e-5
def test_InverseBarkScale(self):
    """Gauge the quality of InverseBarkScale transform.

    As InverseBarkScale is currently implemented with
    random initialization + iterative optimization,
    it is not practically possible to assert the difference between
    the estimated spectrogram and the original spectrogram as a whole.
    The estimated spectrogram has very large discrepancies locally.
    Thus in this test we gauge what percentage of elements are below
    certain tolerances.

    At the moment, the quality of the estimated spectrogram is worse than
    the one obtained from InverseMelScale.

    When the implementation is changed in a way that makes the quality even
    worse, this test will fail.
    """
    n_fft = 400
    power = 1
    n_barks = 64
    sample_rate = 8000

    n_stft = n_fft // 2 + 1

    # Generate reference spectrogram and input bark-scaled spectrogram
    expected = get_spectrogram(
        get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2), n_fft=n_fft, power=power
    ).to(self.device, self.dtype)
    input = T.BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_stft).to(self.device, self.dtype)(
        expected
    )

    # Run transform
    transform = T.InverseBarkScale(n_stft, n_barks=n_barks, sample_rate=sample_rate).to(self.device, self.dtype)
    result = transform(input)

    # Compare
    epsilon = 1e-60
    relative_diff = torch.abs((result - expected) / (expected + epsilon))

    for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
        print(f"Ratio of relative diff smaller than {tol:e} is " f"{_get_ratio(relative_diff < tol)}")

    assert _get_ratio(relative_diff < 1e-1) > 0.2
    assert _get_ratio(relative_diff < 1e-3) > 2e-3
@nested_params(
["sinc_interpolation", "kaiser_window"],
[16000, 44100],
......
......@@ -26,6 +26,7 @@ from .functional import (
amplitude_to_DB,
apply_beamforming,
apply_codec,
barkscale_fbanks,
compute_deltas,
compute_kaldi_pitch,
create_dct,
......@@ -61,6 +62,7 @@ __all__ = [
"compute_kaldi_pitch",
"create_dct",
"melscale_fbanks",
"barkscale_fbanks",
"linear_fbanks",
"DB_to_amplitude",
"loudness",
......
......@@ -22,6 +22,7 @@ __all__ = [
"compute_deltas",
"compute_kaldi_pitch",
"melscale_fbanks",
"barkscale_fbanks",
"linear_fbanks",
"create_dct",
"compute_deltas",
......@@ -479,6 +480,121 @@ def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor:
return freqs
def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
r"""Convert Hz to Barks.
Args:
freqs (float): Frequencies in Hz
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
barks (float): Frequency in Barks
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
if bark_scale == "wang":
return 6.0 * math.asinh(freqs / 600.0)
elif bark_scale == "schroeder":
return 7.0 * math.asinh(freqs / 650.0)
# Traunmuller Bark scale
barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
# Bark value correction
if barks < 2:
barks += 0.15 * (2 - barks)
elif barks > 20.1:
barks += 0.22 * (barks - 20.1)
return barks
def _bark_to_hz(barks: Tensor, bark_scale: str = "traunmuller") -> Tensor:
"""Convert bark bin numbers to frequencies.
Args:
barks (Tensor): Bark frequencies
bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
freqs (Tensor): Barks converted in Hz
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
if bark_scale == "wang":
return 600.0 * torch.sinh(barks / 6.0)
elif bark_scale == "schroeder":
return 650.0 * torch.sinh(barks / 7.0)
# Bark value correction
if any(barks < 2):
idx = barks < 2
barks[idx] = (barks[idx] - 0.3) / 0.85
elif any(barks > 20.1):
idx = barks > 20.1
barks[idx] = (barks[idx] + 4.422) / 1.22
# Traunmuller Bark scale
freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
return freqs
def barkscale_fbanks(
    n_freqs: int,
    f_min: float,
    f_max: float,
    n_barks: int,
    sample_rate: int,
    bark_scale: str = "traunmuller",
) -> Tensor:
    r"""Create a frequency bin conversion matrix.

    .. devices:: CPU

    .. properties:: TorchScript

    Args:
        n_freqs (int): Number of frequencies to highlight/apply
        f_min (float): Minimum frequency (Hz)
        f_max (float): Maximum frequency (Hz)
        n_barks (int): Number of bark filterbanks
        sample_rate (int): Sample rate of the audio waveform
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``.
            (Default: ``traunmuller``)

    Returns:
        Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``)
        meaning number of frequencies to highlight/apply to x the number of filterbanks.
        Each column is a filterbank so that assuming there is a matrix A of
        size (..., ``n_freqs``), the applied result would be
        ``A * barkscale_fbanks(A.size(-1), ...)``.
    """

    # freq bins
    all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)

    # calculate bark freq bins (n_barks + 2 points: band edges plus centers)
    m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
    m_max = _hz_to_bark(f_max, bark_scale=bark_scale)

    m_pts = torch.linspace(m_min, m_max, n_barks + 2)
    f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)

    # create filterbank
    fb = _create_triangular_filterbank(all_freqs, f_pts)

    # A filter with all-zero response cannot contribute to the output;
    # warn rather than fail so callers can adjust their parameters.
    if (fb.max(dim=0).values == 0.0).any():
        warnings.warn(
            "At least one bark filterbank has all zero values. "
            f"The value for `n_barks` ({n_barks}) may be set too high. "
            f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
        )

    return fb
def _create_triangular_filterbank(
all_freqs: Tensor,
f_pts: Tensor,
......
from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
from ._transforms import (
AmplitudeToDB,
BarkScale,
BarkSpectrogram,
ComputeDeltas,
Fade,
FrequencyMasking,
GriffinLim,
InverseBarkScale,
InverseMelScale,
InverseSpectrogram,
LFCC,
......@@ -34,13 +37,16 @@ __all__ = [
"FrequencyMasking",
"GriffinLim",
"InverseMelScale",
"InverseBarkScale",
"InverseSpectrogram",
"LFCC",
"Loudness",
"MFCC",
"MVDR",
"MelScale",
"BarkScale",
"MelSpectrogram",
"BarkSpectrogram",
"MuLawDecoding",
"MuLawEncoding",
"PSD",
......
......@@ -649,6 +649,298 @@ class MelSpectrogram(torch.nn.Module):
return mel_specgram
class BarkScale(torch.nn.Module):
    r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
        >>> spectrogram = spectrogram_transform(waveform)
        >>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
        >>> barkscale_spectrogram = barkscale_transform(spectrogram)

    See also:
        :py:func:`torchaudio.functional.barkscale_fbanks` - The function used to
        generate the filter banks.
    """
    __constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]

    def __init__(
        self,
        n_barks: int = 128,
        sample_rate: int = 16000,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        n_stft: int = 201,
        bark_scale: str = "traunmuller",
    ) -> None:
        super(BarkScale, self).__init__()
        self.n_barks = n_barks
        self.sample_rate = sample_rate
        # f_max defaults to the Nyquist frequency of the given sample rate.
        self.f_max = f_max if f_max is not None else float(sample_rate // 2)
        self.f_min = f_min
        self.bark_scale = bark_scale

        if f_min > self.f_max:
            raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))

        fb = F.barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale)
        # Buffer (not Parameter): moves with .to()/.cuda() and is saved in state_dict,
        # but is not trained.
        self.register_buffer("fb", fb)

    def forward(self, specgram: Tensor) -> Tensor:
        r"""
        Args:
            specgram (Tensor): A spectrogram STFT of dimension (..., freq, time).

        Returns:
            Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
        """
        # (..., time, freq) dot (freq, n_barks) -> (..., n_barks, time)
        bark_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)

        return bark_specgram
class InverseBarkScale(torch.nn.Module):
    r"""Estimate a STFT in normal frequency domain from bark frequency domain.

    .. devices:: CPU CUDA

    It minimizes the Euclidean norm between the input bark-spectrogram and the product between
    the estimated spectrogram and the filter banks using SGD.

    Args:
        n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
        tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
        tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
        sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> bark_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
        >>> bark_spectrogram = bark_spectrogram_transform(waveform)
        >>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
        >>> spectrogram = inverse_barkscale_transform(bark_spectrogram)
    """
    __constants__ = [
        "n_stft",
        "n_barks",
        "sample_rate",
        "f_min",
        "f_max",
        "max_iter",
        "tolerance_loss",
        "tolerance_change",
        "sgdargs",
    ]

    def __init__(
        self,
        n_stft: int,
        n_barks: int = 128,
        sample_rate: int = 16000,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        max_iter: int = 100000,
        tolerance_loss: float = 1e-5,
        tolerance_change: float = 1e-8,
        sgdargs: Optional[dict] = None,
        bark_scale: str = "traunmuller",
    ) -> None:
        super(InverseBarkScale, self).__init__()
        self.n_barks = n_barks
        self.sample_rate = sample_rate
        # f_max defaults to the Nyquist frequency.
        self.f_max = f_max or float(sample_rate // 2)
        self.f_min = f_min
        self.max_iter = max_iter
        self.tolerance_loss = tolerance_loss
        self.tolerance_change = tolerance_change
        self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}

        if f_min > self.f_max:
            raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))

        fb = F.barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
        self.register_buffer("fb", fb)

    def forward(self, barkspec: Tensor) -> Tensor:
        r"""
        Args:
            barkspec (Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)

        Returns:
            Tensor: Linear scale spectrogram of size (..., freq, time)
        """
        # pack batch: collapse leading dims into one batch dimension
        shape = barkspec.size()
        barkspec = barkspec.view(-1, shape[-2], shape[-1])

        n_barks, time = shape[-2], shape[-1]
        freq, _ = self.fb.size()  # (freq, n_barks)
        barkspec = barkspec.transpose(-1, -2)
        if self.n_barks != n_barks:
            raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))

        # Random non-negative initialization of the spectrogram to be estimated.
        specgram = torch.rand(
            barkspec.size()[0], time, freq, requires_grad=True, dtype=barkspec.dtype, device=barkspec.device
        )

        optim = torch.optim.SGD([specgram], **self.sgdargs)

        loss = float("inf")
        for _ in range(self.max_iter):
            optim.zero_grad()
            diff = barkspec - specgram.matmul(self.fb)
            new_loss = diff.pow(2).sum(axis=-1).mean()
            # take sum over bark-frequency then average over other dimensions
            # so that loss threshold is applied per unit timeframe
            new_loss.backward()
            optim.step()
            # Project back onto the feasible set (spectrograms are non-negative).
            specgram.data = specgram.data.clamp(min=0)

            new_loss = new_loss.item()
            # Stop when the loss is small enough or has stopped improving.
            if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
                break
            loss = new_loss

        specgram.requires_grad_(False)
        specgram = specgram.clamp(min=0).transpose(-1, -2)

        # unpack batch: restore the original leading dimensions
        specgram = specgram.view(shape[:-2] + (freq, time))
        return specgram
class BarkSpectrogram(torch.nn.Module):
    r"""Create BarkSpectrogram for a raw audio signal.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
    and :py:func:`torchaudio.transforms.BarkScale`.

    Sources
        * https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
        * Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical
          Society of America. Vol. 88, Issue 1, 1990, pp. 97–100.
        * https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html

    Args:
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``None``)
        pad (int, optional): Two sided padding of signal. (Default: ``0``)
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        window_fn (Callable[..., Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        power (float, optional): Exponent for the magnitude spectrogram,
            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
        normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
        wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
        center (bool, optional): whether to pad :attr:`waveform` on both sides so
            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
            (Default: ``True``)
        pad_mode (string, optional): controls the padding method used when
            :attr:`center` is ``True``. (Default: ``"reflect"``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.BarkSpectrogram(sample_rate)
        >>> bark_specgram = transform(waveform)  # (channel, n_barks, time)

    See also:
        :py:func:`torchaudio.functional.barkscale_fbanks` - The function used to
        generate the filter banks.
    """
    __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]

    def __init__(
        self,
        sample_rate: int = 16000,
        n_fft: int = 400,
        win_length: Optional[int] = None,
        hop_length: Optional[int] = None,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        pad: int = 0,
        n_barks: int = 128,
        window_fn: Callable[..., Tensor] = torch.hann_window,
        power: float = 2.0,
        normalized: bool = False,
        wkwargs: Optional[dict] = None,
        center: bool = True,
        pad_mode: str = "reflect",
        bark_scale: str = "traunmuller",
    ) -> None:
        super(BarkSpectrogram, self).__init__()
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length if win_length is not None else n_fft
        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
        self.pad = pad
        self.power = power
        self.normalized = normalized
        self.n_barks = n_barks  # number of bark frequency bins
        self.f_max = f_max
        self.f_min = f_min
        self.spectrogram = Spectrogram(
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length,
            pad=self.pad,
            window_fn=window_fn,
            power=self.power,
            normalized=self.normalized,
            wkwargs=wkwargs,
            center=center,
            pad_mode=pad_mode,
            onesided=True,
        )
        # BarkScale consumes the onesided STFT, hence n_fft // 2 + 1 bins.
        self.bark_scale = BarkScale(
            self.n_barks, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, bark_scale
        )

    def forward(self, waveform: Tensor) -> Tensor:
        r"""
        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
        """
        specgram = self.spectrogram(waveform)
        bark_specgram = self.bark_scale(specgram)
        return bark_specgram
class MFCC(torch.nn.Module):
r"""Create the Mel-frequency cepstrum coefficients from an audio signal.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment