Commit 7819f3f6 authored by Caroline Chen, committed by Facebook GitHub Bot

Move bark spectrogram to prototype (#2843)

Summary:
follow up to https://github.com/pytorch/audio/issues/2823
- move bark spectrogram to prototype
- decrease autograd test tolerance (passing on CircleCI)
- add diagram for bark fbanks

cc jdariasl

Pull Request resolved: https://github.com/pytorch/audio/pull/2843

Reviewed By: nateanl

Differential Revision: D41199522

Pulled By: carolineechen

fbshipit-source-id: 8e6c2e20fb7b14f39477683b3c6ed8356359a213
parent 6e334a46
@@ -17,7 +17,6 @@ Utility
amplitude_to_DB
DB_to_amplitude
melscale_fbanks
barkscale_fbanks
linear_fbanks
create_dct
mask_along_axis
......
@@ -9,6 +9,11 @@ add_noise
.. autofunction:: add_noise
barkscale_fbanks
~~~~~~~~~~~~~~~~
.. autofunction:: barkscale_fbanks
convolve
~~~~~~~~
......
@@ -11,3 +11,6 @@ torchaudio.prototype.transforms
Convolve
FFTConvolve
BarkScale
InverseBarkScale
BarkSpectrogram
@@ -84,9 +84,6 @@ Utility
AmplitudeToDB
MelScale
InverseMelScale
BarkScale
InverseBarkScale
MuLawEncoding
MuLawDecoding
Resample
@@ -104,7 +101,6 @@ Feature Extractions
Spectrogram
InverseSpectrogram
MelSpectrogram
BarkSpectrogram
GriffinLim
MFCC
LFCC
......
@@ -140,17 +140,6 @@ class Functional(TempDirMixin, TestBaseMixin):
norm = "slaney"
self._assert_consistency(F.melscale_fbanks, (n_stft, f_min, f_max, n_mels, sample_rate, norm, "htk"))
def test_barkscale_fbanks(self):
if self.device != torch.device("cpu"):
raise unittest.SkipTest("No need to perform test on device other than CPU")
n_stft = 100
f_min = 0.0
f_max = 20.0
n_barks = 10
sample_rate = 16000
self._assert_consistency(F.barkscale_fbanks, (n_stft, f_min, f_max, n_barks, sample_rate, "traunmuller"))
def test_linear_fbanks(self):
if self.device != torch.device("cpu"):
raise unittest.SkipTest("No need to perform test on device other than CPU")
......
import unittest
import torch
import torchaudio.prototype.functional as F
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin, torch_script
@@ -45,3 +47,14 @@ class TorchScriptConsistencyTestImpl(TestBaseMixin):
snr = torch.rand(*leading_dims, dtype=self.dtype, device=self.device, requires_grad=True) * 10
self._assert_consistency(F.add_noise, (waveform, noise, lengths, snr))
def test_barkscale_fbanks(self):
if self.device != torch.device("cpu"):
raise unittest.SkipTest("No need to perform test on device other than CPU")
n_stft = 100
f_min = 0.0
f_max = 20.0
n_barks = 10
sample_rate = 16000
self._assert_consistency(F.barkscale_fbanks, (n_stft, f_min, f_max, n_barks, sample_rate, "traunmuller"))
@@ -3,7 +3,7 @@ from typing import List
import torch
import torchaudio.prototype.transforms as T
from torch.autograd import gradcheck, gradgradcheck
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin
from torchaudio_unittest.common_utils import get_spectrogram, get_whitenoise, nested_params, TestBaseMixin
class Autograd(TestBaseMixin):
@@ -38,3 +38,21 @@ class Autograd(TestBaseMixin):
y = torch.rand(*leading_dims, L_y, dtype=self.dtype, device=self.device)
convolve = cls(mode=mode).to(dtype=self.dtype, device=self.device)
self.assert_grad(convolve, [x, y])
def test_barkspectrogram(self):
# replication_pad1d_backward_cuda is not deterministic and
# gives a very small (~1e-16) difference.
sample_rate = 8000
transform = T.BarkSpectrogram(sample_rate=sample_rate)
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
self.assert_grad(transform, [waveform], nondet_tol=1e-10)
def test_barkscale(self):
sample_rate = 8000
n_fft = 400
n_barks = n_fft // 2 + 1
transform = T.BarkScale(sample_rate=sample_rate, n_barks=n_barks)
spec = get_spectrogram(
get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2), n_fft=n_fft, power=1
)
self.assert_grad(transform, [spec])
import os
import torch
import torchaudio.prototype.transforms as T
import torchaudio.transforms as transforms
from torchaudio_unittest.common_utils import nested_params, TorchaudioTestCase
class BatchConsistencyTest(TorchaudioTestCase):
def assert_batch_consistency(self, transform, batch, *args, atol=1e-8, rtol=1e-5, seed=42, **kwargs):
n = batch.size(0)
# Compute items separately, then batch the result
torch.random.manual_seed(seed)
items_input = batch.clone()
items_result = torch.stack([transform(items_input[i], *args, **kwargs) for i in range(n)])
# Batch the input and run
torch.random.manual_seed(seed)
batch_input = batch.clone()
batch_result = transform(batch_input, *args, **kwargs)
self.assertEqual(items_input, batch_input, rtol=rtol, atol=atol)
self.assertEqual(items_result, batch_result, rtol=rtol, atol=atol)
@nested_params(
[T.Convolve, T.FFTConvolve],
["full", "valid", "same"],
@@ -26,3 +45,21 @@ class BatchConsistencyTest(TorchaudioTestCase):
)
self.assertEqual(expected, actual)
def test_batch_BarkScale(self):
specgram = torch.randn(3, 2, 201, 256)
atol = 1e-6 if os.name == "nt" else 1e-8
transform = T.BarkScale()
self.assert_batch_consistency(transform, specgram, atol=atol)
def test_batch_InverseBarkScale(self):
n_barks = 32
n_stft = 5
bark_spec = torch.randn(3, 2, n_barks, 32) ** 2
transform = T.InverseBarkScale(n_stft, n_barks)
# Because InverseBarkScale runs SGD on randomly initialized values, the results do not
# match exactly. For this reason, the tolerance is very relaxed here.
self.assert_batch_consistency(transform, bark_spec, atol=1.0, rtol=1e-5)
@@ -2,7 +2,11 @@ import numpy as np
import torch
import torchaudio.prototype.transforms as T
from scipy import signal
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin
from torchaudio_unittest.common_utils import get_spectrogram, get_whitenoise, nested_params, TestBaseMixin
def _get_ratio(mat):
return (mat.sum() / mat.numel()).item()
class TransformsTestImpl(TestBaseMixin):
@@ -52,3 +56,46 @@ class TransformsTestImpl(TestBaseMixin):
expected = torch.tensor(expected)
self.assertEqual(expected, actual)
def test_InverseBarkScale(self):
"""Gauge the quality of InverseBarkScale transform.
As InverseBarkScale is currently implemented with
random initialization + iterative optimization,
it is not practically possible to assert the difference between
the estimated spectrogram and the original spectrogram as a whole.
Estimated spectrogram has very huge descrepency locally.
Thus in this test we gauge what percentage of elements are bellow
certain tolerance.
At the moment, the quality of estimated spectrogram is worse than the
one obtained for Inverse MelScale.
When implementation is changed in a way it makes the quality even worse,
this test will fail.
"""
n_fft = 400
power = 1
n_barks = 64
sample_rate = 8000
n_stft = n_fft // 2 + 1
# Generate reference spectrogram and input bark-scaled spectrogram
expected = get_spectrogram(
get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2), n_fft=n_fft, power=power
).to(self.device, self.dtype)
input = T.BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_stft).to(self.device, self.dtype)(
expected
)
# Run transform
transform = T.InverseBarkScale(n_stft, n_barks=n_barks, sample_rate=sample_rate).to(self.device, self.dtype)
result = transform(input)
# Compare
epsilon = 1e-60
relative_diff = torch.abs((result - expected) / (expected + epsilon))
for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
print(f"Ratio of relative diff smaller than {tol:e} is " f"{_get_ratio(relative_diff < tol)}")
assert _get_ratio(relative_diff < 1e-1) > 0.2
assert _get_ratio(relative_diff < 1e-3) > 2e-3
@@ -90,14 +90,6 @@ class AutogradTestMixin(TestBaseMixin):
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
self.assert_grad(transform, [waveform], nondet_tol=1e-10)
def test_barkspectrogram(self):
# replication_pad1d_backward_cuda is not deterministic and
# gives a very small (~1e-16) difference.
sample_rate = 8000
transform = T.BarkSpectrogram(sample_rate=sample_rate)
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
self.assert_grad(transform, [waveform], nondet_tol=1e-10)
@nested_params(
[0, 0.99],
[False, True],
@@ -202,16 +194,6 @@ class AutogradTestMixin(TestBaseMixin):
)
self.assert_grad(transform, [spec])
def test_barkscale(self):
sample_rate = 8000
n_fft = 400
n_barks = n_fft // 2 + 1
transform = T.BarkScale(sample_rate=sample_rate, n_barks=n_barks)
spec = get_spectrogram(
get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2), n_fft=n_fft, power=1
)
self.assert_grad(transform, [spec])
@parameterized.expand([(1.5, "amplitude"), (2, "power"), (10, "db")])
def test_vol(self, gain, gain_type):
sample_rate = 8000
......
@@ -58,24 +58,6 @@ class TestTransforms(common_utils.TorchaudioTestCase):
# exactly same result. For this reason, tolerance is very relaxed here.
self.assert_batch_consistency(transform, mel_spec, atol=1.0, rtol=1e-5)
def test_batch_BarkScale(self):
specgram = torch.randn(3, 2, 201, 256)
atol = 1e-4 if os.name == "nt" else 1e-6
transform = T.BarkScale()
self.assert_batch_consistency(transform, specgram, atol=atol)
def test_batch_InverseBarkScale(self):
n_barks = 32
n_stft = 5
bark_spec = torch.randn(3, 2, n_barks, 32) ** 2
transform = T.InverseMelScale(n_stft, n_barks)
# Because InverseBarkScale runs SGD on randomly initialized values, the results do not
# match exactly. For this reason, the tolerance is very relaxed here.
self.assert_batch_consistency(transform, bark_spec, atol=1.0, rtol=1e-5)
def test_batch_compute_deltas(self):
specgram = torch.randn(3, 2, 31, 2786)
transform = T.ComputeDeltas()
......
@@ -131,84 +131,6 @@ class Tester(common_utils.TorchaudioTestCase):
self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.0).all())
self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_barkscale_load_save(self):
specgram = torch.ones(1, 201, 100)
barkscale_transform = transforms.BarkScale()
barkscale_transform(specgram)
barkscale_transform_copy = transforms.BarkScale()
barkscale_transform_copy.load_state_dict(barkscale_transform.state_dict())
fb = barkscale_transform.fb
fb_copy = barkscale_transform_copy.fb
self.assertEqual(fb_copy.size(), (201, 128))
self.assertEqual(fb, fb_copy)
def test_barkspectrogram_load_save(self):
waveform = self.waveform.float()
bark_spectrogram_transform = transforms.BarkSpectrogram()
bark_spectrogram_transform(waveform)
bark_spectrogram_transform_copy = transforms.BarkSpectrogram()
bark_spectrogram_transform_copy.load_state_dict(bark_spectrogram_transform.state_dict())
window = bark_spectrogram_transform.spectrogram.window
window_copy = bark_spectrogram_transform_copy.spectrogram.window
fb = bark_spectrogram_transform.bark_scale.fb
fb_copy = bark_spectrogram_transform_copy.bark_scale.fb
self.assertEqual(window, window_copy)
# the defaults are n_fft = 400 and n_barks = 128
self.assertEqual(fb_copy.size(), (201, 128))
self.assertEqual(fb, fb_copy)
def test_bark2(self):
top_db = 80.0
s2db = transforms.AmplitudeToDB("power", top_db)
waveform = self.waveform.clone() # (1, 16000)
waveform_scaled = self.scale(waveform) # (1, 16000)
bark_transform = transforms.BarkSpectrogram()
# check defaults
spectrogram_torch = s2db(bark_transform(waveform_scaled)) # (1, 128, 321)
self.assertTrue(spectrogram_torch.dim() == 3)
self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
self.assertEqual(spectrogram_torch.size(1), bark_transform.n_barks)
# check correctness of filterbank conversion matrix
self.assertTrue(bark_transform.bark_scale.fb.sum(1).le(1.0).all())
self.assertTrue(bark_transform.bark_scale.fb.sum(1).ge(0.0).all())
# check options
kwargs = {
"window_fn": torch.hamming_window,
"pad": 10,
"win_length": 500,
"hop_length": 125,
"n_fft": 800,
"n_barks": 50,
}
bark_transform2 = transforms.BarkSpectrogram(**kwargs)
spectrogram2_torch = s2db(bark_transform2(waveform_scaled)) # (1, 50, 513)
self.assertTrue(spectrogram2_torch.dim() == 3)
self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
self.assertEqual(spectrogram2_torch.size(1), bark_transform2.n_barks)
self.assertTrue(bark_transform2.bark_scale.fb.sum(1).le(1.0).all())
self.assertTrue(bark_transform2.bark_scale.fb.sum(1).ge(0.0).all())
# check on multi-channel audio
filepath = common_utils.get_asset_path("steam-train-whistle-daniel_simon.wav")
x_stereo = common_utils.load_wav(filepath)[0] # (2, 278756), 44100
spectrogram_stereo = s2db(bark_transform(x_stereo)) # (2, 128, 1394)
self.assertTrue(spectrogram_stereo.dim() == 3)
self.assertTrue(spectrogram_stereo.size(0) == 2)
self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
self.assertEqual(spectrogram_stereo.size(1), bark_transform.n_barks)
# check filterbank matrix creation
fb_matrix_transform = transforms.BarkScale(n_barks=100, sample_rate=16000, f_min=0.0, f_max=None, n_stft=400)
self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.0).all())
self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.0).all())
self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_mfcc_defaults(self):
"""Check the default configuration of the MFCC transform."""
sample_rate = 16000
@@ -374,9 +296,3 @@ class SmokeTest(common_utils.TorchaudioTestCase):
specgram = melspecgram.spectrogram
self.assertEqual(specgram.center, True)
self.assertEqual(specgram.pad_mode, "reflect")
def test_barkspectrogram(self):
barkspecgram = transforms.BarkSpectrogram(center=True, pad_mode="reflect")
specgram = barkspecgram.spectrogram
self.assertEqual(specgram.center, True)
self.assertEqual(specgram.pad_mode, "reflect")
@@ -52,49 +52,6 @@ class TransformsTestBase(TestBaseMixin):
assert _get_ratio(relative_diff < 1e-3) > 5e-3
assert _get_ratio(relative_diff < 1e-5) > 1e-5
def test_InverseBarkScale(self):
"""Gauge the quality of InverseBarkScale transform.
As InverseBarkScale is currently implemented with
random initialization + iterative optimization,
it is not practically possible to assert the difference between
the estimated spectrogram and the original spectrogram as a whole.
Estimated spectrogram has very huge descrepency locally.
Thus in this test we gauge what percentage of elements are bellow
certain tolerance.
At the moment, the quality of estimated spectrogram is worse than the
one obtained for Inverse MelScale.
When implementation is changed in a way it makes the quality even worse,
this test will fail.
"""
n_fft = 400
power = 1
n_barks = 64
sample_rate = 8000
n_stft = n_fft // 2 + 1
# Generate reference spectrogram and input bark-scaled spectrogram
expected = get_spectrogram(
get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2), n_fft=n_fft, power=power
).to(self.device, self.dtype)
input = T.BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_stft).to(self.device, self.dtype)(
expected
)
# Run transform
transform = T.InverseBarkScale(n_stft, n_barks=n_barks, sample_rate=sample_rate).to(self.device, self.dtype)
result = transform(input)
# Compare
epsilon = 1e-60
relative_diff = torch.abs((result - expected) / (expected + epsilon))
for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
print(f"Ratio of relative diff smaller than {tol:e} is " f"{_get_ratio(relative_diff < tol)}")
assert _get_ratio(relative_diff < 1e-1) > 0.2
assert _get_ratio(relative_diff < 1e-3) > 2e-3
@nested_params(
["sinc_interpolation", "kaiser_window"],
[16000, 44100],
......
@@ -26,7 +26,6 @@ from .functional import (
amplitude_to_DB,
apply_beamforming,
apply_codec,
barkscale_fbanks,
compute_deltas,
compute_kaldi_pitch,
create_dct,
@@ -62,7 +61,6 @@ __all__ = [
"compute_kaldi_pitch",
"create_dct",
"melscale_fbanks",
"barkscale_fbanks",
"linear_fbanks",
"DB_to_amplitude",
"loudness",
......
@@ -22,7 +22,6 @@ __all__ = [
"compute_deltas",
"compute_kaldi_pitch",
"melscale_fbanks",
"barkscale_fbanks",
"linear_fbanks",
"create_dct",
"compute_deltas",
@@ -480,121 +479,6 @@ def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor:
return freqs
def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
r"""Convert Hz to Barks.
Args:
freqs (float): Frequencies in Hz
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
barks (float): Frequency in Barks
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
if bark_scale == "wang":
return 6.0 * math.asinh(freqs / 600.0)
elif bark_scale == "schroeder":
return 7.0 * math.asinh(freqs / 650.0)
# Traunmuller Bark scale
barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
# Bark value correction
if barks < 2:
barks += 0.15 * (2 - barks)
elif barks > 20.1:
barks += 0.22 * (barks - 20.1)
return barks
def _bark_to_hz(barks: Tensor, bark_scale: str = "traunmuller") -> Tensor:
"""Convert bark bin numbers to frequencies.
Args:
barks (Tensor): Bark frequencies
bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
freqs (Tensor): Barks converted in Hz
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
if bark_scale == "wang":
return 600.0 * torch.sinh(barks / 6.0)
elif bark_scale == "schroeder":
return 650.0 * torch.sinh(barks / 7.0)
# Bark value correction
if any(barks < 2):
idx = barks < 2
barks[idx] = (barks[idx] - 0.3) / 0.85
elif any(barks > 20.1):
idx = barks > 20.1
barks[idx] = (barks[idx] + 4.422) / 1.22
# Traunmuller Bark scale
freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
return freqs
def barkscale_fbanks(
n_freqs: int,
f_min: float,
f_max: float,
n_barks: int,
sample_rate: int,
bark_scale: str = "traunmuller",
) -> Tensor:
r"""Create a frequency bin conversion matrix.
.. devices:: CPU
.. properties:: TorchScript
Args:
n_freqs (int): Number of frequencies to highlight/apply
f_min (float): Minimum frequency (Hz)
f_max (float): Maximum frequency (Hz)
n_barks (int): Number of bark filterbanks
sample_rate (int): Sample rate of the audio waveform
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``),
i.e., the number of frequencies to highlight/apply by the number of filterbanks.
Each column is a filterbank so that assuming there is a matrix A of
size (..., ``n_freqs``), the applied result would be
``A * barkscale_fbanks(A.size(-1), ...)``.
"""
# freq bins
all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
# calculate bark freq bins
m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
m_pts = torch.linspace(m_min, m_max, n_barks + 2)
f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
# create filterbank
fb = _create_triangular_filterbank(all_freqs, f_pts)
if (fb.max(dim=0).values == 0.0).any():
warnings.warn(
"At least one bark filterbank has all zero values. "
f"The value for `n_barks` ({n_barks}) may be set too high. "
f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
)
return fb
def _create_triangular_filterbank(
all_freqs: Tensor,
f_pts: Tensor,
......
from .functional import add_noise, convolve, fftconvolve
from .functional import add_noise, barkscale_fbanks, convolve, fftconvolve
__all__ = ["add_noise", "convolve", "fftconvolve"]
__all__ = ["add_noise", "barkscale_fbanks", "convolve", "fftconvolve"]
import math
import warnings
import torch
from torchaudio.functional.functional import _create_triangular_filterbank
def _check_convolve_mode(mode: str) -> None:
valid_convolve_modes = ["full", "valid", "same"]
@@ -172,3 +177,121 @@ def add_noise(waveform: torch.Tensor, noise: torch.Tensor, lengths: torch.Tensor
scaled_noise = scale.unsqueeze(-1) * noise # (*, 1) * (*, L) = (*, L)
return waveform + scaled_noise # (*, L)
def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
r"""Convert Hz to Barks.
Args:
freqs (float): Frequencies in Hz
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
barks (float): Frequency in Barks
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
if bark_scale == "wang":
return 6.0 * math.asinh(freqs / 600.0)
elif bark_scale == "schroeder":
return 7.0 * math.asinh(freqs / 650.0)
# Traunmuller Bark scale
barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
# Bark value correction
if barks < 2:
barks += 0.15 * (2 - barks)
elif barks > 20.1:
barks += 0.22 * (barks - 20.1)
return barks
def _bark_to_hz(barks: torch.Tensor, bark_scale: str = "traunmuller") -> torch.Tensor:
"""Convert bark bin numbers to frequencies.
Args:
barks (torch.Tensor): Bark frequencies
bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
freqs (torch.Tensor): Barks converted in Hz
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
if bark_scale == "wang":
return 600.0 * torch.sinh(barks / 6.0)
elif bark_scale == "schroeder":
return 650.0 * torch.sinh(barks / 7.0)
# Bark value correction
if any(barks < 2):
idx = barks < 2
barks[idx] = (barks[idx] - 0.3) / 0.85
elif any(barks > 20.1):
idx = barks > 20.1
barks[idx] = (barks[idx] + 4.422) / 1.22
# Traunmuller Bark scale
freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
return freqs
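As a quick illustrative check of the Traunmüller pair above (editorial sketch, not part of this commit), round-tripping a mid-range frequency where neither correction branch applies should recover the input closely. The snippet assumes the private helpers stay importable from torchaudio.prototype.functional.functional, which is an implementation detail rather than a public API.
import torch
from torchaudio.prototype.functional.functional import _bark_to_hz, _hz_to_bark
freq_hz = 1000.0
bark = _hz_to_bark(freq_hz, bark_scale="traunmuller")  # scalar Bark value, no correction branch taken
roundtrip = _bark_to_hz(torch.tensor([bark]), bark_scale="traunmuller")
print(bark, roundtrip)  # roundtrip should be close to 1000.0 Hz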
def barkscale_fbanks(
n_freqs: int,
f_min: float,
f_max: float,
n_barks: int,
sample_rate: int,
bark_scale: str = "traunmuller",
) -> torch.Tensor:
r"""Create a frequency bin conversion matrix.
.. devices:: CPU
.. properties:: TorchScript
.. image:: https://download.pytorch.org/torchaudio/doc-assets/bark_fbanks.png
:alt: Visualization of generated filter bank
Args:
n_freqs (int): Number of frequencies to highlight/apply
f_min (float): Minimum frequency (Hz)
f_max (float): Maximum frequency (Hz)
n_barks (int): Number of bark filterbanks
sample_rate (int): Sample rate of the audio waveform
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
torch.Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``),
i.e., the number of frequencies to highlight/apply by the number of filterbanks.
Each column is a filterbank so that assuming there is a matrix A of
size (..., ``n_freqs``), the applied result would be
``A * barkscale_fbanks(A.size(-1), ...)``.
"""
# freq bins
all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
# calculate bark freq bins
m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
m_pts = torch.linspace(m_min, m_max, n_barks + 2)
f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
# create filterbank
fb = _create_triangular_filterbank(all_freqs, f_pts)
if (fb.max(dim=0).values == 0.0).any():
warnings.warn(
"At least one bark filterbank has all zero values. "
f"The value for `n_barks` ({n_barks}) may be set too high. "
f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
)
return fb
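For orientation, here is a short editorial sketch (not part of the diff) of how the returned matrix is applied, exactly as the docstring describes: a spectrogram with ``n_freqs`` bins is matrix-multiplied with the (``n_freqs``, ``n_barks``) filter bank. It assumes the function is exposed as torchaudio.prototype.functional.barkscale_fbanks, matching the __init__ change above.
import torch
import torchaudio
from torchaudio.prototype.functional import barkscale_fbanks
sample_rate, n_fft = 16000, 400
n_freqs = n_fft // 2 + 1  # 201 bins from a one-sided STFT
waveform = torch.randn(1, sample_rate)  # stand-in one-second signal
spec = torchaudio.transforms.Spectrogram(n_fft=n_fft)(waveform)  # (1, 201, time)
fb = barkscale_fbanks(n_freqs, 0.0, sample_rate / 2, 32, sample_rate, "traunmuller")  # (201, 32)
# (1, time, 201) @ (201, 32) -> (1, time, 32) -> (1, 32, time)
bark_spec = torch.matmul(spec.transpose(-1, -2), fb).transpose(-1, -2)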
from ._transforms import Convolve, FFTConvolve
from ._transforms import BarkScale, BarkSpectrogram, Convolve, FFTConvolve, InverseBarkScale
__all__ = ["Convolve", "FFTConvolve"]
__all__ = [
"BarkScale",
"BarkSpectrogram",
"Convolve",
"FFTConvolve",
"InverseBarkScale",
]
from typing import Callable, Optional
import torch
from torchaudio.prototype.functional import convolve, fftconvolve
from torchaudio.prototype.functional import barkscale_fbanks, convolve, fftconvolve
from torchaudio.prototype.functional.functional import _check_convolve_mode
from torchaudio.transforms import Spectrogram
class Convolve(torch.nn.Module):
@@ -89,3 +92,295 @@ class FFTConvolve(torch.nn.Module):
the leading dimensions match those of ``x`` and `L` is dictated by ``mode``.
"""
return fftconvolve(x, y, mode=self.mode)
class BarkScale(torch.nn.Module):
r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
f_min (float, optional): Minimum frequency. (Default: ``0.``)
f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
>>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
>>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
>>> spectrogram = spectrogram_transform(waveform)
>>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
>>> barkscale_spectrogram = barkscale_transform(spectrogram)
See also:
:py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
generate the filter banks.
"""
__constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]
def __init__(
self,
n_barks: int = 128,
sample_rate: int = 16000,
f_min: float = 0.0,
f_max: Optional[float] = None,
n_stft: int = 201,
bark_scale: str = "traunmuller",
) -> None:
super(BarkScale, self).__init__()
self.n_barks = n_barks
self.sample_rate = sample_rate
self.f_max = f_max if f_max is not None else float(sample_rate // 2)
self.f_min = f_min
self.bark_scale = bark_scale
if f_min > self.f_max:
raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale)
self.register_buffer("fb", fb)
def forward(self, specgram: torch.Tensor) -> torch.Tensor:
r"""
Args:
specgram (torch.Tensor): A spectrogram STFT of dimension (..., freq, time).
Returns:
torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
"""
# (..., time, freq) dot (freq, n_barks) -> (..., n_barks, time)
bark_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
return bark_specgram
class InverseBarkScale(torch.nn.Module):
r"""Estimate a STFT in normal frequency domain from bark frequency domain.
.. devices:: CPU CUDA
It minimizes the Euclidean norm between the input bark-spectrogram and the product of
the estimated spectrogram and the filter banks using SGD.
Args:
n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
f_min (float, optional): Minimum frequency. (Default: ``0.``)
f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
>>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
>>> bark_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
>>> bark_spectrogram = bark_spectrogram_transform(waveform)
>>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
>>> spectrogram = inverse_barkscale_transform(bark_spectrogram)
"""
__constants__ = [
"n_stft",
"n_barks",
"sample_rate",
"f_min",
"f_max",
"max_iter",
"tolerance_loss",
"tolerance_change",
"sgdargs",
]
def __init__(
self,
n_stft: int,
n_barks: int = 128,
sample_rate: int = 16000,
f_min: float = 0.0,
f_max: Optional[float] = None,
max_iter: int = 100000,
tolerance_loss: float = 1e-5,
tolerance_change: float = 1e-8,
sgdargs: Optional[dict] = None,
bark_scale: str = "traunmuller",
) -> None:
super(InverseBarkScale, self).__init__()
self.n_barks = n_barks
self.sample_rate = sample_rate
self.f_max = f_max or float(sample_rate // 2)
self.f_min = f_min
self.max_iter = max_iter
self.tolerance_loss = tolerance_loss
self.tolerance_change = tolerance_change
self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
if f_min > self.f_max:
raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
self.register_buffer("fb", fb)
def forward(self, barkspec: torch.Tensor) -> torch.Tensor:
r"""
Args:
barkspec (torch.Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)
Returns:
torch.Tensor: Linear scale spectrogram of size (..., freq, time)
"""
# pack batch
shape = barkspec.size()
barkspec = barkspec.view(-1, shape[-2], shape[-1])
n_barks, time = shape[-2], shape[-1]
freq, _ = self.fb.size()  # (freq, n_barks)
barkspec = barkspec.transpose(-1, -2)
if self.n_barks != n_barks:
raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))
specgram = torch.rand(
barkspec.size()[0], time, freq, requires_grad=True, dtype=barkspec.dtype, device=barkspec.device
)
optim = torch.optim.SGD([specgram], **self.sgdargs)
loss = float("inf")
for _ in range(self.max_iter):
optim.zero_grad()
diff = barkspec - specgram.matmul(self.fb)
new_loss = diff.pow(2).sum(axis=-1).mean()
# take the sum over the bark-frequency axis, then average over the other dimensions
# so that the loss threshold is applied per unit time frame
new_loss.backward()
optim.step()
specgram.data = specgram.data.clamp(min=0)
new_loss = new_loss.item()
if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
break
loss = new_loss
specgram.requires_grad_(False)
specgram = specgram.clamp(min=0).transpose(-1, -2)
# unpack batch
specgram = specgram.view(shape[:-2] + (freq, time))
return specgram
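A minimal round-trip sketch (editorial, mirroring the quality test earlier in this diff) of how the estimate can be evaluated; max_iter is lowered here only to keep the example fast, so the recovered quality will be rougher than with the defaults, and the exact numbers depend on the random initialization.
import torch
from torchaudio.prototype.transforms import BarkScale, InverseBarkScale
n_fft, n_barks, sample_rate = 400, 64, 8000
n_stft = n_fft // 2 + 1
spec = torch.rand(2, n_stft, 50)  # stand-in magnitude spectrogram, (channel, freq, time)
bark_spec = BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_stft)(spec)
recovered = InverseBarkScale(n_stft, n_barks=n_barks, sample_rate=sample_rate, max_iter=1000)(bark_spec)
rel_diff = ((recovered - spec) / (spec + 1e-60)).abs()
print((rel_diff < 1e-1).float().mean().item())  # fraction of entries within 10% relative error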
class BarkSpectrogram(torch.nn.Module):
r"""Create BarkSpectrogram for a raw audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and
:py:func:`torchaudio.prototype.transforms.BarkScale`.
Sources
* https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
* Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical Society of America, Vol. 88, Issue 1, 1990, pp. 97–100.
* https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html
Args:
sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
win_length (int or None, optional): Window size. (Default: ``n_fft``)
hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
f_min (float, optional): Minimum frequency. (Default: ``0.``)
f_max (float or None, optional): Maximum frequency. (Default: ``None``)
pad (int, optional): Two sided padding of signal. (Default: ``0``)
n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
power (float, optional): Exponent for the magnitude spectrogram,
(must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
center (bool, optional): whether to pad :attr:`waveform` on both sides so
that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
(Default: ``True``)
pad_mode (string, optional): controls the padding method used when
:attr:`center` is ``True``. (Default: ``"reflect"``)
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
>>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
>>> transform = transforms.BarkSpectrogram(sample_rate)
>>> bark_specgram = transform(waveform) # (channel, n_barks, time)
See also:
:py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
generate the filter banks.
"""
__constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]
def __init__(
self,
sample_rate: int = 16000,
n_fft: int = 400,
win_length: Optional[int] = None,
hop_length: Optional[int] = None,
f_min: float = 0.0,
f_max: Optional[float] = None,
pad: int = 0,
n_barks: int = 128,
window_fn: Callable[..., torch.Tensor] = torch.hann_window,
power: float = 2.0,
normalized: bool = False,
wkwargs: Optional[dict] = None,
center: bool = True,
pad_mode: str = "reflect",
bark_scale: str = "traunmuller",
) -> None:
super(BarkSpectrogram, self).__init__()
self.sample_rate = sample_rate
self.n_fft = n_fft
self.win_length = win_length if win_length is not None else n_fft
self.hop_length = hop_length if hop_length is not None else self.win_length // 2
self.pad = pad
self.power = power
self.normalized = normalized
self.n_barks = n_barks # number of bark frequency bins
self.f_max = f_max
self.f_min = f_min
self.spectrogram = Spectrogram(
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length,
pad=self.pad,
window_fn=window_fn,
power=self.power,
normalized=self.normalized,
wkwargs=wkwargs,
center=center,
pad_mode=pad_mode,
onesided=True,
)
self.bark_scale = BarkScale(
self.n_barks, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, bark_scale
)
def forward(self, waveform: torch.Tensor) -> torch.Tensor:
r"""
Args:
waveform (torch.Tensor): torch.Tensor of audio of dimension (..., time).
Returns:
torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
"""
specgram = self.spectrogram(waveform)
bark_specgram = self.bark_scale(specgram)
return bark_specgram
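Since the docstring states that BarkSpectrogram composes Spectrogram and BarkScale, a short editorial sketch (not part of the diff) can make the equivalence concrete; with matching parameters the fused transform and the explicit two-step pipeline should agree.
import torch
from torchaudio.transforms import Spectrogram
from torchaudio.prototype.transforms import BarkScale, BarkSpectrogram
sample_rate, n_fft, n_barks = 16000, 400, 64
waveform = torch.randn(1, sample_rate)  # stand-in one-second signal
fused = BarkSpectrogram(sample_rate=sample_rate, n_fft=n_fft, n_barks=n_barks)(waveform)
spec = Spectrogram(n_fft=n_fft)(waveform)
composed = BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_fft // 2 + 1)(spec)
print(torch.allclose(fused, composed))  # expected: True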
from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
from ._transforms import (
AmplitudeToDB,
BarkScale,
BarkSpectrogram,
ComputeDeltas,
Fade,
FrequencyMasking,
GriffinLim,
InverseBarkScale,
InverseMelScale,
InverseSpectrogram,
LFCC,
@@ -37,16 +34,13 @@ __all__ = [
"FrequencyMasking",
"GriffinLim",
"InverseMelScale",
"InverseBarkScale",
"InverseSpectrogram",
"LFCC",
"Loudness",
"MFCC",
"MVDR",
"MelScale",
"BarkScale",
"MelSpectrogram",
"BarkSpectrogram",
"MuLawDecoding",
"MuLawEncoding",
"PSD",
......