Commit b326bc49 authored by Julián D. Arias-Londoño's avatar Julián D. Arias-Londoño Committed by Facebook GitHub Bot
Browse files

BarkSpectrogram (#2823)

Summary:
I have added the BarkScale transform, which can convert a regular Spectrogram into a Bark-scale spectrogram, similar to MelScale. ahmed-fau requested this feature in December 2021 (https://github.com/pytorch/audio/issues/2103). The new functionality includes three different well-known approximations of the Bark scale.

Pull Request resolved: https://github.com/pytorch/audio/pull/2823

Reviewed By: nateanl

Differential Revision: D41162100

Pulled By: carolineechen

fbshipit-source-id: b2670c4972e49c9ef424da5d5982576f7a4df831
parent 74f9a894
......@@ -17,6 +17,7 @@ Utility
amplitude_to_DB
DB_to_amplitude
melscale_fbanks
barkscale_fbanks
linear_fbanks
create_dct
mask_along_axis
......
......@@ -85,6 +85,8 @@ Utility
AmplitudeToDB
MelScale
InverseMelScale
BarkScale
InverseBarkScale
MuLawEncoding
MuLawDecoding
Resample
......@@ -102,6 +104,7 @@ Feature Extractions
Spectrogram
InverseSpectrogram
MelSpectrogram
BarkSpectrogram
GriffinLim
MFCC
LFCC
......
......@@ -140,6 +140,17 @@ class Functional(TempDirMixin, TestBaseMixin):
norm = "slaney"
self._assert_consistency(F.melscale_fbanks, (n_stft, f_min, f_max, n_mels, sample_rate, norm, "htk"))
def test_barkscale_fbanks(self):
    """TorchScript consistency check for ``F.barkscale_fbanks`` (CPU only)."""
    if self.device != torch.device("cpu"):
        raise unittest.SkipTest("No need to perform test on device other than CPU")
    # Positional args: (n_freqs, f_min, f_max, n_barks, sample_rate, bark_scale)
    args = (100, 0.0, 20.0, 10, 16000, "traunmuller")
    self._assert_consistency(F.barkscale_fbanks, args)
def test_linear_fbanks(self):
if self.device != torch.device("cpu"):
raise unittest.SkipTest("No need to perform test on device other than CPU")
......
......@@ -90,6 +90,14 @@ class AutogradTestMixin(TestBaseMixin):
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
self.assert_grad(transform, [waveform], nondet_tol=1e-10)
def test_barkspectrogram(self):
    """Autograd check for T.BarkSpectrogram on multi-channel white noise."""
    # replication_pad1d_backward_cuda is not deterministic and
    # gives a very small (~e-16) difference, hence the nondet tolerance.
    sample_rate = 8000
    transform = T.BarkSpectrogram(sample_rate=sample_rate)
    waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
    self.assert_grad(transform, [waveform], nondet_tol=1e-10)
@nested_params(
[0, 0.99],
[False, True],
......@@ -194,6 +202,16 @@ class AutogradTestMixin(TestBaseMixin):
)
self.assert_grad(transform, [spec])
def test_barkscale(self):
    """Autograd check for T.BarkScale applied to a magnitude spectrogram."""
    rate = 8000
    fft_size = 400
    num_barks = fft_size // 2 + 1
    noise = get_whitenoise(sample_rate=rate, duration=0.05, n_channels=2)
    spec = get_spectrogram(noise, n_fft=fft_size, power=1)
    transform = T.BarkScale(sample_rate=rate, n_barks=num_barks)
    self.assert_grad(transform, [spec])
@parameterized.expand([(1.5, "amplitude"), (2, "power"), (10, "db")])
def test_vol(self, gain, gain_type):
sample_rate = 8000
......
......@@ -58,6 +58,24 @@ class TestTransforms(common_utils.TorchaudioTestCase):
# exactly same result. For this reason, tolerance is very relaxed here.
self.assert_batch_consistency(transform, mel_spec, atol=1.0, rtol=1e-5)
def test_batch_BarkScale(self):
    """Batched and per-item BarkScale outputs must agree."""
    spec = torch.randn(3, 2, 201, 256)
    # Windows builds need a looser tolerance.
    tolerance = 1e-4 if os.name == "nt" else 1e-6
    self.assert_batch_consistency(T.BarkScale(), spec, atol=tolerance)
def test_batch_InverseBarkScale(self):
    """Batched and per-item InverseBarkScale outputs must roughly agree."""
    n_barks = 32
    n_stft = 5
    bark_spec = torch.randn(3, 2, n_barks, 32) ** 2
    # The transform under test is InverseBarkScale (the original code
    # instantiated InverseMelScale, so the Bark variant was never exercised).
    transform = T.InverseBarkScale(n_stft, n_barks)
    # InverseBarkScale runs SGD from random initialization, so repeated runs
    # do not yield exactly the same result; tolerance is very relaxed here.
    self.assert_batch_consistency(transform, bark_spec, atol=1.0, rtol=1e-5)
def test_batch_compute_deltas(self):
specgram = torch.randn(3, 2, 31, 2786)
transform = T.ComputeDeltas()
......
......@@ -131,6 +131,84 @@ class Tester(common_utils.TorchaudioTestCase):
self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.0).all())
self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_barkscale_load_save(self):
    """The BarkScale filterbank buffer must survive a state_dict round trip."""
    original = transforms.BarkScale()
    original(torch.ones(1, 201, 100))
    restored = transforms.BarkScale()
    restored.load_state_dict(original.state_dict())
    # Defaults: 201 STFT bins x 128 bark bins.
    self.assertEqual(restored.fb.size(), (201, 128))
    self.assertEqual(original.fb, restored.fb)
def test_barkspectrogram_load_save(self):
    """Window and filterbank buffers must survive a state_dict round trip."""
    waveform = self.waveform.float()
    bark_spectrogram_transform = transforms.BarkSpectrogram()
    bark_spectrogram_transform(waveform)
    bark_spectrogram_transform_copy = transforms.BarkSpectrogram()
    bark_spectrogram_transform_copy.load_state_dict(bark_spectrogram_transform.state_dict())

    window = bark_spectrogram_transform.spectrogram.window
    window_copy = bark_spectrogram_transform_copy.spectrogram.window

    fb = bark_spectrogram_transform.bark_scale.fb
    fb_copy = bark_spectrogram_transform_copy.bark_scale.fb
    self.assertEqual(window, window_copy)
    # fb shape for the defaults: n_fft = 400 -> 201 freq bins, n_barks = 128
    self.assertEqual(fb_copy.size(), (201, 128))
    self.assertEqual(fb, fb_copy)
def test_bark2(self):
    """End-to-end checks of BarkSpectrogram: defaults, custom options,
    multi-channel input, and filterbank construction."""
    top_db = 80.0
    s2db = transforms.AmplitudeToDB("power", top_db)

    waveform = self.waveform.clone()  # (1, 16000)
    waveform_scaled = self.scale(waveform)  # (1, 16000)

    bark_transform = transforms.BarkSpectrogram()
    # check defaults
    spectrogram_torch = s2db(bark_transform(waveform_scaled))  # (1, 128, 321)
    self.assertTrue(spectrogram_torch.dim() == 3)
    self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
    self.assertEqual(spectrogram_torch.size(1), bark_transform.n_barks)

    # check correctness of filterbank conversion matrix
    self.assertTrue(bark_transform.bark_scale.fb.sum(1).le(1.0).all())
    self.assertTrue(bark_transform.bark_scale.fb.sum(1).ge(0.0).all())

    # check options
    kwargs = {
        "window_fn": torch.hamming_window,
        "pad": 10,
        "win_length": 500,
        "hop_length": 125,
        "n_fft": 800,
        "n_barks": 50,
    }
    bark_transform2 = transforms.BarkSpectrogram(**kwargs)
    spectrogram2_torch = s2db(bark_transform2(waveform_scaled))  # (1, 50, 513)
    self.assertTrue(spectrogram2_torch.dim() == 3)
    # assert on the spectrogram under test (the original re-checked spectrogram_torch)
    self.assertTrue(spectrogram2_torch.ge(spectrogram2_torch.max() - top_db).all())
    self.assertEqual(spectrogram2_torch.size(1), bark_transform2.n_barks)
    self.assertTrue(bark_transform2.bark_scale.fb.sum(1).le(1.0).all())
    self.assertTrue(bark_transform2.bark_scale.fb.sum(1).ge(0.0).all())

    # check on multi-channel audio
    filepath = common_utils.get_asset_path("steam-train-whistle-daniel_simon.wav")
    x_stereo = common_utils.load_wav(filepath)[0]  # (2, 278756), 44100
    spectrogram_stereo = s2db(bark_transform(x_stereo))  # (2, 128, 1394)
    self.assertTrue(spectrogram_stereo.dim() == 3)
    self.assertTrue(spectrogram_stereo.size(0) == 2)
    # assert on the stereo spectrogram (the original re-checked spectrogram_torch)
    self.assertTrue(spectrogram_stereo.ge(spectrogram_stereo.max() - top_db).all())
    self.assertEqual(spectrogram_stereo.size(1), bark_transform.n_barks)

    # check filterbank matrix creation
    fb_matrix_transform = transforms.BarkScale(n_barks=100, sample_rate=16000, f_min=0.0, f_max=None, n_stft=400)
    self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.0).all())
    self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.0).all())
    self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_mfcc_defaults(self):
"""Check the default configuration of the MFCC transform."""
sample_rate = 16000
......@@ -296,3 +374,9 @@ class SmokeTest(common_utils.TorchaudioTestCase):
specgram = melspecgram.spectrogram
self.assertEqual(specgram.center, True)
self.assertEqual(specgram.pad_mode, "reflect")
def test_barkspectrogram(self):
    """Spectrogram kwargs must be forwarded through BarkSpectrogram."""
    inner = transforms.BarkSpectrogram(center=True, pad_mode="reflect").spectrogram
    self.assertEqual(inner.center, True)
    self.assertEqual(inner.pad_mode, "reflect")
......@@ -52,6 +52,49 @@ class TransformsTestBase(TestBaseMixin):
assert _get_ratio(relative_diff < 1e-3) > 5e-3
assert _get_ratio(relative_diff < 1e-5) > 1e-5
def test_InverseBarkScale(self):
    """Gauge the quality of InverseBarkScale transform.

    As InverseBarkScale is currently implemented with
    random initialization + iterative optimization,
    it is not practically possible to assert the difference between
    the estimated spectrogram and the original spectrogram as a whole.
    The estimated spectrogram has very large discrepancies locally.
    Thus in this test we gauge what percentage of elements are below
    certain tolerances.

    At the moment, the quality of the estimated spectrogram is worse than
    the one obtained from InverseMelScale.

    When the implementation is changed in a way that makes the quality even
    worse, this test will fail.
    """
    n_fft = 400
    power = 1
    n_barks = 64
    sample_rate = 8000

    n_stft = n_fft // 2 + 1

    # Generate reference spectrogram and input bark-scaled spectrogram
    expected = get_spectrogram(
        get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2), n_fft=n_fft, power=power
    ).to(self.device, self.dtype)
    input = T.BarkScale(n_barks=n_barks, sample_rate=sample_rate, n_stft=n_stft).to(self.device, self.dtype)(
        expected
    )

    # Run transform
    transform = T.InverseBarkScale(n_stft, n_barks=n_barks, sample_rate=sample_rate).to(self.device, self.dtype)
    result = transform(input)

    # Compare
    epsilon = 1e-60
    relative_diff = torch.abs((result - expected) / (expected + epsilon))

    for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
        print(f"Ratio of relative diff smaller than {tol:e} is " f"{_get_ratio(relative_diff < tol)}")

    assert _get_ratio(relative_diff < 1e-1) > 0.2
    assert _get_ratio(relative_diff < 1e-3) > 2e-3
@nested_params(
["sinc_interpolation", "kaiser_window"],
[16000, 44100],
......
......@@ -26,6 +26,7 @@ from .functional import (
amplitude_to_DB,
apply_beamforming,
apply_codec,
barkscale_fbanks,
compute_deltas,
compute_kaldi_pitch,
create_dct,
......@@ -61,6 +62,7 @@ __all__ = [
"compute_kaldi_pitch",
"create_dct",
"melscale_fbanks",
"barkscale_fbanks",
"linear_fbanks",
"DB_to_amplitude",
"loudness",
......
......@@ -22,6 +22,7 @@ __all__ = [
"compute_deltas",
"compute_kaldi_pitch",
"melscale_fbanks",
"barkscale_fbanks",
"linear_fbanks",
"create_dct",
"compute_deltas",
......@@ -479,6 +480,121 @@ def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor:
return freqs
def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
r"""Convert Hz to Barks.
Args:
freqs (float): Frequencies in Hz
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
barks (float): Frequency in Barks
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
if bark_scale == "wang":
return 6.0 * math.asinh(freqs / 600.0)
elif bark_scale == "schroeder":
return 7.0 * math.asinh(freqs / 650.0)
# Traunmuller Bark scale
barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
# Bark value correction
if barks < 2:
barks += 0.15 * (2 - barks)
elif barks > 20.1:
barks += 0.22 * (barks - 20.1)
return barks
def _bark_to_hz(barks: Tensor, bark_scale: str = "traunmuller") -> Tensor:
"""Convert bark bin numbers to frequencies.
Args:
barks (Tensor): Bark frequencies
bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
freqs (Tensor): Barks converted in Hz
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
if bark_scale == "wang":
return 600.0 * torch.sinh(barks / 6.0)
elif bark_scale == "schroeder":
return 650.0 * torch.sinh(barks / 7.0)
# Bark value correction
if any(barks < 2):
idx = barks < 2
barks[idx] = (barks[idx] - 0.3) / 0.85
elif any(barks > 20.1):
idx = barks > 20.1
barks[idx] = (barks[idx] + 4.422) / 1.22
# Traunmuller Bark scale
freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
return freqs
def barkscale_fbanks(
    n_freqs: int,
    f_min: float,
    f_max: float,
    n_barks: int,
    sample_rate: int,
    bark_scale: str = "traunmuller",
) -> Tensor:
    r"""Create a frequency bin conversion matrix.

    .. devices:: CPU

    .. properties:: TorchScript

    Args:
        n_freqs (int): Number of frequencies to highlight/apply
        f_min (float): Minimum frequency (Hz)
        f_max (float): Maximum frequency (Hz)
        n_barks (int): Number of bark filterbanks
        sample_rate (int): Sample rate of the audio waveform
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``.
            (Default: ``traunmuller``)

    Returns:
        Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``)
        meaning number of frequencies to highlight/apply to x the number of filterbanks.
        Each column is a filterbank so that assuming there is a matrix A of
        size (..., ``n_freqs``), the applied result would be
        ``A * barkscale_fbanks(A.size(-1), ...)``.
    """

    # freq bins
    all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)

    # calculate bark freq bins (n_barks + 2 points: band edges plus centers)
    m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
    m_max = _hz_to_bark(f_max, bark_scale=bark_scale)

    m_pts = torch.linspace(m_min, m_max, n_barks + 2)
    f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)

    # create filterbank
    fb = _create_triangular_filterbank(all_freqs, f_pts)

    # A filter with all-zero response cannot contribute to the output;
    # warn rather than fail so callers can adjust their parameters.
    if (fb.max(dim=0).values == 0.0).any():
        warnings.warn(
            "At least one bark filterbank has all zero values. "
            f"The value for `n_barks` ({n_barks}) may be set too high. "
            f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
        )

    return fb
def _create_triangular_filterbank(
all_freqs: Tensor,
f_pts: Tensor,
......
from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
from ._transforms import (
AmplitudeToDB,
BarkScale,
BarkSpectrogram,
ComputeDeltas,
Fade,
FrequencyMasking,
GriffinLim,
InverseBarkScale,
InverseMelScale,
InverseSpectrogram,
LFCC,
......@@ -34,13 +37,16 @@ __all__ = [
"FrequencyMasking",
"GriffinLim",
"InverseMelScale",
"InverseBarkScale",
"InverseSpectrogram",
"LFCC",
"Loudness",
"MFCC",
"MVDR",
"MelScale",
"BarkScale",
"MelSpectrogram",
"BarkSpectrogram",
"MuLawDecoding",
"MuLawEncoding",
"PSD",
......
......@@ -649,6 +649,298 @@ class MelSpectrogram(torch.nn.Module):
return mel_specgram
class BarkScale(torch.nn.Module):
    r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
        >>> spectrogram = spectrogram_transform(waveform)
        >>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
        >>> barkscale_spectrogram = barkscale_transform(spectrogram)

    See also:
        :py:func:`torchaudio.functional.barkscale_fbanks` - The function used to
        generate the filter banks.
    """
    __constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]

    def __init__(
        self,
        n_barks: int = 128,
        sample_rate: int = 16000,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        n_stft: int = 201,
        bark_scale: str = "traunmuller",
    ) -> None:
        super(BarkScale, self).__init__()
        self.n_barks = n_barks
        self.sample_rate = sample_rate
        # f_max defaults to the Nyquist frequency of the given sample rate.
        self.f_max = f_max if f_max is not None else float(sample_rate // 2)
        self.f_min = f_min
        self.bark_scale = bark_scale

        if f_min > self.f_max:
            raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))

        fb = F.barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale)
        # Buffer (not Parameter): moves with .to()/.cuda() and is saved in state_dict,
        # but is not trained.
        self.register_buffer("fb", fb)

    def forward(self, specgram: Tensor) -> Tensor:
        r"""
        Args:
            specgram (Tensor): A spectrogram STFT of dimension (..., freq, time).

        Returns:
            Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
        """
        # (..., time, freq) dot (freq, n_barks) -> (..., n_barks, time)
        bark_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)

        return bark_specgram
class InverseBarkScale(torch.nn.Module):
    r"""Estimate a STFT in normal frequency domain from bark frequency domain.

    .. devices:: CPU CUDA

    It minimizes the Euclidean norm between the input bark-spectrogram and the product between
    the estimated spectrogram and the filter banks using SGD.

    Args:
        n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
        tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
        tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
        sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> bark_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
        >>> bark_spectrogram = bark_spectrogram_transform(waveform)
        >>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
        >>> spectrogram = inverse_barkscale_transform(bark_spectrogram)
    """
    __constants__ = [
        "n_stft",
        "n_barks",
        "sample_rate",
        "f_min",
        "f_max",
        "max_iter",
        "tolerance_loss",
        "tolerance_change",
        "sgdargs",
    ]

    def __init__(
        self,
        n_stft: int,
        n_barks: int = 128,
        sample_rate: int = 16000,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        max_iter: int = 100000,
        tolerance_loss: float = 1e-5,
        tolerance_change: float = 1e-8,
        sgdargs: Optional[dict] = None,
        bark_scale: str = "traunmuller",
    ) -> None:
        super(InverseBarkScale, self).__init__()
        self.n_barks = n_barks
        self.sample_rate = sample_rate
        # f_max defaults to the Nyquist frequency.
        self.f_max = f_max or float(sample_rate // 2)
        self.f_min = f_min
        self.max_iter = max_iter
        self.tolerance_loss = tolerance_loss
        self.tolerance_change = tolerance_change
        self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}

        if f_min > self.f_max:
            raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))

        fb = F.barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
        self.register_buffer("fb", fb)

    def forward(self, barkspec: Tensor) -> Tensor:
        r"""
        Args:
            barkspec (Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)

        Returns:
            Tensor: Linear scale spectrogram of size (..., freq, time)
        """
        # pack batch: collapse leading dims into one batch dimension
        shape = barkspec.size()
        barkspec = barkspec.view(-1, shape[-2], shape[-1])

        n_barks, time = shape[-2], shape[-1]
        freq, _ = self.fb.size()  # (freq, n_barks)
        barkspec = barkspec.transpose(-1, -2)
        if self.n_barks != n_barks:
            raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))

        # Random non-negative initialization of the spectrogram to be estimated.
        specgram = torch.rand(
            barkspec.size()[0], time, freq, requires_grad=True, dtype=barkspec.dtype, device=barkspec.device
        )

        optim = torch.optim.SGD([specgram], **self.sgdargs)

        loss = float("inf")
        for _ in range(self.max_iter):
            optim.zero_grad()
            diff = barkspec - specgram.matmul(self.fb)
            new_loss = diff.pow(2).sum(axis=-1).mean()
            # take sum over bark-frequency then average over other dimensions
            # so that loss threshold is applied per unit timeframe
            new_loss.backward()
            optim.step()
            # Project back onto the feasible set (spectrograms are non-negative).
            specgram.data = specgram.data.clamp(min=0)

            new_loss = new_loss.item()
            # Stop when the loss is small enough or has stopped improving.
            if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
                break
            loss = new_loss

        specgram.requires_grad_(False)
        specgram = specgram.clamp(min=0).transpose(-1, -2)

        # unpack batch: restore the original leading dimensions
        specgram = specgram.view(shape[:-2] + (freq, time))
        return specgram
class BarkSpectrogram(torch.nn.Module):
    r"""Create BarkSpectrogram for a raw audio signal.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
    and :py:func:`torchaudio.transforms.BarkScale`.

    Sources
        * https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
        * Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical
          Society of America. Vol. 88, Issue 1, 1990, pp. 97–100.
        * https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html

    Args:
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``None``)
        pad (int, optional): Two sided padding of signal. (Default: ``0``)
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        window_fn (Callable[..., Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        power (float, optional): Exponent for the magnitude spectrogram,
            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
        normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
        wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
        center (bool, optional): whether to pad :attr:`waveform` on both sides so
            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
            (Default: ``True``)
        pad_mode (string, optional): controls the padding method used when
            :attr:`center` is ``True``. (Default: ``"reflect"``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.BarkSpectrogram(sample_rate)
        >>> bark_specgram = transform(waveform)  # (channel, n_barks, time)

    See also:
        :py:func:`torchaudio.functional.barkscale_fbanks` - The function used to
        generate the filter banks.
    """
    __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]

    def __init__(
        self,
        sample_rate: int = 16000,
        n_fft: int = 400,
        win_length: Optional[int] = None,
        hop_length: Optional[int] = None,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        pad: int = 0,
        n_barks: int = 128,
        window_fn: Callable[..., Tensor] = torch.hann_window,
        power: float = 2.0,
        normalized: bool = False,
        wkwargs: Optional[dict] = None,
        center: bool = True,
        pad_mode: str = "reflect",
        bark_scale: str = "traunmuller",
    ) -> None:
        super(BarkSpectrogram, self).__init__()
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length if win_length is not None else n_fft
        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
        self.pad = pad
        self.power = power
        self.normalized = normalized
        self.n_barks = n_barks  # number of bark frequency bins
        self.f_max = f_max
        self.f_min = f_min
        self.spectrogram = Spectrogram(
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length,
            pad=self.pad,
            window_fn=window_fn,
            power=self.power,
            normalized=self.normalized,
            wkwargs=wkwargs,
            center=center,
            pad_mode=pad_mode,
            onesided=True,
        )
        # BarkScale consumes the onesided STFT, hence n_fft // 2 + 1 bins.
        self.bark_scale = BarkScale(
            self.n_barks, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, bark_scale
        )

    def forward(self, waveform: Tensor) -> Tensor:
        r"""
        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
        """
        specgram = self.spectrogram(waveform)
        bark_specgram = self.bark_scale(specgram)
        return bark_specgram
class MFCC(torch.nn.Module):
r"""Create the Mel-frequency cepstrum coefficients from an audio signal.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment