"macapp/forge.config.ts" did not exist on "ad4ffdf754e4e9bc73d3a92c2192a02704d9cbf3"
Unverified Commit dce253f6 authored by Zhiyong Wang, committed by GitHub
Browse files

Add implementation of `spectrogram_batch` (#27159)



* Add initial implementation of `spectrogram_batch`

* Format the initial implementation

* Add test suite for the `spectrogram_batch`

* Update `spectrogram_batch` to ensure compatibility with test suite

* Update `spectrogram_batch` to include pre and post-processing

* Add `amplitude_to_db_batch` function and associated tests

* Add `power_to_db_batch` function and associated tests

* Reimplement the test suite for `spectrogram_batch`

* Fix errors in `spectrogram_batch`

* Add the function annotation for `spectrogram_batch`

* Address code quality

* Re-add `test_chroma_equivalence` function

* Update src/transformers/audio_utils.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/audio_utils.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 3c2d4d60
...@@ -18,7 +18,7 @@ and remove unnecessary dependencies. ...@@ -18,7 +18,7 @@ and remove unnecessary dependencies.
""" """
import warnings import warnings
from typing import Optional, Tuple, Union from typing import List, Optional, Tuple, Union
import numpy as np import numpy as np
...@@ -581,6 +581,213 @@ def spectrogram( ...@@ -581,6 +581,213 @@ def spectrogram(
return spectrogram return spectrogram
def spectrogram_batch(
    waveform_list: List[np.ndarray],
    window: np.ndarray,
    frame_length: int,
    hop_length: int,
    fft_length: Optional[int] = None,
    power: Optional[float] = 1.0,
    center: bool = True,
    pad_mode: str = "reflect",
    onesided: bool = True,
    preemphasis: Optional[float] = None,
    mel_filters: Optional[np.ndarray] = None,
    mel_floor: float = 1e-10,
    log_mel: Optional[str] = None,
    reference: float = 1.0,
    min_value: float = 1e-10,
    db_range: Optional[float] = None,
    remove_dc_offset: Optional[bool] = None,
    dtype: np.dtype = np.float32,
) -> List[np.ndarray]:
    """
    Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch
    processing. This function extends the capabilities of the `spectrogram` function to handle multiple waveforms
    efficiently by leveraging broadcasting.

    It supports generating various types of spectrograms:

        - amplitude spectrogram (`power = 1.0`)
        - power spectrogram (`power = 2.0`)
        - complex-valued spectrogram (`power = None`)
        - log spectrogram (use `log_mel` argument)
        - mel spectrogram (provide `mel_filters`)
        - log-mel spectrogram (provide `mel_filters` and `log_mel`)

    How this works:

        1. Each input waveform is split into frames of size `frame_length` that are partially overlapping by
           `frame_length - hop_length` samples.
        2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
        3. The DFT is taken of each windowed frame.
        4. The results are stacked into a spectrogram.

    To process the whole batch at once, the waveforms are zero-padded on the right to the length of the longest one.
    Frames that would start reading into that batch padding are dropped again before returning, so every returned
    spectrogram only contains frames that lie entirely inside its own (optionally center-padded) waveform.

    We make a distinction between the following "blocks" of sample data, each of which may have a different length:

        - The analysis frame. This is the size of the time slices that the input waveform is split into.
        - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
        - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.

    In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
    padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis
    frame, typically the next power of two.

    Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility
    with individual waveform processing methods like `librosa.stft`.

    Args:
        waveform_list (`List[np.ndarray]` with arrays of shape `(length,)`):
            The list of input waveforms, each a single-channel (mono) signal. An empty list yields an empty list.
        window (`np.ndarray` of shape `(frame_length,)`):
            The windowing function to apply, including zero-padding if necessary.
        frame_length (`int`):
            The length of each frame for analysis.
        hop_length (`int`):
            The step size between successive frames.
        fft_length (`int`, *optional*):
            The size of the FFT buffer, defining frequency bin resolution. Defaults to `frame_length`.
        power (`float`, *optional*, defaults to 1.0):
            Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex.
        center (`bool`, *optional*, defaults to `True`):
            Whether to pad each waveform by `frame_length // 2` samples on both sides before framing.
        pad_mode (`str`, *optional*, defaults to `"reflect"`):
            The padding strategy (a `np.pad` mode) used when `center` is `True`.
        onesided (`bool`, *optional*, defaults to `True`):
            If True, returns a one-sided spectrogram with `fft_length // 2 + 1` frequency bins; otherwise all
            `fft_length` bins are returned.
        preemphasis (`float`, *optional*):
            Coefficient of the pre-emphasis filter applied to each frame before windowing.
        mel_filters (`np.ndarray`, *optional*):
            Mel filter bank of shape `(num_frequency_bins, num_mel_filters)` for converting to a mel spectrogram.
        mel_floor (`float`, *optional*, defaults to 1e-10):
            Floor value for the mel spectrogram to avoid log(0).
        log_mel (`str`, *optional*):
            Specifies log scaling strategy; options are None, "log", "log10", "dB". Ignored when `power` is None.
        reference (`float`, *optional*, defaults to 1.0):
            Reference value for dB conversion in log_mel.
        min_value (`float`, *optional*, defaults to 1e-10):
            Minimum floor value for log scale conversions.
        db_range (`float`, *optional*):
            Dynamic range for dB scale spectrograms.
        remove_dc_offset (`bool`, *optional*):
            Whether to subtract the mean of each frame before pre-emphasis and windowing.
        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
            Data type of the output spectrogram.

    Returns:
        `List[np.ndarray]`: A list of spectrogram arrays, one for each input waveform, each of shape
        `(num_frequency_bins, num_frames)` (or `(num_mel_filters, num_frames)` when `mel_filters` is given). A
        waveform that is shorter than `frame_length` after optional center padding yields zero frames.

    Raises:
        ValueError: If the frame/window/FFT lengths are inconsistent, `hop_length` is not positive, a waveform is
            not one-dimensional or is complex-valued, or `log_mel` is unknown or incompatible with `power`.
    """
    window_length = len(window)

    if fft_length is None:
        fft_length = frame_length

    if frame_length > fft_length:
        raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")

    if window_length != frame_length:
        raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")

    if hop_length <= 0:
        raise ValueError("hop_length must be greater than zero")

    # Check the dimensions of the waveforms
    for waveform in waveform_list:
        if waveform.ndim != 1:
            raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")

    # Check if any waveform is complex
    for waveform in waveform_list:
        if np.iscomplexobj(waveform):
            raise ValueError("Complex-valued input waveforms are not currently supported")

    # Nothing to transform: return early instead of failing on `max()` of an empty sequence below
    if len(waveform_list) == 0:
        return []

    # Center pad each waveform
    if center:
        half_frame = int(frame_length // 2)
        waveform_list = [np.pad(waveform, [(half_frame, half_frame)], mode=pad_mode) for waveform in waveform_list]

    # These lengths (after center padding, before batch padding) are used to remove the batch padding later
    original_waveform_lengths = [len(waveform) for waveform in waveform_list]

    # Batch pad the waveforms with trailing zeros so they can be stacked into one 2D array
    max_length = max(original_waveform_lengths)
    # NOTE(review): the batch is materialized in `dtype` and only then promoted to float64, so inputs are rounded
    # to `dtype` precision before the FFT. Kept as-is so that numerical results do not change.
    padded_waveform_batch = np.array(
        [
            np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0)
            for waveform in waveform_list
        ],
        dtype=dtype,
    )

    # Promote to float64, since np.fft uses float64 internally
    padded_waveform_batch = padded_waveform_batch.astype(np.float64)
    window = window.astype(np.float64)

    # Number of complete frames in the padded batch. Clamped at zero so that a batch whose longest waveform is
    # shorter than one frame produces empty spectrograms rather than a negative array dimension.
    num_frames = max(0, int(1 + (max_length - frame_length) // hop_length))

    # Number of complete frames in each individual waveform. The clamp matters: a negative count used as a slice
    # bound below would silently select frames made of batch padding instead of selecting nothing.
    true_num_frames = [
        max(0, int(1 + (length - frame_length) // hop_length)) for length in original_waveform_lengths
    ]

    num_batches = padded_waveform_batch.shape[0]
    num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
    spectrogram = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64)

    # rfft is faster than fft
    fft_func = np.fft.rfft if onesided else np.fft.fft

    # Only the first `frame_length` columns are ever written; the remainder stays zero and acts as FFT zero-padding
    buffer = np.zeros((num_batches, fft_length))

    for frame_idx in range(num_frames):
        timestep = frame_idx * hop_length
        buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length]

        if remove_dc_offset:
            buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True)

        if preemphasis is not None:
            buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1]
            buffer[:, 0] *= 1 - preemphasis

        buffer[:, :frame_length] *= window

        spectrogram[:, frame_idx] = fft_func(buffer)

    # Note: ** is much faster than np.power
    if power is not None:
        spectrogram = np.abs(spectrogram, dtype=np.float64) ** power

    # Apply mel filters if provided: (batch, frames, bins) x (bins, mels) -> (batch, frames, mels)
    if mel_filters is not None:
        result = np.tensordot(spectrogram, mel_filters.T, axes=([2], [1]))
        spectrogram = np.maximum(mel_floor, result)

    # Convert to log scale if specified
    if power is not None and log_mel is not None:
        if log_mel == "log":
            spectrogram = np.log(spectrogram)
        elif log_mel == "log10":
            spectrogram = np.log10(spectrogram)
        elif log_mel == "dB":
            if power == 1.0:
                spectrogram = amplitude_to_db_batch(spectrogram, reference, min_value, db_range)
            elif power == 2.0:
                spectrogram = power_to_db_batch(spectrogram, reference, min_value, db_range)
            else:
                raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
        else:
            raise ValueError(f"Unknown log_mel option: {log_mel}")

    spectrogram = np.asarray(spectrogram, dtype)

    # Drop the frames that only exist because of batch padding and put the frequency axis first
    spectrogram_list = [
        item_spectrogram[:item_num_frames, :].T
        for item_spectrogram, item_num_frames in zip(spectrogram, true_num_frames)
    ]

    return spectrogram_list
def power_to_db( def power_to_db(
spectrogram: np.ndarray, spectrogram: np.ndarray,
reference: float = 1.0, reference: float = 1.0,
...@@ -632,6 +839,55 @@ def power_to_db( ...@@ -632,6 +839,55 @@ def power_to_db(
return spectrogram return spectrogram
def power_to_db_batch(
    spectrogram: np.ndarray,
    reference: float = 1.0,
    min_value: float = 1e-10,
    db_range: Optional[float] = None,
) -> np.ndarray:
    """
    Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`,
    using basic logarithm properties for numerical stability.

    This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram.

    Args:
        spectrogram (`np.ndarray`):
            The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape).
            Note that a power spectrogram has the amplitudes squared!
        reference (`float`, *optional*, defaults to 1.0):
            Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
            the loudest part to 0 dB. Must be greater than zero.
        min_value (`float`, *optional*, defaults to `1e-10`):
            The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
            `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
        db_range (`float`, *optional*):
            Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
            peak value and the smallest value will never be more than 80 dB. Must be greater than zero. The peak is
            computed separately for every batch item, over all of its non-batch axes.

    Returns:
        `np.ndarray`: the batch of spectrograms in decibels

    Raises:
        ValueError: If `reference`, `min_value` or `db_range` is not greater than zero, or if `db_range` is given
            for an input that has no batch axis (fewer than two dimensions).
    """
    if reference <= 0.0:
        raise ValueError("reference must be greater than zero")
    if min_value <= 0.0:
        raise ValueError("min_value must be greater than zero")

    # The reference is floored as well, so the 0 dB point can never lie below the clipping threshold
    reference = max(min_value, reference)

    spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
    spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference))

    if db_range is not None:
        if db_range <= 0.0:
            raise ValueError("db_range must be greater than zero")
        if spectrogram.ndim < 2:
            raise ValueError(
                f"db_range requires a batched input with at least two dimensions, shape is {spectrogram.shape}"
            )
        # Apply db_range clipping per batch item: reduce over every axis except the leading batch axis, so the
        # function works for any `spectrogram_shape` and not only for 2D spectrograms
        item_axes = tuple(range(1, spectrogram.ndim))
        max_values = spectrogram.max(axis=item_axes, keepdims=True)
        spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)

    return spectrogram
def amplitude_to_db( def amplitude_to_db(
spectrogram: np.ndarray, spectrogram: np.ndarray,
reference: float = 1.0, reference: float = 1.0,
...@@ -681,6 +937,51 @@ def amplitude_to_db( ...@@ -681,6 +937,51 @@ def amplitude_to_db(
return spectrogram return spectrogram
def amplitude_to_db_batch(
    spectrogram: np.ndarray, reference: float = 1.0, min_value: float = 1e-5, db_range: Optional[float] = None
) -> np.ndarray:
    """
    Converts a batch of amplitude spectrograms to the decibel scale. This computes `20 * log10(spectrogram / reference)`,
    using basic logarithm properties for numerical stability.

    The function supports batch processing, where each item in the batch is an individual amplitude (mel) spectrogram.

    Args:
        spectrogram (`np.ndarray`):
            The input batch of amplitude (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape).
        reference (`float`, *optional*, defaults to 1.0):
            Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
            the loudest part to 0 dB. Must be greater than zero.
        min_value (`float`, *optional*, defaults to `1e-5`):
            The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
            `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
        db_range (`float`, *optional*):
            Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
            peak value and the smallest value will never be more than 80 dB. Must be greater than zero. The peak is
            computed separately for every batch item, over all of its non-batch axes.

    Returns:
        `np.ndarray`: the batch of spectrograms in decibels

    Raises:
        ValueError: If `reference`, `min_value` or `db_range` is not greater than zero, or if `db_range` is given
            for an input that has no batch axis (fewer than two dimensions).
    """
    if reference <= 0.0:
        raise ValueError("reference must be greater than zero")
    if min_value <= 0.0:
        raise ValueError("min_value must be greater than zero")

    # The reference is floored as well, so the 0 dB point can never lie below the clipping threshold
    reference = max(min_value, reference)

    spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
    spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference))

    if db_range is not None:
        if db_range <= 0.0:
            raise ValueError("db_range must be greater than zero")
        if spectrogram.ndim < 2:
            raise ValueError(
                f"db_range requires a batched input with at least two dimensions, shape is {spectrogram.shape}"
            )
        # Apply db_range clipping per batch item: reduce over every axis except the leading batch axis, so the
        # function works for any `spectrogram_shape` and not only for 2D spectrograms
        item_axes = tuple(range(1, spectrogram.ndim))
        max_values = spectrogram.max(axis=item_axes, keepdims=True)
        spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)

    return spectrogram
### deprecated functions below this line ### ### deprecated functions below this line ###
......
...@@ -20,12 +20,15 @@ import pytest ...@@ -20,12 +20,15 @@ import pytest
from transformers.audio_utils import ( from transformers.audio_utils import (
amplitude_to_db, amplitude_to_db,
amplitude_to_db_batch,
chroma_filter_bank, chroma_filter_bank,
hertz_to_mel, hertz_to_mel,
mel_filter_bank, mel_filter_bank,
mel_to_hertz, mel_to_hertz,
power_to_db, power_to_db,
power_to_db_batch,
spectrogram, spectrogram,
spectrogram_batch,
window_function, window_function,
) )
from transformers.testing_utils import is_librosa_available, require_librosa from transformers.testing_utils import is_librosa_available, require_librosa
...@@ -284,6 +287,41 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -284,6 +287,41 @@ class AudioUtilsFunctionTester(unittest.TestCase):
expected = np.array([[0.0, 0.0669873, 0.9330127, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]) expected = np.array([[0.0, 0.0669873, 0.9330127, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
self.assertTrue(np.allclose(spec, expected)) self.assertTrue(np.allclose(spec, expected))
def test_spectrogram_batch_impulse(self):
    """
    Runs `spectrogram_batch` on three all-zero waveforms of different lengths that each contain one non-zero
    sample, and checks the per-item output shapes and amplitude values.

    Every expected array has a single row, which `np.allclose` broadcasts against all 9 frequency bins of the
    corresponding spectrogram.
    """
    # (waveform length, index of the non-zero sample, value of that sample)
    impulse_specs = [(40, 9, 1.0), (28, 12, 3.0), (51, 26, 4.5)]

    waveform_list = []
    for length, position, amplitude in impulse_specs:
        waveform = np.zeros(length)
        waveform[position] = amplitude
        waveform_list.append(waveform)

    spec_list = spectrogram_batch(
        waveform_list,
        window_function(12, "hann", frame_length=16),
        frame_length=16,
        hop_length=4,
        power=1.0,
        center=True,
        pad_mode="reflect",
        onesided=True,
    )

    # Same number of frequency bins for every item, but a different number of frames per waveform length
    expected_shapes = [(9, 11), (9, 8), (9, 13)]
    for index, shape in enumerate(expected_shapes):
        self.assertEqual(spec_list[index].shape, shape)

    expected_rows = [
        np.array([[0.0, 0.0669873, 0.9330127, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]),
        np.array([[0.0, 0.0, 0.75, 3.0, 0.75, 0.0, 0.0, 0.0]]),
        np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.375, 3.375, 0.0, 0.0, 0.0, 0.0, 0.0]]),
    ]
    for index, expected in enumerate(expected_rows):
        self.assertTrue(np.allclose(spec_list[index], expected))
def test_spectrogram_integration_test(self): def test_spectrogram_integration_test(self):
waveform = self._load_datasamples(1)[0] waveform = self._load_datasamples(1)[0]
...@@ -384,6 +422,179 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -384,6 +422,179 @@ class AudioUtilsFunctionTester(unittest.TestCase):
# fmt: on # fmt: on
self.assertTrue(np.allclose(spec[:64, 400], expected, atol=1e-5)) self.assertTrue(np.allclose(spec[:64, 400], expected, atol=1e-5))
def test_spectrogram_batch_integration_test(self):
    """
    Compares `spectrogram_batch` against stored reference values on three loaded data samples.

    Three configurations are exercised:
      1. a 400-sample hann window zero-padded to a 512-sample frame,
      2. a 400-sample hann window and frame with a 512-point FFT (checked against the same reference values as 1.),
      3. a log-mel power spectrogram with a povey window, pre-emphasis, DC-offset removal and kaldi-scale mel
         filters.

    In every case the first 64 rows of frame 400 of each spectrogram are compared.
    """
    waveform_list = self._load_datasamples(3)

    def check_shapes(specs, expected_shapes):
        # One shape assertion per batch item, in batch order
        for index, shape in enumerate(expected_shapes):
            self.assertEqual(specs[index].shape, shape)

    def check_frame(specs, frame_index, expected_columns, **tolerances):
        # Compares the first 64 rows of one frame of every batch item with its reference values
        for index, expected in enumerate(expected_columns):
            self.assertTrue(np.allclose(specs[index][:64, frame_index], expected, **tolerances))

    stft_shapes = [(257, 732), (257, 602), (257, 1561)]

    # Configuration 1: window zero-padded to the frame length
    spec_list = spectrogram_batch(
        waveform_list,
        window_function(400, "hann", frame_length=512),
        frame_length=512,
        hop_length=128,
        power=1.0,
        center=True,
        pad_mode="reflect",
        onesided=True,
    )
    check_shapes(spec_list, stft_shapes)

    # fmt: off
    expected1 = np.array([
        0.02464888, 0.04648664, 0.05872392, 0.02311783, 0.0327175 ,
        0.02433643, 0.01198814, 0.02055709, 0.01559287, 0.01394357,
        0.01299037, 0.01728045, 0.0254554 , 0.02486533, 0.02011792,
        0.01755333, 0.02100457, 0.02337024, 0.01436963, 0.01464558,
        0.0211017 , 0.0193489 , 0.01272165, 0.01858462, 0.03722598,
        0.0456542 , 0.03281558, 0.00620586, 0.02226466, 0.03618042,
        0.03508182, 0.02271432, 0.01051649, 0.01225771, 0.02315293,
        0.02331886, 0.01417785, 0.0106844 , 0.01791214, 0.017177  ,
        0.02125114, 0.05028201, 0.06830665, 0.05216664, 0.01963666,
        0.06941418, 0.11513043, 0.12257859, 0.10948435, 0.08568069,
        0.05509328, 0.05047818, 0.047112  , 0.05060737, 0.02982424,
        0.02803827, 0.02933729, 0.01760491, 0.00587815, 0.02117637,
        0.0293578 , 0.03452379, 0.02194803, 0.01676056,
    ])
    expected2 = np.array([
        7.61983171e-02, 1.45338190e-01, 2.63903728e+00, 7.74429535e+00,
        9.61932980e+00, 5.40767686e+00, 1.08924884e+00, 3.40908262e+00,
        3.59484250e+00, 1.68451077e+00, 5.88405873e-01, 1.17042530e+00,
        9.94803324e-01, 3.53757065e-01, 5.47699239e-01, 9.48368581e-01,
        7.17770457e-01, 2.09396633e-01, 1.77574463e-01, 2.35644731e-01,
        1.31535991e-01, 1.53539552e-02, 4.34416305e-02, 5.32897267e-02,
        4.03567305e-02, 1.41842226e-02, 2.90514538e-02, 3.36549485e-02,
        1.53516624e-02, 2.37464225e-02, 4.60092464e-02, 4.05769324e-02,
        4.82633401e-03, 4.12675364e-02, 7.13859796e-02, 6.16866566e-02,
        2.55657822e-02, 1.68923281e-02, 1.91299946e-02, 1.60033798e-02,
        1.33405095e-02, 1.52065457e-02, 1.21833352e-02, 2.25786382e-03,
        6.15358376e-03, 1.07647616e-02, 1.23051018e-02, 6.75289378e-03,
        2.71127435e-03, 1.06515263e-02, 1.18463583e-02, 7.14347935e-03,
        1.87912782e-03, 4.44236027e-03, 5.19630243e-03, 2.46666998e-03,
        1.01598645e-03, 1.21589237e-03, 1.29095500e-03, 1.07447628e-03,
        1.40218156e-03, 3.65402623e-03, 4.00592755e-03, 4.20001841e-03
    ])
    expected3 = np.array([
        0.07805249, 0.34305022, 0.55617084, 1.22475182, 1.17040678,
        0.51540532, 0.23570016, 0.06630775, 0.09017777, 0.07693192,
        0.0333643 , 0.04873054, 0.04668559, 0.02384041, 0.02780435,
        0.0289717 , 0.01704903, 0.0201644 , 0.01700376, 0.02176975,
        0.02042491, 0.00732129, 0.00326042, 0.00245065, 0.00510645,
        0.00681892, 0.00739329, 0.00551437, 0.0070674 , 0.00630015,
        0.00379566, 0.0060098 , 0.00311543, 0.00902284, 0.01171038,
        0.01202166, 0.01759194, 0.01652899, 0.01201872, 0.01295351,
        0.00756432, 0.01415318, 0.02349972, 0.02296833, 0.02429341,
        0.02447459, 0.01835044, 0.01437871, 0.02262246, 0.02972324,
        0.03392252, 0.03037546, 0.01116927, 0.01555062, 0.02833379,
        0.02294212, 0.02069847, 0.02496927, 0.02273526, 0.01341643,
        0.00805407, 0.00624943, 0.01076262, 0.01876003
    ])
    # fmt: on
    stft_reference = [expected1, expected2, expected3]
    check_frame(spec_list, 400, stft_reference)

    # Configuration 2: unpadded window, FFT buffer larger than the frame; same reference values as above
    spec_list = spectrogram_batch(
        waveform_list,
        window_function(400, "hann"),
        frame_length=400,
        hop_length=128,
        fft_length=512,
        power=1.0,
        center=True,
        pad_mode="reflect",
        onesided=True,
    )
    check_shapes(spec_list, stft_shapes)
    check_frame(spec_list, 400, stft_reference)

    # Configuration 3: log-mel power spectrogram
    mel_filters = mel_filter_bank(
        num_frequency_bins=256,
        num_mel_filters=400,
        min_frequency=20,
        max_frequency=8000,
        sampling_rate=16000,
        norm=None,
        mel_scale="kaldi",
        triangularize_in_mel_space=True,
    )
    # Append one all-zero row to the filter bank (256 -> 257 rows)
    mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))

    spec_list = spectrogram_batch(
        waveform_list,
        window_function(400, "povey", periodic=False),
        frame_length=400,
        hop_length=160,
        fft_length=512,
        power=2.0,
        center=False,
        pad_mode="reflect",
        onesided=True,
        preemphasis=0.97,
        mel_filters=mel_filters,
        log_mel="log",
        mel_floor=1.1920928955078125e-07,
        remove_dc_offset=True,
    )
    check_shapes(spec_list, [(400, 584), (400, 480), (400, 1247)])

    # fmt: off
    expected1 = np.array([-15.94238515,  -8.20712299,  -8.22704352, -15.94238515,
        -15.94238515, -15.94238515, -15.94238515, -15.94238515,
        -6.52463769,  -7.73677889, -15.94238515, -15.94238515,
        -15.94238515, -15.94238515,  -4.18650018,  -3.37195286,
        -15.94238515, -15.94238515, -15.94238515, -15.94238515,
        -4.70190154,  -2.4217066 , -15.94238515, -15.94238515,
        -15.94238515, -15.94238515,  -5.62755239,  -3.53385194,
        -15.94238515, -15.94238515, -15.94238515, -15.94238515,
        -9.43303023,  -8.77480925, -15.94238515, -15.94238515,
        -15.94238515, -15.94238515,  -4.2951092 ,  -5.51585994,
        -15.94238515, -15.94238515, -15.94238515,  -4.40151721,
        -3.95228878, -15.94238515, -15.94238515, -15.94238515,
        -6.10365415,  -4.59494697, -15.94238515, -15.94238515,
        -15.94238515,  -8.10727767,  -6.2585298 , -15.94238515,
        -15.94238515, -15.94238515,  -5.60161702,  -4.47217004,
        -15.94238515, -15.94238515, -15.94238515,  -5.91641988]
    )
    expected2 = np.array([-15.942385, -8.531508, -8.551396, -15.942385, -15.942385,
        -15.942385, -15.942385, -15.942385, -5.626043, -6.8381968,
        -15.942385, -15.942385, -15.942385, -15.942385, -3.3122184,
        -2.49764, -15.942385, -15.942385, -15.942385, -15.942385,
        -3.625868, -1.3457257, -15.942385, -15.942385, -15.942385,
        -15.942385, -4.2223063, -2.1285915, -15.942385, -15.942385,
        -15.942385, -15.942385, -8.611152, -7.952894, -15.942385,
        -15.942385, -15.942385, -15.942385, -2.7585578, -3.9793255,
        -15.942385, -15.942385, -15.942385, -2.5377562, -2.0885658,
        -15.942385, -15.942385, -15.942385, -3.8310733, -2.322393,
        -15.942385, -15.942385, -15.942385, -7.674944, -5.8261633,
        -15.942385, -15.942385, -15.942385, -3.5960004, -2.4665844,
        -15.942385, -15.942385, -15.942385, -1.7905309]
    )
    expected3 = np.array([-15.942385, -13.406995, -13.426883, -15.942385, -15.942385,
        -15.942385, -15.942385, -15.942385, -15.942385, -15.942385,
        -15.942385, -15.942385, -15.942385, -15.942385, -13.493383,
        -12.678805, -15.942385, -15.942385, -15.942385, -15.942385,
        -14.809377, -12.529235, -15.942385, -15.942385, -15.942385,
        -15.942385, -13.838827, -11.745112, -15.942385, -15.942385,
        -15.942385, -15.942385, -13.9336405, -13.275384, -15.942385,
        -15.942385, -15.942385, -15.942385, -13.043786, -14.264554,
        -15.942385, -15.942385, -15.942385, -13.060181, -12.610991,
        -15.942385, -15.942385, -15.942385, -14.152064, -12.643384,
        -15.942385, -15.942385, -15.942385, -14.48317, -12.634389,
        -15.942385, -15.942385, -15.942385, -14.627316, -13.4979,
        -15.942385, -15.942385, -15.942385, -12.6279955]
    )
    # fmt: on
    check_frame(spec_list, 400, [expected1, expected2, expected3], atol=1e-5)
def test_spectrogram_center_padding(self): def test_spectrogram_center_padding(self):
waveform = self._load_datasamples(1)[0] waveform = self._load_datasamples(1)[0]
...@@ -473,6 +684,200 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -473,6 +684,200 @@ class AudioUtilsFunctionTester(unittest.TestCase):
# fmt: on # fmt: on
self.assertTrue(np.allclose(spec[:64, 0], expected)) self.assertTrue(np.allclose(spec[:64, 0], expected))
def test_spectrogram_batch_center_padding(self):
    """
    Checks the effect of the `center` / `pad_mode` arguments of `spectrogram_batch` on three loaded data samples.

    The same 512-sample hann window and hop length of 128 are used with, in turn, reflect center padding,
    constant center padding, and no center padding. For each setting the output shapes and the first 64 rows of
    frame 0 of every spectrogram are compared with stored reference values.
    """
    waveform_list = self._load_datasamples(3)

    def check_shapes(specs, expected_shapes):
        # One shape assertion per batch item, in batch order
        for index, shape in enumerate(expected_shapes):
            self.assertEqual(specs[index].shape, shape)

    def check_first_frame(specs, expected_columns):
        # Compares the first 64 rows of frame 0 of every batch item with its reference values
        for index, expected in enumerate(expected_columns):
            self.assertTrue(np.allclose(specs[index][:64, 0], expected))

    centered_shapes = [(257, 732), (257, 602), (257, 1561)]

    # Setting 1: center padding with reflection
    spec_list = spectrogram_batch(
        waveform_list,
        window_function(512, "hann"),
        frame_length=512,
        hop_length=128,
        center=True,
        pad_mode="reflect",
    )
    check_shapes(spec_list, centered_shapes)

    # fmt: off
    expected1 = np.array([
        0.1287945 , 0.12792738, 0.08311573, 0.03155122, 0.02470202,
        0.00727857, 0.00910694, 0.00686163, 0.01238981, 0.01473668,
        0.00336144, 0.00370314, 0.00600871, 0.01120164, 0.01942998,
        0.03132008, 0.0232842 , 0.01124642, 0.02754783, 0.02423725,
        0.00147893, 0.00038027, 0.00112299, 0.00596233, 0.00571529,
        0.02084235, 0.0231855 , 0.00810006, 0.01837943, 0.00651339,
        0.00093931, 0.00067426, 0.01058399, 0.01270507, 0.00151734,
        0.00331913, 0.00302416, 0.01081792, 0.00754549, 0.00148963,
        0.00111943, 0.00152573, 0.00608017, 0.01749986, 0.01205949,
        0.0143082 , 0.01910573, 0.00413786, 0.03916619, 0.09873404,
        0.08302026, 0.02673891, 0.00401255, 0.01397392, 0.00751862,
        0.01024884, 0.01544606, 0.00638907, 0.00623633, 0.0085103 ,
        0.00217659, 0.00276204, 0.00260835, 0.00299299,
    ])
    expected2 = np.array([
        1.89624839e-02, 1.23274978e-02, 3.69160250e-02, 4.76267971e-02,
        1.39258439e-02, 2.98370440e-02, 2.74845166e-03, 3.01934010e-03,
        1.18722776e-02, 9.70834121e-03, 2.06300567e-04, 6.32975250e-04,
        8.20603687e-03, 1.21864351e-02, 3.28791840e-03, 3.36801982e-04,
        2.79373326e-03, 5.00530424e-03, 8.46884679e-03, 1.14089288e-02,
        8.59052036e-03, 2.88538425e-03, 9.95071139e-03, 6.80431770e-03,
        2.95809377e-03, 1.46285209e-04, 3.36268265e-03, 4.80051298e-04,
        2.84506916e-03, 9.34222655e-04, 3.42161348e-03, 2.79612141e-03,
        3.38875921e-03, 2.85030343e-03, 5.39513239e-05, 2.72908504e-03,
        2.09591188e-03, 5.00271388e-04, 8.31917219e-04, 2.37967237e-03,
        1.75001193e-03, 1.31826295e-04, 8.83622793e-04, 1.54303256e-04,
        3.09544569e-03, 4.08527814e-03, 2.73566321e-03, 1.78805250e-03,
        9.53314066e-06, 1.74316950e-03, 1.51099428e-03, 8.65990878e-04,
        8.44859460e-04, 5.35220199e-04, 5.36562002e-04, 8.33181897e-04,
        8.22705682e-04, 1.81083288e-03, 9.75003233e-04, 6.73114730e-04,
        6.81665202e-04, 2.05180887e-03, 1.10151991e-03, 4.75923851e-04,
    ])
    expected3 = np.array([
        0.07079848, 0.04237922, 0.0220724, 0.04446052, 0.03598337,
        0.03327273, 0.02545774, 0.01319528, 0.00919659, 0.01376867,
        0.00361992, 0.00608425, 0.01105873, 0.0105565, 0.00744286,
        0.00244849, 0.00257317, 0.00749989, 0.01061386, 0.01525312,
        0.00656914, 0.01199581, 0.00487319, 0.00830956, 0.0046706,
        0.00588962, 0.00544486, 0.00565179, 0.00050112, 0.01108059,
        0.00217417, 0.00453234, 0.00537306, 0.00269329, 0.00342333,
        0.00095484, 0.00708934, 0.00660373, 0.00543686, 0.00217186,
        0.00431519, 0.00457764, 0.00503529, 0.01166454, 0.01375581,
        0.01467224, 0.00873404, 0.00534086, 0.00476848, 0.0226163,
        0.0314, 0.00151021, 0.01975221, 0.01637519, 0.00046068,
        0.0460544, 0.06285986, 0.03151625, 0.0013598, 0.004804,
        0.0073824, 0.02312599, 0.02613977, 0.01056851
    ])
    # fmt: on
    check_first_frame(spec_list, [expected1, expected2, expected3])

    # Setting 2: center padding with a constant value
    spec_list = spectrogram_batch(
        waveform_list,
        window_function(512, "hann"),
        frame_length=512,
        hop_length=128,
        center=True,
        pad_mode="constant",
    )
    check_shapes(spec_list, centered_shapes)

    # fmt: off
    expected1 = np.array([
        0.06558744, 0.06889656, 0.06263352, 0.04264418, 0.03404115,
        0.03244197, 0.02279134, 0.01646339, 0.01452216, 0.00826055,
        0.00062093, 0.0031821 , 0.00419456, 0.00689327, 0.01106367,
        0.01712119, 0.01721762, 0.00977533, 0.01606626, 0.02275621,
        0.01727687, 0.00992739, 0.01217688, 0.01049927, 0.01022947,
        0.01302475, 0.01166873, 0.01081812, 0.01057327, 0.00767912,
        0.00429567, 0.00089625, 0.00654583, 0.00912084, 0.00700984,
        0.00225026, 0.00290545, 0.00667712, 0.00730663, 0.00410813,
        0.00073102, 0.00219296, 0.00527618, 0.00996585, 0.01123781,
        0.00872816, 0.01165121, 0.02047945, 0.03681747, 0.0514379 ,
        0.05137928, 0.03960042, 0.02821562, 0.01813349, 0.01201322,
        0.01260964, 0.00900654, 0.00207905, 0.00456714, 0.00850599,
        0.00788239, 0.00664407, 0.00824227, 0.00628301,
    ])
    expected2 = np.array([
        0.00955754, 0.01445548, 0.02393902, 0.02903068, 0.02512844,
        0.01508297, 0.00474784, 0.00440362, 0.0073898, 0.00546519,
        0.00126077, 0.00240507, 0.00523254, 0.00632742, 0.00415215,
        0.00056628, 0.00161288, 0.0026956, 0.00431587, 0.00621471,
        0.00791291, 0.0079454, 0.00594525, 0.00334581, 0.00180047,
        0.00144485, 0.00175764, 0.00188037, 0.00134889, 0.00150253,
        0.00178821, 0.00158875, 0.00204339, 0.00266497, 0.00280556,
        0.00221949, 0.00108956, 0.000532, 0.00108454, 0.00129254,
        0.00089315, 0.00022803, 0.00038176, 0.0011302, 0.00189306,
        0.0021964, 0.00203576, 0.00207306, 0.00217727, 0.00174297,
        0.00103331, 0.00076695, 0.0007422, 0.00061986, 0.00081204,
        0.00079615, 0.00089417, 0.00105452, 0.00042615, 0.00066372,
        0.00132765, 0.00122087, 0.00054903, 0.00107945,
    ])
    expected3 = np.array([
        0.03573493, 0.03625983, 0.03341755, 0.02431477, 0.01770546,
        0.0169356 , 0.01579034, 0.01600499, 0.01329064, 0.00747957,
        0.00367372, 0.00403853, 0.00519597, 0.00551022, 0.00532757,
        0.00367569, 0.00130341, 0.00345149, 0.00520744, 0.00872308,
        0.01172503, 0.00948154, 0.00344236, 0.00387997, 0.00425455,
        0.00394357, 0.00711733, 0.00615654, 0.00055756, 0.00656414,
        0.00852001, 0.00666252, 0.00509767, 0.00246784, 0.00376049,
        0.00682879, 0.00641118, 0.00469685, 0.00358701, 0.0015552 ,
        0.00261458, 0.00701979, 0.00929578, 0.00894536, 0.00828491,
        0.00773528, 0.00552091, 0.00259871, 0.00933179, 0.01588626,
        0.01697887, 0.01268552, 0.00957255, 0.01204092, 0.02123362,
        0.03062669, 0.03215763, 0.02629963, 0.01769568, 0.01088869,
        0.01151334, 0.01378197, 0.01319263, 0.01066859,
    ])
    # fmt: on
    check_first_frame(spec_list, [expected1, expected2, expected3])

    # Setting 3: no center padding
    spec_list = spectrogram_batch(
        waveform_list,
        window_function(512, "hann"),
        frame_length=512,
        hop_length=128,
        center=False,
    )
    check_shapes(spec_list, [(257, 728), (257, 598), (257, 1557)])

    # fmt: off
    expected1 = np.array([
        0.00250445, 0.02161521, 0.06232229, 0.04339567, 0.00937727,
        0.01080616, 0.00248685, 0.0095264 , 0.00727476, 0.0079152 ,
        0.00839946, 0.00254932, 0.00716622, 0.005559  , 0.00272623,
        0.00581774, 0.01896395, 0.01829788, 0.01020514, 0.01632692,
        0.00870888, 0.02065827, 0.0136022 , 0.0132382 , 0.011827  ,
        0.00194505, 0.0189979 , 0.026874  , 0.02194014, 0.01923883,
        0.01621437, 0.00661967, 0.00289517, 0.00470257, 0.00957801,
        0.00191455, 0.00431664, 0.00544359, 0.01126213, 0.00785778,
        0.00423469, 0.01322504, 0.02226548, 0.02318576, 0.03428908,
        0.03648811, 0.0202938 , 0.011902  , 0.03226198, 0.06347476,
        0.01306318, 0.05308729, 0.05474771, 0.03127991, 0.00998512,
        0.01449977, 0.01272741, 0.00868176, 0.00850386, 0.00313876,
        0.00811857, 0.00538216, 0.00685749, 0.00535275,
    ])
    expected2 = np.array([
        0.01232908, 0.05980514, 0.08285419, 0.01850723, 0.02823627,
        0.00204369, 0.01372626, 0.00956435, 0.02267217, 0.00947112,
        0.00355174, 0.00418008, 0.00843608, 0.01559252, 0.01125505,
        0.00183573, 0.00765051, 0.0109983 , 0.00890545, 0.00583453,
        0.00115901, 0.00579039, 0.00151353, 0.00395812, 0.00231413,
        0.00384272, 0.00313914, 0.00072331, 0.00338935, 0.00383328,
        0.00218129, 0.00284516, 0.00228538, 0.00083603, 0.00111663,
        0.00235799, 0.00142748, 0.00092908, 0.0012966 , 0.0011403 ,
        0.0010619 , 0.00158732, 0.00289866, 0.00216709, 0.00313325,
        0.00361277, 0.00202507, 0.0009948 , 0.00114428, 0.00200851,
        0.0009234 , 0.00063468, 0.00018746, 0.00100463, 0.00053799,
        0.00080009, 0.00158291, 0.00172077, 0.00173586, 0.00197127,
        0.00107058, 0.00043486, 0.0009859 , 0.00215484,
    ])
    expected3 = np.array([
        0.01864123, 0.06131337, 0.08346292, 0.04936386, 0.02792609,
        0.01005205, 0.00884826, 0.02198604, 0.02421535, 0.00957573,
        0.00503561, 0.00241331, 0.00175652, 0.00195889, 0.00453299,
        0.0020317 , 0.00249264, 0.00517483, 0.01111943, 0.0150079 ,
        0.01977743, 0.01253825, 0.00517561, 0.01031712, 0.00579466,
        0.00783679, 0.0071415 , 0.00591847, 0.01510728, 0.01194921,
        0.00518072, 0.00125978, 0.00577552, 0.01050614, 0.0077644 ,
        0.0042905 , 0.00278469, 0.00166695, 0.00255013, 0.00578153,
        0.00586451, 0.00929514, 0.01501226, 0.00741419, 0.00310625,
        0.00086757, 0.00595618, 0.0053882 , 0.0116266 , 0.02504773,
        0.02889692, 0.03739442, 0.04730207, 0.03856638, 0.05700104,
        0.04299267, 0.02153366, 0.03740607, 0.03811468, 0.01575022,
        0.00676344, 0.01359865, 0.01769319, 0.00907966,
    ])
    # fmt: on
    check_first_frame(spec_list, [expected1, expected2, expected3])
def test_spectrogram_shapes(self): def test_spectrogram_shapes(self):
waveform = self._load_datasamples(1)[0] waveform = self._load_datasamples(1)[0]
...@@ -549,6 +954,94 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -549,6 +954,94 @@ class AudioUtilsFunctionTester(unittest.TestCase):
) )
self.assertEqual(spec.shape, (512, 183)) self.assertEqual(spec.shape, (512, 183))
def test_spectrogram_batch_shapes(self):
    """Check the per-item output shape of `spectrogram_batch` for several STFT configurations."""
    waveforms = self._load_datasamples(3)

    def check_shapes(window, expected_shapes, **stft_kwargs):
        # Every configuration below shares power=1.0 and reflect padding.
        specs = spectrogram_batch(waveforms, window, power=1.0, pad_mode="reflect", **stft_kwargs)
        # Index explicitly so that a result with fewer than three items raises instead of passing.
        for index, shape in enumerate(expected_shapes):
            self.assertEqual(specs[index].shape, shape)

    # 400-sample window, centered frames, one-sided output: 201 rows.
    check_shapes(
        window_function(400, "hann"),
        [(201, 732), (201, 602), (201, 1561)],
        frame_length=400,
        hop_length=128,
        center=True,
        onesided=True,
    )

    # Same setup with center=False: each item is expected to have three fewer frames.
    check_shapes(
        window_function(400, "hann"),
        [(201, 729), (201, 599), (201, 1558)],
        frame_length=400,
        hop_length=128,
        center=False,
        onesided=True,
    )

    # An explicit fft_length=512 is expected to raise the row count to 257.
    check_shapes(
        window_function(400, "hann"),
        [(257, 732), (257, 602), (257, 1561)],
        frame_length=400,
        hop_length=128,
        fft_length=512,
        center=True,
        onesided=True,
    )

    # 400-sample window requested with frame_length=512; onesided=False is expected to give 512 rows.
    check_shapes(
        window_function(400, "hann", frame_length=512),
        [(512, 1464), (512, 1204), (512, 3122)],
        frame_length=512,
        hop_length=64,
        center=True,
        onesided=False,
    )

    # A native 512-sample window is expected to produce the same shapes as the case above.
    check_shapes(
        window_function(512, "hann"),
        [(512, 1464), (512, 1204), (512, 3122)],
        frame_length=512,
        hop_length=64,
        center=True,
        onesided=False,
    )

    # Hop length equal to the frame length.
    check_shapes(
        window_function(512, "hann"),
        [(512, 183), (512, 151), (512, 391)],
        frame_length=512,
        hop_length=512,
        center=True,
        onesided=False,
    )
def test_mel_spectrogram(self): def test_mel_spectrogram(self):
waveform = self._load_datasamples(1)[0] waveform = self._load_datasamples(1)[0]
...@@ -592,6 +1085,67 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -592,6 +1085,67 @@ class AudioUtilsFunctionTester(unittest.TestCase):
# fmt: on # fmt: on
self.assertTrue(np.allclose(spec[:, 300], expected)) self.assertTrue(np.allclose(spec[:, 300], expected))
def test_mel_spectrogram_batch(self):
    """Check `spectrogram_batch` shapes with and without mel filters, plus one reference column per item."""
    waveforms = self._load_datasamples(3)
    frame_counts = (732, 602, 1561)

    filter_bank = mel_filter_bank(
        num_frequency_bins=513,
        num_mel_filters=13,
        min_frequency=100,
        max_frequency=4000,
        sampling_rate=16000,
        norm=None,
        mel_scale="htk",
    )
    self.assertEqual(filter_bank.shape, (513, 13))

    # STFT settings shared by both calls below.
    stft_options = {"frame_length": 1024, "hop_length": 128, "power": 2.0}

    # Without mel filters each item is expected to keep all 513 rows.
    power_specs = spectrogram_batch(
        waveforms,
        window_function(800, "hann", frame_length=1024),
        **stft_options,
    )
    for index, num_frames in enumerate(frame_counts):
        self.assertEqual(power_specs[index].shape, (513, num_frames))

    # With the filter bank the row count is expected to equal the number of mel filters (13).
    mel_specs = spectrogram_batch(
        waveforms,
        window_function(800, "hann", frame_length=1024),
        mel_filters=filter_bank,
        **stft_options,
    )
    for index, num_frames in enumerate(frame_counts):
        self.assertEqual(mel_specs[index].shape, (13, num_frames))

    # Reference values for frame 300 of each item, in batch order.
    # fmt: off
    expected_columns = [
        np.array([
            1.08027889e+02, 1.48080673e+01, 7.70758213e+00, 9.57676639e-01,
            8.81639061e-02, 5.26073833e-02, 1.52736155e-02, 9.95350117e-03,
            7.95364356e-03, 1.01148004e-02, 4.29241020e-03, 9.90708797e-03,
            9.44153646e-04
        ]),
        np.array([
            71.82577165, 109.44693334, 272.4834194, 164.90450355,
            16.54056349, 11.60810547, 24.87525946, 21.07317022,
            1.26736284, 1.4583074, 1.36659061, 1.76305768,
            2.03703503
        ]),
        np.array([
            5.22246749e+02, 6.92660728e+02, 2.65895922e+02, 2.06526565e+01,
            2.28692104e+00, 1.19473622e+00, 8.43228216e-01, 3.20760592e+00,
            1.33654151e+00, 1.51050684e-01, 2.78282477e-01, 9.25020981e-01,
            2.29908841e-01
        ]),
    ]
    # fmt: on

    for index, expected in enumerate(expected_columns):
        self.assertTrue(np.allclose(mel_specs[index][:, 300], expected))
def test_spectrogram_power(self): def test_spectrogram_power(self):
waveform = self._load_datasamples(1)[0] waveform = self._load_datasamples(1)[0]
...@@ -688,6 +1242,219 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -688,6 +1242,219 @@ class AudioUtilsFunctionTester(unittest.TestCase):
# fmt: on # fmt: on
self.assertTrue(np.allclose(spec[64:128, 321], expected)) self.assertTrue(np.allclose(spec[64:128, 321], expected))
def test_spectrogram_batch_power(self):
    """Check dtype, shape and reference values of `spectrogram_batch` for power=None, 1.0 and 2.0.

    power=None is expected to give complex64 output; power=1.0 and power=2.0 are expected to give
    float64 output. Reference values are compared on frame 321 of each batch item.
    """
    waveforms = self._load_datasamples(3)

    def compute(power):
        # Only `power` varies between the three runs.
        return spectrogram_batch(
            waveforms,
            window_function(400, "hann", frame_length=512),
            frame_length=512,
            hop_length=128,
            power=power,
        )

    def check_layout(specs, dtype):
        # Shape first, then dtype, for each batch item in order.
        for index, num_frames in enumerate((732, 602, 1561)):
            self.assertEqual(specs[index].shape, (257, num_frames))
            self.assertEqual(specs[index].dtype, dtype)

    def check_frame_321(specs, rows, expected_columns):
        for index, expected in enumerate(expected_columns):
            self.assertTrue(np.allclose(specs[index][rows, 321], expected))

    # --- power=None: complex output, rows 64..95 compared ---
    complex_specs = compute(None)
    check_layout(complex_specs, np.complex64)

    # fmt: off
    expected_complex = [
        np.array([
             0.01452305+0.01820039j, -0.01737362-0.01641946j,
             0.0121028 +0.01565081j, -0.02794554-0.03021514j,
             0.04719803+0.04086519j, -0.04391563-0.02779365j,
             0.05682834+0.01571325j, -0.08604821-0.02023657j,
             0.07497991+0.0186641j , -0.06366091-0.00922475j,
             0.11003416+0.0114788j , -0.13677941-0.01523552j,
             0.10934535-0.00117226j, -0.11635598+0.02551187j,
             0.14708674-0.03469823j, -0.1328196 +0.06034218j,
             0.12667368-0.13973421j, -0.14764774+0.18912019j,
             0.10235471-0.12181523j, -0.00773012+0.04730498j,
            -0.01487191-0.07312611j, -0.02739162+0.09619419j,
             0.02895459-0.05398273j,  0.01198589+0.05276592j,
            -0.02117299-0.10123465j,  0.00666388+0.09526499j,
            -0.01672773-0.05649684j,  0.02723125+0.05939891j,
            -0.01879361-0.062954j  ,  0.03686557+0.04568823j,
            -0.07394181-0.07949649j,  0.06238583+0.13905765j,
        ]),
        np.array([
            -0.01634146-7.0067253e-03j, -0.00068403+9.2661660e-03j,
             0.00571721-3.9035487e-03j, -0.00915086+1.5033451e-03j,
             0.01138636+5.4256055e-03j, -0.00294282-1.2016168e-02j,
            -0.00428711+7.3687937e-03j, -0.001002  -1.3972387e-03j,
             0.00622582+3.7551194e-03j, -0.00137886-7.0342086e-03j,
            -0.00824075+3.8430823e-03j,  0.0107349 +7.1450039e-03j,
             0.00363763-1.4242286e-02j, -0.01499857+1.7917662e-05j,
            -0.0046242 +1.2500680e-02j,  0.02180984+7.2047939e-03j,
            -0.00273568-1.6844695e-02j, -0.00178986-7.5209686e-03j,
            -0.01661806+1.2662713e-03j, -0.01045276+2.0611197e-02j,
             0.03252975+2.5592113e-02j,  0.03945662-6.7136563e-02j,
            -0.10622615+4.9393820e-03j,  0.06684612+6.4607985e-02j,
            -0.00753762-5.1637031e-02j, -0.00220644+1.8002450e-02j,
            -0.00357443-4.1291970e-03j,  0.01463647-1.4063751e-03j,
            -0.02252573-1.1189026e-02j,  0.00276293+1.9019062e-02j,
             0.01216721+1.2095908e-03j,  0.00034753-7.4386634e-03j
        ]),
        np.array([
             2.3276670e-02+0.0406534j, -2.4413882e-02-0.07868771j,
             1.0993068e-02+0.05550544j, -1.5825305e-02+0.00480187j,
             4.7617555e-02-0.04421869j, -7.1669750e-02+0.06317082j,
             5.9706111e-02-0.08369736j, -2.2317577e-02+0.08915959j,
            -2.3291381e-02-0.06601578j,  5.9362967e-02+0.03185856j,
            -6.5269925e-02+0.0030586j,  5.0898481e-02-0.04319243j,
            -4.0413942e-02+0.08051146j,  3.0059000e-02-0.09730332j,
            -1.2479190e-02+0.09703682j, -6.1806822e-03-0.09617531j,
             2.6907364e-02+0.08084074j, -4.1639723e-02-0.03391053j,
             3.1113219e-02-0.01497662j,  3.4023849e-03+0.03632669j,
            -4.9804080e-02-0.039231j,  8.9777440e-02+0.02577243j,
            -9.2947647e-02+0.01514865j,  6.2368069e-02-0.05954866j,
            -2.9966677e-02+0.06520324j, -8.2365885e-05-0.0440613j ,
             2.0203773e-02+0.04350767j, -8.9924788e-04-0.05406843j,
            -3.5951469e-02+0.03055602j,  3.3790238e-02+0.02182594j,
             1.0919777e-03-0.06437822j, -1.8534327e-02+0.07866792j
        ]),
    ]
    # fmt: on
    check_frame_321(complex_specs, slice(64, 96), expected_complex)

    # --- power=1.0: float64 output, rows 64..127 compared ---
    amplitude_specs = compute(1.0)
    check_layout(amplitude_specs, np.float64)

    # fmt: off
    expected_amplitude = [
        np.array([
            0.02328461, 0.02390484, 0.01978448, 0.04115711, 0.0624309 ,
            0.05197181, 0.05896072, 0.08839577, 0.07726794, 0.06432579,
            0.11063128, 0.13762532, 0.10935163, 0.11911998, 0.15112405,
            0.14588428, 0.18860507, 0.23992978, 0.15910825, 0.04793241,
            0.07462307, 0.10001811, 0.06125769, 0.05411011, 0.10342509,
            0.09549777, 0.05892122, 0.06534349, 0.06569936, 0.05870678,
            0.10856833, 0.1524107 , 0.11463385, 0.05766969, 0.12385171,
            0.14472842, 0.11978184, 0.10353675, 0.07244056, 0.03461861,
            0.02624896, 0.02227475, 0.01238363, 0.00885281, 0.0110049 ,
            0.00807005, 0.01033663, 0.01703181, 0.01445856, 0.00585615,
            0.0132431 , 0.02754132, 0.01524478, 0.0204908 , 0.07453328,
            0.10716327, 0.07195779, 0.08816078, 0.18340898, 0.16449876,
            0.12322842, 0.1621659 , 0.12334293, 0.06033659,
        ]),
        np.array([
            0.01778026, 0.00929138, 0.00692273, 0.00927352, 0.01261294,
            0.01237128, 0.00852516, 0.00171938, 0.00727061, 0.00716808,
            0.00909281, 0.01289532, 0.01469949, 0.01499858, 0.01332855,
            0.02296907, 0.01706539, 0.00773101, 0.01666623, 0.02311021,
            0.0413901, 0.07787261, 0.10634092, 0.09296556, 0.05218428,
            0.01813716, 0.00546139, 0.01470388, 0.02515159, 0.0192187,
            0.01222719, 0.00744678, 0.01045674, 0.01923522, 0.01990819,
            0.01174323, 0.01535391, 0.02786647, 0.02904595, 0.0313408 ,
            0.0340503, 0.03118268, 0.02915136, 0.04200513, 0.05563153,
            0.05429446, 0.05021769, 0.05882667, 0.06668596, 0.06555867,
            0.04523559, 0.01489498, 0.01031892, 0.02134155, 0.01736669,
            0.0195216, 0.03971575, 0.03938636, 0.02052712, 0.03104931,
            0.0902727, 0.09022622, 0.03275532, 0.0172633,
        ]),
        np.array([
            0.04684551, 0.08238806, 0.05658358, 0.01653778, 0.06498249,
            0.09553589, 0.10281084, 0.09191031, 0.07000408, 0.06737158,
            0.06534155, 0.06675509, 0.09008541, 0.10184046, 0.09783596,
            0.0963737, 0.08520112, 0.05370093, 0.03453015, 0.03648568,
            0.06339967, 0.09340346, 0.09417402, 0.08623119, 0.07175977,
            0.04406138, 0.04796988, 0.05407591, 0.0471824 , 0.04022626,
            0.06438748, 0.0808218, 0.0745263, 0.06191467, 0.03116328,
            0.03206497, 0.05867718, 0.04424652, 0.04448404, 0.07032498,
            0.08300796, 0.07895744, 0.0816894, 0.09392357, 0.07571699,
            0.03967651, 0.07703795, 0.06464871, 0.08704693, 0.14085226,
            0.1350321, 0.18794712, 0.27043005, 0.26596246, 0.19948336,
            0.06545141, 0.13204652, 0.08554521, 0.2262849, 0.33900721,
            0.3970475, 0.3482436, 0.17134947, 0.46249565,
        ]),
    ]
    # fmt: on
    check_frame_321(amplitude_specs, slice(64, 128), expected_amplitude)

    # --- power=2.0: float64 output, rows 64..127 compared ---
    power_specs = compute(2.0)
    check_layout(power_specs, np.float64)

    # fmt: off
    expected_power = [
        np.array([
            5.42173162e-04, 5.71441371e-04, 3.91425507e-04, 1.69390778e-03,
            3.89761780e-03, 2.70106923e-03, 3.47636663e-03, 7.81381316e-03,
            5.97033510e-03, 4.13780799e-03, 1.22392802e-02, 1.89407300e-02,
            1.19577805e-02, 1.41895693e-02, 2.28384770e-02, 2.12822221e-02,
            3.55718732e-02, 5.75663000e-02, 2.53154356e-02, 2.29751552e-03,
            5.56860259e-03, 1.00036217e-02, 3.75250424e-03, 2.92790355e-03,
            1.06967501e-02, 9.11982451e-03, 3.47171025e-03, 4.26977174e-03,
            4.31640586e-03, 3.44648538e-03, 1.17870830e-02, 2.32290216e-02,
            1.31409196e-02, 3.32579296e-03, 1.53392460e-02, 2.09463164e-02,
            1.43476883e-02, 1.07198600e-02, 5.24763530e-03, 1.19844836e-03,
            6.89007982e-04, 4.96164430e-04, 1.53354369e-04, 7.83722571e-05,
            1.21107812e-04, 6.51257360e-05, 1.06845939e-04, 2.90082477e-04,
            2.09049831e-04, 3.42945241e-05, 1.75379610e-04, 7.58524227e-04,
            2.32403356e-04, 4.19872697e-04, 5.55520924e-03, 1.14839673e-02,
            5.17792348e-03, 7.77232368e-03, 3.36388536e-02, 2.70598419e-02,
            1.51852425e-02, 2.62977779e-02, 1.52134784e-02, 3.64050455e-03,
        ]),
        np.array([
            3.16137604e-04, 8.63297362e-05, 4.79241720e-05, 8.59982493e-05,
            1.59086326e-04, 1.53048476e-04, 7.26783945e-05, 2.95627100e-06,
            5.28617352e-05, 5.13813355e-05, 8.26792588e-05, 1.66289156e-04,
            2.16075069e-04, 2.24957314e-04, 1.77650211e-04, 5.27578282e-04,
            2.91227688e-04, 5.97685493e-05, 2.77763360e-04, 5.34081651e-04,
            1.71314057e-03, 6.06414277e-03, 1.13083916e-02, 8.64259617e-03,
            2.72319867e-03, 3.28956593e-04, 2.98268126e-05, 2.16204145e-04,
            6.32602626e-04, 3.69358508e-04, 1.49504171e-04, 5.54544917e-05,
            1.09343371e-04, 3.69993847e-04, 3.96335839e-04, 1.37903521e-04,
            2.35742483e-04, 7.76540114e-04, 8.43667068e-04, 9.82245923e-04,
            1.15942286e-03, 9.72359636e-04, 8.49801853e-04, 1.76443092e-03,
            3.09486753e-03, 2.94788822e-03, 2.52181630e-03, 3.46057723e-03,
            4.44701769e-03, 4.29793858e-03, 2.04625858e-03, 2.21860290e-04,
            1.06480179e-04, 4.55461892e-04, 3.01601836e-04, 3.81092892e-04,
            1.57734053e-03, 1.55128531e-03, 4.21362677e-04, 9.64059883e-04,
            8.14916019e-03, 8.14077014e-03, 1.07291131e-03, 2.98021545e-04,
        ]),
        np.array([
            0.0021945 , 0.00678779, 0.0032017 , 0.0002735 , 0.00422272,
            0.00912711, 0.01057007, 0.00844751, 0.00490057, 0.00453893,
            0.00426952, 0.00445624, 0.00811538, 0.01037148, 0.00957188,
            0.00928789, 0.00725923, 0.00288379, 0.00119233, 0.0013312 ,
            0.00401952, 0.00872421, 0.00886875, 0.00743582, 0.00514946,
            0.00194141, 0.00230111, 0.0029242 , 0.00222618, 0.00161815,
            0.00414575, 0.00653216, 0.00555417, 0.00383343, 0.00097115,
            0.00102816, 0.00344301, 0.00195775, 0.00197883, 0.0049456 ,
            0.00689032, 0.00623428, 0.00667316, 0.00882164, 0.00573306,
            0.00157423, 0.00593485, 0.00417946, 0.00757717, 0.01983936,
            0.01823367, 0.03532412, 0.07313241, 0.07073603, 0.03979361,
            0.00428389, 0.01743628, 0.00731798, 0.05120486, 0.11492589,
            0.15764671, 0.1212736 , 0.02936064, 0.21390222
        ]),
    ]
    # fmt: on
    check_frame_321(power_specs, slice(64, 128), expected_power)
def test_power_to_db(self): def test_power_to_db(self):
spectrogram = np.zeros((2, 3)) spectrogram = np.zeros((2, 3))
spectrogram[0, 0] = 2.0 spectrogram[0, 0] = 2.0
...@@ -726,6 +1493,84 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -726,6 +1493,84 @@ class AudioUtilsFunctionTester(unittest.TestCase):
with pytest.raises(ValueError): with pytest.raises(ValueError):
power_to_db(spectrogram, db_range=-80) power_to_db(spectrogram, db_range=-80)
def test_power_to_db_batch(self):
    """Check `power_to_db_batch` against reference values and verify that invalid arguments raise."""
    # Batch of three (2, 3) spectrograms; items 1 and 2 only fill their first 2 and 1 columns,
    # so the batch mixes varying values and effective lengths.
    batch_spectrogram = np.zeros((3, 2, 3))
    batch_spectrogram[0] = [[2.0, 0.5, 0.707], [0.0, 1.0, 0.0]]
    batch_spectrogram[1, :, :2] = batch_spectrogram[0, :, :2] * 1.5
    batch_spectrogram[2, :, :1] = batch_spectrogram[0, :, :1] * 0.5

    # Expected values computed by applying `power_to_db` iteratively.
    # Each entry pairs the keyword arguments with the expected decibel output.
    cases = [
        (
            {"reference": 1.0},
            [
                [[3.01029996, -3.01029996, -1.50580586], [-100, 0, -100]],
                [[4.77121255, -1.24938737, -100], [-100, 1.76091259, -100]],
                [[0, -100, -100], [-100, -100, -100]],
            ],
        ),
        (
            {"reference": 2.0},
            [
                [[0, -6.02059991, -4.51610582], [-103.01029996, -3.01029996, -103.01029996]],
                [[1.76091259, -4.25968732, -103.01029996], [-103.01029996, -1.24938737, -103.01029996]],
                [[-3.01029996, -103.01029996, -103.01029996], [-103.01029996, -103.01029996, -103.01029996]],
            ],
        ),
        (
            {"min_value": 1e-6},
            [
                [[3.01029996, -3.01029996, -1.50580586], [-60, 0, -60]],
                [[4.77121255, -1.24938737, -60], [-60, 1.76091259, -60]],
                [[0, -60, -60], [-60, -60, -60]],
            ],
        ),
        (
            {"db_range": 80},
            [
                [[3.01029996, -3.01029996, -1.50580586], [-76.98970004, 0, -76.98970004]],
                [[4.77121255, -1.24938737, -75.22878745], [-75.22878745, 1.76091259, -75.22878745]],
                [[0, -80, -80], [-80, -80, -80]],
            ],
        ),
        (
            {"reference": 2.0, "db_range": 80},
            [
                [[0, -6.02059991, -4.51610582], [-80, -3.01029996, -80]],
                [[1.76091259, -4.25968732, -78.23908741], [-78.23908741, -1.24938737, -78.23908741]],
                [[-3.01029996, -83.01029996, -83.01029996], [-83.01029996, -83.01029996, -83.01029996]],
            ],
        ),
        (
            {"reference": 2.0, "min_value": 1e-6, "db_range": 80},
            [
                [[0, -6.02059991, -4.51610582], [-63.01029996, -3.01029996, -63.01029996]],
                [[1.76091259, -4.25968732, -63.01029996], [-63.01029996, -1.24938737, -63.01029996]],
                [[-3.01029996, -63.01029996, -63.01029996], [-63.01029996, -63.01029996, -63.01029996]],
            ],
        ),
    ]
    for kwargs, expected in cases:
        output = power_to_db_batch(batch_spectrogram, **kwargs)
        self.assertTrue(np.allclose(output, np.array(expected)))

    # A zero reference, a zero floor, or a negative dynamic range must be rejected.
    for invalid_kwargs in ({"reference": 0.0}, {"min_value": 0.0}, {"db_range": -80}):
        with pytest.raises(ValueError):
            power_to_db_batch(batch_spectrogram, **invalid_kwargs)
def test_amplitude_to_db(self): def test_amplitude_to_db(self):
spectrogram = np.zeros((2, 3)) spectrogram = np.zeros((2, 3))
spectrogram[0, 0] = 2.0 spectrogram[0, 0] = 2.0
...@@ -764,6 +1609,84 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -764,6 +1609,84 @@ class AudioUtilsFunctionTester(unittest.TestCase):
with pytest.raises(ValueError): with pytest.raises(ValueError):
amplitude_to_db(spectrogram, db_range=-80) amplitude_to_db(spectrogram, db_range=-80)
def test_amplitude_to_db_batch(self):
    """Check `amplitude_to_db_batch` against reference values and verify that invalid arguments raise."""
    # Batch of three (2, 3) spectrograms; items 1 and 2 only fill their first 2 and 1 columns,
    # so the batch mixes varying values and effective lengths.
    batch_spectrogram = np.zeros((3, 2, 3))
    batch_spectrogram[0] = [[2.0, 0.5, 0.707], [0.0, 1.0, 0.0]]
    batch_spectrogram[1, :, :2] = batch_spectrogram[0, :, :2] * 1.5
    batch_spectrogram[2, :, :1] = batch_spectrogram[0, :, :1] * 0.5

    # Expected values computed by applying `amplitude_to_db` iteratively.
    # Each entry pairs the keyword arguments with the expected decibel output.
    cases = [
        (
            {"reference": 1.0},
            [
                [[6.02059991, -6.02059991, -3.01161172], [-100, 0, -100]],
                [[9.54242509, -2.49877473, -100], [-100, 3.52182518, -100]],
                [[0, -100, -100], [-100, -100, -100]],
            ],
        ),
        (
            {"reference": 2.0},
            [
                [[0, -12.04119983, -9.03221164], [-106.02059991, -6.02059991, -106.02059991]],
                [[3.52182518, -8.51937465, -106.02059991], [-106.02059991, -2.49877473, -106.02059991]],
                [[-6.02059991, -106.02059991, -106.02059991], [-106.02059991, -106.02059991, -106.02059991]],
            ],
        ),
        (
            {"min_value": 1e-3},
            [
                [[6.02059991, -6.02059991, -3.01161172], [-60, 0, -60]],
                [[9.54242509, -2.49877473, -60], [-60, 3.52182518, -60]],
                [[0, -60, -60], [-60, -60, -60]],
            ],
        ),
        (
            {"db_range": 80},
            [
                [[6.02059991, -6.02059991, -3.01161172], [-73.97940009, 0, -73.97940009]],
                [[9.54242509, -2.49877473, -70.45757491], [-70.45757491, 3.52182518, -70.45757491]],
                [[0, -80, -80], [-80, -80, -80]],
            ],
        ),
        (
            {"reference": 2.0, "db_range": 80},
            [
                [[0, -12.04119983, -9.03221164], [-80, -6.02059991, -80]],
                [[3.52182518, -8.51937465, -76.47817482], [-76.47817482, -2.49877473, -76.47817482]],
                [[-6.02059991, -86.02059991, -86.02059991], [-86.02059991, -86.02059991, -86.02059991]],
            ],
        ),
        (
            {"reference": 2.0, "min_value": 1e-3, "db_range": 80},
            [
                [[0, -12.04119983, -9.03221164], [-66.02059991, -6.02059991, -66.02059991]],
                [[3.52182518, -8.51937465, -66.02059991], [-66.02059991, -2.49877473, -66.02059991]],
                [[-6.02059991, -66.02059991, -66.02059991], [-66.02059991, -66.02059991, -66.02059991]],
            ],
        ),
    ]
    for kwargs, expected in cases:
        output = amplitude_to_db_batch(batch_spectrogram, **kwargs)
        self.assertTrue(np.allclose(output, np.array(expected)))

    # A zero reference, a zero floor, or a negative dynamic range must be rejected.
    for invalid_kwargs in ({"reference": 0.0}, {"min_value": 0.0}, {"db_range": -80}):
        with pytest.raises(ValueError):
            amplitude_to_db_batch(batch_spectrogram, **invalid_kwargs)
@require_librosa @require_librosa
def test_chroma_equivalence(self): def test_chroma_equivalence(self):
num_frequency_bins = 25 num_frequency_bins = 25
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment