Commit 8f187354 authored by moto, committed by Facebook GitHub Bot
Browse files

Update resampling tutorial (#2773)

Summary:
* Refactor benchmark script
* Rename the `time` variable to avoid a (potential) conflict with the `time` module
* Fix `beta` parameter in benchmark (it was not used previously)
* Use `timeit` module for benchmark
* Add plot
* Move the comments on the results to the end
* Add link to an explanation of aliasing

https://output.circle-artifacts.com/output/job/20b57d2f-3614-4161-a18e-e0c1a537739c/artifacts/0/docs/tutorials/audio_resampling_tutorial.html

Pull Request resolved: https://github.com/pytorch/audio/pull/2773

Reviewed By: carolineechen

Differential Revision: D40421337

Pulled By: mthrok

fbshipit-source-id: b402f84d4517695daeca75fb84ad876ef9354b3a
parent 5239583e
...@@ -22,19 +22,14 @@ print(torchaudio.__version__) ...@@ -22,19 +22,14 @@ print(torchaudio.__version__)
# #
# First, we import the modules and define the helper functions. # First, we import the modules and define the helper functions.
# #
# .. note::
# When running this tutorial in Google Colab, install the required packages
# with the following.
#
# .. code::
#
# !pip install librosa
import math import math
import time import timeit
import librosa import librosa
import resampy
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import pandas as pd import pandas as pd
from IPython.display import Audio, display from IPython.display import Audio, display
...@@ -63,18 +58,18 @@ def _get_inverse_log_freq(freq, sample_rate, offset): ...@@ -63,18 +58,18 @@ def _get_inverse_log_freq(freq, sample_rate, offset):
def _get_freq_ticks(sample_rate, offset, f_max): def _get_freq_ticks(sample_rate, offset, f_max):
# Given the original sample rate used for generating the sweep, # Given the original sample rate used for generating the sweep,
# find the x-axis value where the log-scale major frequency values fall in # find the x-axis value where the log-scale major frequency values fall in
time, freq = [], [] times, freq = [], []
for exp in range(2, 5): for exp in range(2, 5):
for v in range(1, 10): for v in range(1, 10):
f = v * 10**exp f = v * 10**exp
if f < sample_rate // 2: if f < sample_rate // 2:
t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate
time.append(t) times.append(t)
freq.append(f) freq.append(f)
t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate
time.append(t_max) times.append(t_max)
freq.append(f_max) freq.append(f_max)
return time, freq return times, freq
def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET): def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET):
...@@ -118,7 +113,7 @@ def plot_sweep( ...@@ -118,7 +113,7 @@ def plot_sweep(
# ------------------- # -------------------
# #
# To resample an audio waveform from one frequency to another, you can use # To resample an audio waveform from one frequency to another, you can use
# :py:func:`torchaudio.transforms.Resample` or # :py:class:`torchaudio.transforms.Resample` or
# :py:func:`torchaudio.functional.resample`. # :py:func:`torchaudio.functional.resample`.
# ``transforms.Resample`` precomputes and caches the kernel used for resampling, # ``transforms.Resample`` precomputes and caches the kernel used for resampling,
# while ``functional.resample`` computes it on the fly, so using # while ``functional.resample`` computes it on the fly, so using
...@@ -163,6 +158,9 @@ Audio(waveform.numpy()[0], rate=sample_rate) ...@@ -163,6 +158,9 @@ Audio(waveform.numpy()[0], rate=sample_rate)
# #
# We see that in the spectrogram of the resampled waveform, there is an # We see that in the spectrogram of the resampled waveform, there is an
# artifact, which was not present in the original waveform. # artifact, which was not present in the original waveform.
# This effect is called aliasing.
# `This page <https://music.arts.uci.edu/dobrian/digitalaudio.htm>`__ has
# an explanation of how it happens, and why it looks like a reflection.
resample_rate = 32000 resample_rate = 32000
resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype) resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
...@@ -332,20 +330,15 @@ print("torchaudio and librosa kaiser fast MSE:", mse) ...@@ -332,20 +330,15 @@ print("torchaudio and librosa kaiser fast MSE:", mse)
# ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters # ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters
# in ``torchaudio``. # in ``torchaudio``.
# #
# To elaborate on the results:
#
# - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
# and therefore increases computation time for both the kernel computation
# and convolution
# - using ``kaiser_window`` results in longer computation times than the default
# ``sinc_interpolation`` because it is more complex to compute the intermediate
# window values - a large GCD between the sample and resample rate will result
# in a simplification that allows for a smaller kernel and faster kernel computation.
#
print(f"torchaudio: {torchaudio.__version__}")
print(f"librosa: {librosa.__version__}")
print(f"resampy: {resampy.__version__}")
def benchmark_resample( ######################################################################
method, #
def benchmark_resample_functional(
waveform, waveform,
sample_rate, sample_rate,
resample_rate, resample_rate,
...@@ -353,126 +346,194 @@ def benchmark_resample( ...@@ -353,126 +346,194 @@ def benchmark_resample(
rolloff=0.99, rolloff=0.99,
resampling_method="sinc_interpolation", resampling_method="sinc_interpolation",
beta=None, beta=None,
librosa_type=None,
iters=5, iters=5,
): ):
if method == "functional": return timeit.timeit(
begin = time.monotonic() stmt='''
for _ in range(iters): torchaudio.functional.resample(
F.resample(
waveform, waveform,
sample_rate, sample_rate,
resample_rate, resample_rate,
lowpass_filter_width=lowpass_filter_width, lowpass_filter_width=lowpass_filter_width,
rolloff=rolloff, rolloff=rolloff,
resampling_method=resampling_method, resampling_method=resampling_method,
) beta=beta,
elapsed = time.monotonic() - begin )
return elapsed / iters ''',
elif method == "transforms": setup='import torchaudio',
resampler = T.Resample( number=iters,
globals=locals(),
) * 1000 / iters
######################################################################
#
def benchmark_resample_transforms(
waveform,
sample_rate,
resample_rate,
lowpass_filter_width=6,
rolloff=0.99,
resampling_method="sinc_interpolation",
beta=None,
iters=5,
):
return timeit.timeit(
stmt='resampler(waveform)',
setup='''
import torchaudio
resampler = torchaudio.transforms.Resample(
sample_rate, sample_rate,
resample_rate, resample_rate,
lowpass_filter_width=lowpass_filter_width, lowpass_filter_width=lowpass_filter_width,
rolloff=rolloff, rolloff=rolloff,
resampling_method=resampling_method, resampling_method=resampling_method,
dtype=waveform.dtype, dtype=waveform.dtype,
) beta=beta,
begin = time.monotonic() )
for _ in range(iters): resampler.to(waveform.device)
resampler(waveform) ''',
elapsed = time.monotonic() - begin number=iters,
return elapsed / iters globals=locals(),
elif method == "librosa": ) * 1000 / iters
waveform_np = waveform.squeeze().numpy()
begin = time.monotonic()
for _ in range(iters):
librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type=librosa_type)
elapsed = time.monotonic() - begin
return elapsed / iters
###################################################################### ######################################################################
# #
configs = { def benchmark_resample_librosa(
"downsample (48 -> 44.1 kHz)": [48000, 44100], waveform,
"downsample (16 -> 8 kHz)": [16000, 8000], sample_rate,
"upsample (44.1 -> 48 kHz)": [44100, 48000], resample_rate,
"upsample (8 -> 16 kHz)": [8000, 16000], res_type=None,
} iters=5,
):
waveform_np = waveform.squeeze().numpy()
return timeit.timeit(
stmt='''
librosa.resample(
waveform_np,
orig_sr=sample_rate,
target_sr=resample_rate,
res_type=res_type,
)
''',
setup='import librosa',
number=iters,
globals=locals(),
) * 1000 / iters
for label in configs:
######################################################################
#
def benchmark(sample_rate, resample_rate):
times, rows = [], [] times, rows = [], []
sample_rate = configs[label][0] waveform = get_sine_sweep(sample_rate).to(torch.float32)
resample_rate = configs[label][1]
waveform = get_sine_sweep(sample_rate) args = (waveform, sample_rate, resample_rate)
# sinc 64 zero-crossings # sinc 64 zero-crossings
f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=64) f_time = benchmark_resample_functional(*args, lowpass_filter_width=64)
t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=64) t_time = benchmark_resample_transforms(*args, lowpass_filter_width=64)
times.append([None, 1000 * f_time, 1000 * t_time]) times.append([None, f_time, t_time])
rows.append("sinc (width 64)") rows.append("sinc (width 64)")
# sinc 16 zero-crossings # sinc 16 zero-crossings
f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=16) f_time = benchmark_resample_functional(*args, lowpass_filter_width=16)
t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=16) t_time = benchmark_resample_transforms(*args, lowpass_filter_width=16)
times.append([None, 1000 * f_time, 1000 * t_time]) times.append([None, f_time, t_time])
rows.append("sinc (width 16)") rows.append("sinc (width 16)")
# kaiser best # kaiser best
lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_best") kwargs = {
f_time = benchmark_resample( "lowpass_filter_width": 64,
"functional", "rolloff": 0.9475937167399596,
waveform, "resampling_method": "kaiser_window",
sample_rate, "beta": 14.769656459379492,
resample_rate, }
lowpass_filter_width=64, lib_time = benchmark_resample_librosa(*args, res_type="kaiser_best")
rolloff=0.9475937167399596, f_time = benchmark_resample_functional(*args, **kwargs)
resampling_method="kaiser_window", t_time = benchmark_resample_transforms(*args, **kwargs)
beta=14.769656459379492, times.append([lib_time, f_time, t_time])
)
t_time = benchmark_resample(
"transforms",
waveform,
sample_rate,
resample_rate,
lowpass_filter_width=64,
rolloff=0.9475937167399596,
resampling_method="kaiser_window",
beta=14.769656459379492,
)
times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
rows.append("kaiser_best") rows.append("kaiser_best")
# kaiser fast # kaiser fast
lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_fast") kwargs = {
f_time = benchmark_resample( "lowpass_filter_width": 16,
"functional", "rolloff": 0.85,
waveform, "resampling_method": "kaiser_window",
sample_rate, "beta": 8.555504641634386,
resample_rate, }
lowpass_filter_width=16, lib_time = benchmark_resample_librosa(*args, res_type="kaiser_fast")
rolloff=0.85, f_time = benchmark_resample_functional(*args, **kwargs)
resampling_method="kaiser_window", t_time = benchmark_resample_transforms(*args, **kwargs)
beta=8.555504641634386, times.append([lib_time, f_time, t_time])
)
t_time = benchmark_resample(
"transforms",
waveform,
sample_rate,
resample_rate,
lowpass_filter_width=16,
rolloff=0.85,
resampling_method="kaiser_window",
beta=8.555504641634386,
)
times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
rows.append("kaiser_fast") rows.append("kaiser_fast")
df = pd.DataFrame(times, columns=["librosa", "functional", "transforms"], index=rows) df = pd.DataFrame(times, columns=["librosa", "functional", "transforms"], index=rows)
df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"], df.columns]) return df
######################################################################
#
def plot(df):
print(df.round(2))
ax = df.plot(kind="bar")
plt.ylabel("Time Elapsed [ms]")
plt.xticks(rotation = 0, fontsize=10)
for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS):
label = ["N/A" if v != v else str(v) for v in df[col].round(2)]
ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small")
######################################################################
#
# Downsample (48 -> 44.1 kHz)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
df = benchmark(48_000, 44_100)
plot(df)
######################################################################
#
# Downsample (16 -> 8 kHz)
# ~~~~~~~~~~~~~~~~~~~~~~~~
df = benchmark(16_000, 8_000)
plot(df)
######################################################################
#
# Upsample (44.1 -> 48 kHz)
# ~~~~~~~~~~~~~~~~~~~~~~~~~
df = benchmark(44_100, 48_000)
plot(df)
######################################################################
#
# Upsample (8 -> 16 kHz)
# ~~~~~~~~~~~~~~~~~~~~~~
df = benchmark(8_000, 16_000)
plot(df)
print(f"torchaudio: {torchaudio.__version__}") ######################################################################
print(f"librosa: {librosa.__version__}") #
display(df.round(2)) # Summary
# ~~~~~~~
#
# To elaborate on the results:
#
# - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
# and therefore increases computation time for both the kernel computation
# and convolution
# - using ``kaiser_window`` results in longer computation times than the default
# ``sinc_interpolation`` because it is more complex to compute the intermediate
# window values
# - a large GCD between the sample and resample rate will result
# in a simplification that allows for a smaller kernel and faster kernel computation.
#
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment