Commit 105b77fe authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Add more explanation about `n_fft` (#3442)

Summary: Pull Request resolved: https://github.com/pytorch/audio/pull/3442

Differential Revision: D46797481

Pulled By: mthrok

fbshipit-source-id: 3513037cbb8f2edb70fdab0fec5c7c554a697abe
parent 70968293
...@@ -41,6 +41,7 @@ import matplotlib.pyplot as plt ...@@ -41,6 +41,7 @@ import matplotlib.pyplot as plt
# !pip install librosa # !pip install librosa
# #
from IPython.display import Audio from IPython.display import Audio
from matplotlib.patches import Rectangle
from torchaudio.utils import download_asset from torchaudio.utils import download_asset
torch.random.manual_seed(0) torch.random.manual_seed(0)
...@@ -48,26 +49,28 @@ torch.random.manual_seed(0) ...@@ -48,26 +49,28 @@ torch.random.manual_seed(0)
# Speech sample used throughout the tutorial (downloaded once and cached).
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
def plot_waveform(waveform, sr, title="Waveform", ax=None):
    """Plot the first channel of a waveform against time (in seconds).

    Args:
        waveform (Tensor): audio tensor of shape ``(num_channels, num_frames)``.
        sr (int): sample rate, used to convert frame indices to seconds.
        title (str): title placed on the Axes.
        ax (matplotlib.axes.Axes or None): target Axes; a new figure is
            created when not given.
    """
    waveform = waveform.numpy()

    # Unpacking also validates that the input is 2-D (channels, frames).
    num_channels, num_frames = waveform.shape
    time_axis = torch.arange(0, num_frames) / sr

    if ax is None:
        # Only channel 0 is plotted, so create a single Axes.
        # (``plt.subplots(num_channels, 1)`` would return an *array* of Axes
        # for multi-channel input and break the calls below.)
        _, ax = plt.subplots(1, 1)
    ax.plot(time_axis, waveform[0], linewidth=1)
    ax.grid(True)
    ax.set_xlim([0, time_axis[-1]])
    ax.set_title(title)
    plt.show(block=False)
def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
    """Plot a power spectrogram on a decibel scale.

    Args:
        specgram (Tensor): power spectrogram of shape ``(freq, time)``.
        title (str or None): optional title for the Axes.
        ylabel (str): label for the frequency axis.
        ax (matplotlib.axes.Axes or None): target Axes; a new figure is
            created when not given.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    if title is not None:
        ax.set_title(title)
    ax.set_ylabel(ylabel)
    # Convert power to dB for display; "nearest" keeps frequency bins crisp
    # instead of smoothing them when the image is resized.
    ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest")
    plt.show(block=False)
...@@ -102,77 +105,155 @@ def plot_fbank(fbank, title=None): ...@@ -102,77 +105,155 @@ def plot_fbank(fbank, title=None):
# you can use :py:func:`torchaudio.transforms.Spectrogram`. # you can use :py:func:`torchaudio.transforms.Spectrogram`.
# #
# Load audio
SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH)

# Define transform
spectrogram = T.Spectrogram(n_fft=512)

# Perform transform
spec = spectrogram(SPEECH_WAVEFORM)

######################################################################
#
fig, axs = plt.subplots(2, 1)
plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform", ax=axs[0])
plot_spectrogram(spec[0], title="spectrogram", ax=axs[1])
fig.tight_layout()

######################################################################
#
Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)
######################################################################
# The effect of ``n_fft`` parameter
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The core of spectrogram computation is the (short-time) Fourier transform,
# and the ``n_fft`` parameter corresponds to the :math:`N` in the following
# definition of the discrete Fourier transform.
#
# $$ X_k = \\sum_{n=0}^{N-1} x_n e^{-\\frac{2\\pi i}{N} nk} $$
#
# (For details of the Fourier transform, please refer to
# `Wikipedia <https://en.wikipedia.org/wiki/Fast_Fourier_transform>`__.)
#
# The value of ``n_fft`` determines the resolution of the frequency axis.
# However, with a higher ``n_fft`` value, the energy will be distributed
# among more bins, so when you visualize it, it might look more blurry,
# even though it has higher resolution.
#
# The following illustrates this.
#

######################################################################
#
# .. note::
#
#    ``hop_length`` determines the time axis resolution.
#    By default (i.e. ``hop_length=None`` and ``win_length=None``),
#    the value of ``n_fft // 4`` is used.
#    Here we use the same ``hop_length`` value across different ``n_fft``
#    values so that the visualizations stay aligned along the time axis.

n_ffts = [32, 128, 512, 2048]
hop_length = 64

specs = []
for n_fft in n_ffts:
    spectrogram = T.Spectrogram(n_fft=n_fft, hop_length=hop_length)
    spec = spectrogram(SPEECH_WAVEFORM)
    specs.append(spec)
######################################################################
#
fig, axs = plt.subplots(len(specs), 1, sharex=True)
for i, (spec, n_fft) in enumerate(zip(specs, n_ffts)):
    plot_spectrogram(spec[0], ylabel=f"n_fft={n_fft}", ax=axs[i])
    # Only the bottom subplot keeps its x label (axes share x).
    axs[i].set_xlabel(None)
fig.tight_layout()
######################################################################
#
# When comparing signals, it is desirable to use the same sampling rate;
# however, if you must use different sampling rates, care must be
# taken when interpreting the meaning of ``n_fft``.
# ``n_fft`` determines the resolution of the frequency axis, but what
# each frequency bin represents is subject to the sampling rate.
#
# As we have seen above, changing the value of ``n_fft`` does not change
# the coverage of the frequency range.

######################################################################
#
# Let's downsample the audio and apply a spectrogram with the same ``n_fft``
# value.

# Downsample to half of the original sample rate
speech2 = torchaudio.functional.resample(SPEECH_WAVEFORM, SAMPLE_RATE, SAMPLE_RATE // 2)
# Upsample to the original sample rate
speech3 = torchaudio.functional.resample(speech2, SAMPLE_RATE // 2, SAMPLE_RATE)

######################################################################
#
# Apply the same spectrogram
spectrogram = T.Spectrogram(n_fft=512)

spec0 = spectrogram(SPEECH_WAVEFORM)
spec2 = spectrogram(speech2)
spec3 = spectrogram(speech3)
######################################################################
#
# Visualize it
fig, axs = plt.subplots(3, 1)
plot_spectrogram(spec0[0], ylabel="Original", ax=axs[0])
# Highlight the region of the original spectrogram that corresponds to the
# frequency range covered by the downsampled signal.
axs[0].add_patch(Rectangle((0, 3), 212, 128, edgecolor="r", facecolor="none"))
plot_spectrogram(spec2[0], ylabel="Downsampled", ax=axs[1])
plot_spectrogram(spec3[0], ylabel="Upsampled", ax=axs[2])
fig.tight_layout()

######################################################################
#
# In the above visualization, the second plot ("Downsampled") might
# give the impression that the spectrogram is stretched.
# This is because the meaning of the frequency bins is different from
# the original one.
# Even though they have the same number of bins, in the second plot
# the frequency axis only covers up to half of the original sampling
# rate.
# This becomes clearer if we resample the downsampled signal again
# so that it has the same sample rate as the original.
######################################################################
# GriffinLim
# ----------
#
# To recover a waveform from a spectrogram, you can use
# :py:class:`torchaudio.transforms.GriffinLim`.
#
# The same set of parameters used for the spectrogram must be used.

# Define transforms
n_fft = 1024
spectrogram = T.Spectrogram(n_fft=n_fft)
griffin_lim = T.GriffinLim(n_fft=n_fft)

# Apply the transforms
spec = spectrogram(SPEECH_WAVEFORM)
reconstructed_waveform = griffin_lim(spec)

######################################################################
#
_, axes = plt.subplots(2, 1, sharex=True, sharey=True)
plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original", ax=axes[0])
plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed", ax=axes[1])
Audio(reconstructed_waveform, rate=SAMPLE_RATE)
###################################################################### ######################################################################
...@@ -254,7 +335,6 @@ mel_spectrogram = T.MelSpectrogram( ...@@ -254,7 +335,6 @@ mel_spectrogram = T.MelSpectrogram(
pad_mode="reflect", pad_mode="reflect",
power=2.0, power=2.0,
norm="slaney", norm="slaney",
onesided=True,
n_mels=n_mels, n_mels=n_mels,
mel_scale="htk", mel_scale="htk",
) )
...@@ -323,7 +403,7 @@ mfcc = mfcc_transform(SPEECH_WAVEFORM) ...@@ -323,7 +403,7 @@ mfcc = mfcc_transform(SPEECH_WAVEFORM)
###################################################################### ######################################################################
# #
plot_spectrogram(mfcc[0], title="MFCC")
###################################################################### ######################################################################
# Comparison against librosa # Comparison against librosa
...@@ -351,7 +431,7 @@ mfcc_librosa = librosa.feature.mfcc( ...@@ -351,7 +431,7 @@ mfcc_librosa = librosa.feature.mfcc(
###################################################################### ######################################################################
# #
plot_spectrogram(mfcc_librosa, title="MFCC (librosa)")

# Quantify the difference between torchaudio and librosa MFCCs.
mse = torch.square(mfcc - mfcc_librosa).mean().item()
print("Mean Square Difference: ", mse)
...@@ -377,7 +457,7 @@ lfcc_transform = T.LFCC( ...@@ -377,7 +457,7 @@ lfcc_transform = T.LFCC(
) )
lfcc = lfcc_transform(SPEECH_WAVEFORM)
plot_spectrogram(lfcc[0], title="LFCC")
###################################################################### ######################################################################
# Pitch # Pitch
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment