"docs/vscode:/vscode.git/clone" did not exist on "1b456bd5d52faa0896d9ebceed38e1d4f298a56f"
Commit b9ef69d1 authored by hwangjeff, committed by Facebook GitHub Bot

Update data augmentation tutorial to use new operators (#3062)

Summary:
Updates tutorial "Audio Data Augmentation" to use two of the newly introduced data augmentation operators in beta: `torchaudio.functional.fftconvolve` and `torchaudio.functional.add_noise`.
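As a quick illustration (an editorial sketch using dummy tensors, not part of the commit or the tutorial), the two beta operators are called roughly like this:

import torch
import torchaudio.functional as F

# Stand-ins for the tutorial's loaded waveforms (shape: channels x time).
speech = torch.randn(1, 16000)
rir = torch.randn(1, 1600)
noise = torch.randn(1, 16000)

# Reverberation: convolve speech with the room impulse response.
reverberant = F.fftconvolve(speech, rir)     # shape (1, 16000 + 1600 - 1)

# Background noise: mix noise into speech at several SNRs (in dB) at once.
snr_dbs = torch.tensor([20.0, 10.0, 3.0])
noisy = F.add_noise(speech, noise, snr_dbs)  # shape (3, 16000), one row per SNR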

Pull Request resolved: https://github.com/pytorch/audio/pull/3062

Reviewed By: mthrok

Differential Revision: D43298120

Pulled By: hwangjeff

fbshipit-source-id: 09ca736a5c67242568515d600b7d31eab32c2df1
parent 12e8cb97
@@ -185,24 +185,22 @@ plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
 Audio(rir_raw, rate=sample_rate)
 ######################################################################
-# First, we need to clean up the RIR. We extract the main impulse, normalize
-# the signal power, then flip along the time axis.
+# First, we need to clean up the RIR. We extract the main impulse and normalize
+# it by its power.
 #
 rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)]
 rir = rir / torch.norm(rir, p=2)
-RIR = torch.flip(rir, [1])
 plot_waveform(rir, sample_rate, title="Room Impulse Response")
 ######################################################################
-# Then, we convolve the speech signal with the RIR filter.
+# Then, using :py:func:`torchaudio.functional.fftconvolve`,
+# we convolve the speech signal with the RIR.
 #
 speech, _ = torchaudio.load(SAMPLE_SPEECH)
-speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0))
-augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
+augmented = F.fftconvolve(speech, rir)
 ######################################################################
 # Original:
@@ -227,29 +225,27 @@ Audio(augmented, rate=sample_rate)
 # Adding background noise
 # -----------------------
 #
-# To add background noise to audio data, you can simply add a noise Tensor to
-# the Tensor representing the audio data. A common method to adjust the
-# intensity of noise is changing the Signal-to-Noise Ratio (SNR).
-# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__]
+# To introduce background noise to audio data, we can add a noise Tensor to
+# the Tensor representing the audio data according to some desired
+# signal-to-noise ratio (SNR)
+# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__],
+# which determines the intensity of the audio data relative to that of the noise
+# in the output.
 #
 # $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
 #
 # $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
 #
+# To add noise to audio data per SNRs, we
+# use :py:func:`torchaudio.functional.add_noise`.
 speech, _ = torchaudio.load(SAMPLE_SPEECH)
 noise, _ = torchaudio.load(SAMPLE_NOISE)
 noise = noise[:, : speech.shape[1]]
-speech_rms = speech.norm(p=2)
-noise_rms = noise.norm(p=2)
-snr_dbs = [20, 10, 3]
-noisy_speeches = []
-for snr_db in snr_dbs:
-    snr = 10 ** (snr_db / 20)
-    scale = snr * noise_rms / speech_rms
-    noisy_speeches.append((scale * speech + noise) / 2)
+snr_dbs = torch.tensor([20, 10, 3])
+noisy_speeches = F.add_noise(speech, noise, snr_dbs)
 ######################################################################
 # Background noise:
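For intuition, here is a rough manual equivalent of F.add_noise for a single SNR value, derived from the SNR formulas above (an editorial sketch, not torchaudio's exact implementation; add_noise_manual is a hypothetical name):

import torch

def add_noise_manual(speech: torch.Tensor, noise: torch.Tensor, snr_db: float) -> torch.Tensor:
    # Target power ratio implied by the SNR in dB: SNR = 10 ** (SNR_dB / 10).
    snr = 10 ** (snr_db / 10)
    # Scale the noise so that P_speech / P_scaled_noise equals the target SNR.
    scale = speech.norm(p=2) / (noise.norm(p=2) * snr ** 0.5)
    return speech + scale * noise

# e.g. noisy = add_noise_manual(speech, noise, 10.0)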
@@ -265,7 +261,7 @@ Audio(noise, rate=sample_rate)
 # ~~~~~~~~~~
 #
-snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0]
+snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
@@ -275,7 +271,7 @@ Audio(noisy_speech, rate=sample_rate)
 # ~~~~~~~~~~
 #
-snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1]
+snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
@@ -285,7 +281,7 @@ Audio(noisy_speech, rate=sample_rate)
 # ~~~~~~~~~
 #
-snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2]
+snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
@@ -365,8 +361,7 @@ original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
 plot_specgram(original_speech, sample_rate, title="Original")
 # Apply RIR
-speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0))
-rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
+rir_applied = F.fftconvolve(speech, rir)
 plot_specgram(rir_applied, sample_rate, title="RIR Applied")
@@ -377,9 +372,8 @@ plot_specgram(rir_applied, sample_rate, title="RIR Applied")
 noise, _ = torchaudio.load(SAMPLE_NOISE)
 noise = noise[:, : rir_applied.shape[1]]
-snr_db = 8
-scale = (10 ** (snr_db / 20)) * noise.norm(p=2) / rir_applied.norm(p=2)
-bg_added = (scale * rir_applied + noise) / 2
+snr_db = torch.tensor([8])
+bg_added = F.add_noise(rir_applied, noise, snr_db)
 plot_specgram(bg_added, sample_rate, title="BG noise added")