"sgl-kernel/vscode:/vscode.git/clone" did not exist on "39fd1788311cbd08b009f77a233bdad2f795b6d1"
Commit 41082eb0 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Update audio data augmentation tutorial (#2388)

Summary:
- Adopt `torchaudio.utils.download_asset` to simplify asset management.
- Break down the first section about helper functions.
- Reduce the number of helper functions

https://output.circle-artifacts.com/output/job/d7dd1b93-6dfe-46da-a080-109bfdc63881/artifacts/0/docs/tutorials/audio_data_augmentation_tutorial.html

Pull Request resolved: https://github.com/pytorch/audio/pull/2388

Reviewed By: carolineechen

Differential Revision: D36404405

Pulled By: mthrok

fbshipit-source-id: f460ed810519797fce6e2fa7baaee110bddd1d06
parent fd2be89a
......@@ -4,6 +4,11 @@ Audio Data Augmentation
=======================
``torchaudio`` provides a variety of ways to augment audio data.
In this tutorial, we look into a way to apply effects, filters,
RIR (room impulse response) and codecs.
At the end, we synthesize noisy speech over phone from clean speech.
"""
import torch
......@@ -14,164 +19,23 @@ print(torch.__version__)
print(torchaudio.__version__)
######################################################################
# Preparing data and utility functions (skip this section)
# --------------------------------------------------------
# Preparation
# -----------
#
# First, we import the modules and download the audio assets we use in this tutorial.
#
# @title Prepare data and utility functions. {display-mode: "form"}
# @markdown
# @markdown You do not need to look into this cell.
# @markdown Just execute once and you are good to go.
# @markdown
# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/),
# @markdown which is licensed under Creative Commons BY 4.0.
# -------------------------------------------------------------------------------
# Preparation of data and helper functions.
# -------------------------------------------------------------------------------
import math
import os
from IPython.display import Audio
import matplotlib.pyplot as plt
import requests
from IPython.display import Audio, display
_SAMPLE_DIR = "_assets"
SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav"
SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav")
SAMPLE_RIR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav" # noqa: E501
SAMPLE_RIR_PATH = os.path.join(_SAMPLE_DIR, "rir.wav")
SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501
SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav")
SAMPLE_NOISE_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/distractors/rm1/babb/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav" # noqa: E501
SAMPLE_NOISE_PATH = os.path.join(_SAMPLE_DIR, "bg.wav")
os.makedirs(_SAMPLE_DIR, exist_ok=True)
def _fetch_data():
    """Download the sample audio assets into the local asset directory.

    Fetches the steam-whistle, RIR, speech, and background-noise WAV files
    over HTTP and writes each one to its pre-configured path under
    ``_SAMPLE_DIR``.

    Raises:
        requests.HTTPError: if any download responds with an error status.
    """
    uri = [
        (SAMPLE_WAV_URL, SAMPLE_WAV_PATH),
        (SAMPLE_RIR_URL, SAMPLE_RIR_PATH),
        (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH),
        (SAMPLE_NOISE_URL, SAMPLE_NOISE_PATH),
    ]
    for url, path in uri:
        response = requests.get(url)
        # Fail loudly on HTTP errors instead of silently saving an error
        # page body as a (corrupt) WAV file.
        response.raise_for_status()
        with open(path, "wb") as file_:
            file_.write(response.content)
def _get_sample(path, resample=None):
    """Load *path* as a mono waveform via sox effects.

    Args:
        path: path to the audio file to load.
        resample: optional target sample rate; when given, a low-pass
            filter at the new Nyquist frequency is applied before the
            rate change.

    Returns:
        Tuple of (waveform tensor, sample rate), as produced by
        ``torchaudio.sox_effects.apply_effects_file``.
    """
    # Always down-mix to the first channel.
    effect_chain = [["remix", "1"]]
    if resample:
        effect_chain.append(["lowpass", f"{resample // 2}"])
        effect_chain.append(["rate", f"{resample}"])
    return torchaudio.sox_effects.apply_effects_file(path, effects=effect_chain)
def get_sample(*, resample=None):
    """Return the steam-whistle sample as (waveform, sample_rate)."""
    # Delegate to the generic loader with the pre-configured path.
    return _get_sample(SAMPLE_WAV_PATH, resample=resample)
def get_speech_sample(*, resample=None):
    """Return the VOiCES speech sample as (waveform, sample_rate)."""
    # Delegate to the generic loader with the pre-configured path.
    return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample)
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None):
    """Plot amplitude over time, one subplot per channel.

    Args:
        waveform: 2-D tensor of shape (channels, frames).
        sample_rate: sample rate used to convert frame index to seconds.
        title: figure title.
        xlim: optional x-axis limits passed to ``set_xlim``.
        ylim: optional y-axis limits passed to ``set_ylim``.
    """
    data = waveform.numpy()
    n_channels, n_frames = data.shape
    times = torch.arange(0, n_frames) / sample_rate
    figure, axes = plt.subplots(n_channels, 1)
    if n_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single row.
        axes = [axes]
    for ch, ax in enumerate(axes):
        ax.plot(times, data[ch], linewidth=1)
        ax.grid(True)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {ch+1}")
        if xlim:
            ax.set_xlim(xlim)
        if ylim:
            ax.set_ylim(ylim)
    figure.suptitle(title)
    plt.show(block=False)
def print_stats(waveform, sample_rate=None, src=None):
    """Print shape, dtype, basic statistics, and the values of *waveform*.

    Args:
        waveform: tensor of audio samples.
        sample_rate: if given, printed alongside the stats.
        src: optional label identifying where the waveform came from.
    """
    if src:
        separator = "-" * 10
        print(separator)
        print("Source:", src)
        print(separator)
    if sample_rate:
        print("Sample Rate:", sample_rate)
    print("Shape:", tuple(waveform.shape))
    print("Dtype:", waveform.dtype)
    print(f" - Max: {waveform.max().item():6.3f}")
    print(f" - Min: {waveform.min().item():6.3f}")
    print(f" - Mean: {waveform.mean().item():6.3f}")
    print(f" - Std Dev: {waveform.std().item():6.3f}")
    print()
    print(waveform)
    print()
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    """Plot a spectrogram, one subplot per channel.

    Args:
        waveform: 2-D tensor of shape (channels, frames).
        sample_rate: passed to matplotlib's ``specgram`` as ``Fs``.
        title: figure title.
        xlim: optional x-axis limits passed to ``set_xlim``.
    """
    data = waveform.numpy()
    n_channels, _n_frames = data.shape
    figure, axes = plt.subplots(n_channels, 1)
    if n_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single row.
        axes = [axes]
    for ch, ax in enumerate(axes):
        ax.specgram(data[ch], Fs=sample_rate)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {ch+1}")
        if xlim:
            ax.set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)
def play_audio(waveform, sample_rate):
    """Return an IPython ``Audio`` widget for a mono or stereo waveform.

    Args:
        waveform: 2-D tensor of shape (channels, frames); only 1 or 2
            channels are supported.
        sample_rate: playback sample rate.

    Raises:
        ValueError: if the waveform has more than two channels.
    """
    waveform = waveform.numpy()
    # NOTE: the original unpacked num_frames too, but never used it.
    num_channels = waveform.shape[0]
    if num_channels == 1:
        return Audio(waveform[0], rate=sample_rate)
    if num_channels == 2:
        return Audio((waveform[0], waveform[1]), rate=sample_rate)
    raise ValueError("Waveform with more than 2 channels are not supported.")
from torchaudio.utils import download_asset
def get_rir_sample(*, resample=None, processed=False):
    """Load the room-impulse-response sample.

    Args:
        resample: optional target sample rate for the loader.
        processed: when True, extract the main impulse (1.01 s - 1.3 s),
            normalize its power, and time-reverse it along the frame axis.

    Returns:
        Tuple of (RIR tensor, sample rate).
    """
    rir_raw, sample_rate = _get_sample(SAMPLE_RIR_PATH, resample=resample)
    if not processed:
        return rir_raw, sample_rate
    start = int(sample_rate * 1.01)
    end = int(sample_rate * 1.3)
    rir = rir_raw[:, start:end]
    rir = rir / torch.norm(rir, p=2)
    return torch.flip(rir, [1]), sample_rate
def get_noise_sample(*, resample=None):
    """Return the background-noise sample as (waveform, sample_rate)."""
    # Delegate to the generic loader with the pre-configured path.
    return _get_sample(SAMPLE_NOISE_PATH, resample=resample)
SAMPLE_WAV = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav")
SAMPLE_RIR = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav")
SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
######################################################################
......@@ -210,9 +74,8 @@ def get_noise_sample(*, resample=None):
# **Note** This process is not differentiable.
#
# Load the data
waveform1, sample_rate1 = get_sample(resample=16000)
waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV)
# Define effects
effects = [
......@@ -227,8 +90,8 @@ effects = [
# Apply effects
waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects)
print_stats(waveform1, sample_rate=sample_rate1, src="Original")
print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied")
print(waveform1.shape, sample_rate1)
print(waveform2.shape, sample_rate2)
######################################################################
# Note that the number of frames and number of channels are different from
......@@ -236,6 +99,45 @@ print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied")
# audio.
#
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
    """Plot amplitude over time, one subplot per channel.

    Args:
        waveform: 2-D tensor of shape (channels, frames).
        sample_rate: sample rate used to convert frame index to seconds.
        title: figure title.
        xlim: optional x-axis limits passed to ``set_xlim``.
    """
    data = waveform.numpy()
    n_channels, n_frames = data.shape
    times = torch.arange(0, n_frames) / sample_rate
    figure, axes = plt.subplots(n_channels, 1)
    if n_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single row.
        axes = [axes]
    for ch, ax in enumerate(axes):
        ax.plot(times, data[ch], linewidth=1)
        ax.grid(True)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {ch+1}")
        if xlim:
            ax.set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)
######################################################################
#
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    """Plot a spectrogram, one subplot per channel.

    Args:
        waveform: 2-D tensor of shape (channels, frames).
        sample_rate: passed to matplotlib's ``specgram`` as ``Fs``.
        title: figure title.
        xlim: optional x-axis limits passed to ``set_xlim``.
    """
    data = waveform.numpy()
    n_channels = data.shape[0]
    figure, axes = plt.subplots(n_channels, 1)
    if n_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single row.
        axes = [axes]
    for ch, ax in enumerate(axes):
        ax.specgram(data[ch], Fs=sample_rate)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {ch+1}")
        if xlim:
            ax.set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)
######################################################################
# Original:
# ~~~~~~~~~
......@@ -243,7 +145,7 @@ print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied")
plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2))
plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04))
play_audio(waveform1, sample_rate1)
Audio(waveform1, rate=sample_rate1)
######################################################################
# Effects applied:
......@@ -252,7 +154,7 @@ play_audio(waveform1, sample_rate1)
plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2))
plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04))
play_audio(waveform2, sample_rate2)
Audio(waveform2, rate=sample_rate2)
######################################################################
# Doesn’t it sound more dramatic?
......@@ -275,14 +177,10 @@ play_audio(waveform2, sample_rate2)
# and clap your hands.
#
sample_rate = 8000
rir_raw, _ = get_rir_sample(resample=sample_rate)
plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)", ylim=None)
rir_raw, sample_rate = torchaudio.load(SAMPLE_RIR)
plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)")
plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
play_audio(rir_raw, sample_rate)
Audio(rir_raw, rate=sample_rate)
######################################################################
# First, we need to clean up the RIR. We extract the main impulse, normalize
......@@ -291,37 +189,36 @@ play_audio(rir_raw, sample_rate)
rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)]
rir = rir / torch.norm(rir, p=2)
rir = torch.flip(rir, [1])
RIR = torch.flip(rir, [1])
print_stats(rir)
plot_waveform(rir, sample_rate, title="Room Impulse Response", ylim=None)
plot_waveform(rir, sample_rate, title="Room Impulse Response")
######################################################################
# Then, we convolve the speech signal with the RIR filter.
#
speech, _ = get_speech_sample(resample=sample_rate)
speech, _ = torchaudio.load(SAMPLE_SPEECH)
speech_ = torch.nn.functional.pad(speech, (rir.shape[1] - 1, 0))
augmented = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0]
speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0))
augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
######################################################################
# Original:
# ~~~~~~~~~
#
plot_waveform(speech, sample_rate, title="Original", ylim=None)
plot_waveform(speech, sample_rate, title="Original")
plot_specgram(speech, sample_rate, title="Original")
play_audio(speech, sample_rate)
Audio(speech, rate=sample_rate)
######################################################################
# RIR applied:
# ~~~~~~~~~~~~
#
plot_waveform(augmented, sample_rate, title="RIR Applied", ylim=None)
plot_waveform(augmented, sample_rate, title="RIR Applied")
plot_specgram(augmented, sample_rate, title="RIR Applied")
play_audio(augmented, sample_rate)
Audio(augmented, rate=sample_rate)
######################################################################
......@@ -338,10 +235,8 @@ play_audio(augmented, sample_rate)
# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
#
sample_rate = 8000
speech, _ = get_speech_sample(resample=sample_rate)
noise, _ = get_noise_sample(resample=sample_rate)
speech, _ = torchaudio.load(SAMPLE_SPEECH)
noise, _ = torchaudio.load(SAMPLE_NOISE)
noise = noise[:, : speech.shape[1]]
speech_power = speech.norm(p=2)
......@@ -361,7 +256,7 @@ for snr_db in snr_dbs:
plot_waveform(noise, sample_rate, title="Background noise")
plot_specgram(noise, sample_rate, title="Background noise")
play_audio(noise, sample_rate)
Audio(noise, rate=sample_rate)
######################################################################
# SNR 20 dB:
......@@ -371,7 +266,7 @@ play_audio(noise, sample_rate)
snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
play_audio(noisy_speech, sample_rate)
Audio(noisy_speech, rate=sample_rate)
######################################################################
# SNR 10 dB:
......@@ -381,17 +276,18 @@ play_audio(noisy_speech, sample_rate)
snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
play_audio(noisy_speech, sample_rate)
Audio(noisy_speech, rate=sample_rate)
######################################################################
# SNR 3 dB:
# ~~~~~~~~~~
# ~~~~~~~~~
#
snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
play_audio(noisy_speech, sample_rate)
Audio(noisy_speech, rate=sample_rate)
######################################################################
# Applying codec to Tensor object
......@@ -404,19 +300,16 @@ play_audio(noisy_speech, sample_rate)
#
waveform, sample_rate = get_speech_sample(resample=8000)
plot_specgram(waveform, sample_rate, title="Original")
waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH)
configs = [
({"format": "wav", "encoding": "ULAW", "bits_per_sample": 8}, "8 bit mu-law"),
({"format": "gsm"}, "GSM-FR"),
({"format": "vorbis", "compression": -1}, "Vorbis"),
{"format": "wav", "encoding": "ULAW", "bits_per_sample": 8},
{"format": "gsm"},
{"format": "vorbis", "compression": -1},
]
waveforms = []
for param, title in configs:
for param in configs:
augmented = F.apply_codec(waveform, sample_rate, **param)
plot_specgram(augmented, sample_rate, title=title)
waveforms.append(augmented)
######################################################################
......@@ -424,28 +317,36 @@ for param, title in configs:
# ~~~~~~~~~
#
play_audio(waveform, sample_rate)
plot_waveform(waveform, sample_rate, title="Original")
plot_specgram(waveform, sample_rate, title="Original")
Audio(waveform, rate=sample_rate)
######################################################################
# 8 bit mu-law:
# ~~~~~~~~~~~~~
#
play_audio(waveforms[0], sample_rate)
plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law")
plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law")
Audio(waveforms[0], rate=sample_rate)
######################################################################
# GSM-FR:
# ~~~~~~~
#
play_audio(waveforms[1], sample_rate)
plot_waveform(waveforms[1], sample_rate, title="GSM-FR")
plot_specgram(waveforms[1], sample_rate, title="GSM-FR")
Audio(waveforms[1], rate=sample_rate)
######################################################################
# Vorbis:
# ~~~~~~~
#
play_audio(waveforms[2], sample_rate)
plot_waveform(waveforms[2], sample_rate, title="Vorbis")
plot_specgram(waveforms[2], sample_rate, title="Vorbis")
Audio(waveforms[2], rate=sample_rate)
######################################################################
# Simulating a phone recording
......@@ -457,14 +358,13 @@ play_audio(waveforms[2], sample_rate)
#
sample_rate = 16000
original_speech, _ = get_speech_sample(resample=sample_rate)
original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
plot_specgram(original_speech, sample_rate, title="Original")
# Apply RIR
rir, _ = get_rir_sample(resample=sample_rate, processed=True)
speech_ = torch.nn.functional.pad(original_speech, (rir.shape[1] - 1, 0))
rir_applied = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0]
speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0))
rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
plot_specgram(rir_applied, sample_rate, title="RIR Applied")
......@@ -472,7 +372,7 @@ plot_specgram(rir_applied, sample_rate, title="RIR Applied")
# Because the noise is recorded in the actual environment, we consider that
# the noise contains the acoustic feature of the environment. Therefore, we add
# the noise after RIR application.
noise, _ = get_noise_sample(resample=sample_rate)
noise, _ = torchaudio.load(SAMPLE_NOISE)
noise = noise[:, : rir_applied.shape[1]]
snr_db = 8
......@@ -512,32 +412,32 @@ plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied")
# ~~~~~~~~~~~~~~~~
#
play_audio(original_speech, sample_rate)
Audio(original_speech, rate=sample_rate)
######################################################################
# RIR applied:
# ~~~~~~~~~~~~
#
play_audio(rir_applied, sample_rate)
Audio(rir_applied, rate=sample_rate)
######################################################################
# Background noise added:
# ~~~~~~~~~~~~~~~~~~~~~~~
#
play_audio(bg_added, sample_rate)
Audio(bg_added, rate=sample_rate)
######################################################################
# Filtered:
# ~~~~~~~~~
#
play_audio(filtered, sample_rate2)
Audio(filtered, rate=sample_rate2)
######################################################################
# Codec applied:
# ~~~~~~~~~~~~~
#
play_audio(codec_applied, sample_rate2)
Audio(codec_applied, rate=sample_rate2)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment