"sgl-kernel/vscode:/vscode.git/clone" did not exist on "39fd1788311cbd08b009f77a233bdad2f795b6d1"
Commit 41082eb0 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Update audio data augmentation tutorial (#2388)

Summary:
- Adopt `torchaudio.utils.download_asset` to simplify asset management.
- Break down the first section about helper functions.
- Reduce the number of helper functions

https://output.circle-artifacts.com/output/job/d7dd1b93-6dfe-46da-a080-109bfdc63881/artifacts/0/docs/tutorials/audio_data_augmentation_tutorial.html

Pull Request resolved: https://github.com/pytorch/audio/pull/2388

Reviewed By: carolineechen

Differential Revision: D36404405

Pulled By: mthrok

fbshipit-source-id: f460ed810519797fce6e2fa7baaee110bddd1d06
parent fd2be89a
......@@ -4,6 +4,11 @@ Audio Data Augmentation
=======================
``torchaudio`` provides a variety of ways to augment audio data.
In this tutorial, we look into a way to apply effects, filters,
RIR (room impulse response) and codecs.
At the end, we synthesize noisy speech over phone from clean speech.
"""
import torch
......@@ -14,164 +19,23 @@ print(torch.__version__)
print(torchaudio.__version__)
######################################################################
# Preparing data and utility functions (skip this section)
# --------------------------------------------------------
# Preparation
# -----------
#
# First, we import the modules and download the audio assets we use in this tutorial.
#
# @title Prepare data and utility functions. {display-mode: "form"}
# @markdown
# @markdown You do not need to look into this cell.
# @markdown Just execute once and you are good to go.
# @markdown
# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/),
# @markdown which is licensed under Creative Commons BY 4.0.
# -------------------------------------------------------------------------------
# Preparation of data and helper functions.
# -------------------------------------------------------------------------------
import math
import os
from IPython.display import Audio
import matplotlib.pyplot as plt
import requests
from IPython.display import Audio, display
_SAMPLE_DIR = "_assets"
SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav"
SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav")
SAMPLE_RIR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav" # noqa: E501
SAMPLE_RIR_PATH = os.path.join(_SAMPLE_DIR, "rir.wav")
SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501
SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav")
SAMPLE_NOISE_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/distractors/rm1/babb/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav" # noqa: E501
SAMPLE_NOISE_PATH = os.path.join(_SAMPLE_DIR, "bg.wav")
os.makedirs(_SAMPLE_DIR, exist_ok=True)
def _fetch_data():
    """Download the sample audio assets into the local asset directory.

    Fetches the steam-whistle, RIR, speech, and background-noise WAV files
    over HTTP and writes each one to its pre-configured path under
    ``_SAMPLE_DIR``.

    Raises:
        requests.HTTPError: if any download responds with an error status.
    """
    uri = [
        (SAMPLE_WAV_URL, SAMPLE_WAV_PATH),
        (SAMPLE_RIR_URL, SAMPLE_RIR_PATH),
        (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH),
        (SAMPLE_NOISE_URL, SAMPLE_NOISE_PATH),
    ]
    for url, path in uri:
        response = requests.get(url)
        # Fail loudly on HTTP errors instead of silently saving an error
        # page body as a (corrupt) WAV file.
        response.raise_for_status()
        with open(path, "wb") as file_:
            file_.write(response.content)
def _get_sample(path, resample=None):
    """Load *path* as a mono waveform via sox effects.

    Args:
        path: path to the audio file to load.
        resample: optional target sample rate; when given, a low-pass
            filter at the new Nyquist frequency is applied before the
            rate change.

    Returns:
        Tuple of (waveform tensor, sample rate), as produced by
        ``torchaudio.sox_effects.apply_effects_file``.
    """
    # Always down-mix to the first channel.
    effect_chain = [["remix", "1"]]
    if resample:
        effect_chain.append(["lowpass", f"{resample // 2}"])
        effect_chain.append(["rate", f"{resample}"])
    return torchaudio.sox_effects.apply_effects_file(path, effects=effect_chain)
def get_sample(*, resample=None):
    """Return the steam-whistle sample as (waveform, sample_rate)."""
    # Delegate to the generic loader with the pre-configured path.
    return _get_sample(SAMPLE_WAV_PATH, resample=resample)
def get_speech_sample(*, resample=None):
    """Return the VOiCES speech sample as (waveform, sample_rate)."""
    # Delegate to the generic loader with the pre-configured path.
    return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample)
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None):
    """Plot amplitude over time, one subplot per channel.

    Args:
        waveform: 2-D tensor of shape (channels, frames).
        sample_rate: sample rate used to convert frame index to seconds.
        title: figure title.
        xlim: optional x-axis limits passed to ``set_xlim``.
        ylim: optional y-axis limits passed to ``set_ylim``.
    """
    data = waveform.numpy()
    n_channels, n_frames = data.shape
    times = torch.arange(0, n_frames) / sample_rate
    figure, axes = plt.subplots(n_channels, 1)
    if n_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single row.
        axes = [axes]
    for ch, ax in enumerate(axes):
        ax.plot(times, data[ch], linewidth=1)
        ax.grid(True)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {ch+1}")
        if xlim:
            ax.set_xlim(xlim)
        if ylim:
            ax.set_ylim(ylim)
    figure.suptitle(title)
    plt.show(block=False)
def print_stats(waveform, sample_rate=None, src=None):
    """Print shape, dtype, basic statistics, and the values of *waveform*.

    Args:
        waveform: tensor of audio samples.
        sample_rate: if given, printed alongside the stats.
        src: optional label identifying where the waveform came from.
    """
    if src:
        separator = "-" * 10
        print(separator)
        print("Source:", src)
        print(separator)
    if sample_rate:
        print("Sample Rate:", sample_rate)
    print("Shape:", tuple(waveform.shape))
    print("Dtype:", waveform.dtype)
    print(f" - Max: {waveform.max().item():6.3f}")
    print(f" - Min: {waveform.min().item():6.3f}")
    print(f" - Mean: {waveform.mean().item():6.3f}")
    print(f" - Std Dev: {waveform.std().item():6.3f}")
    print()
    print(waveform)
    print()
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    """Plot a spectrogram, one subplot per channel.

    Args:
        waveform: 2-D tensor of shape (channels, frames).
        sample_rate: passed to matplotlib's ``specgram`` as ``Fs``.
        title: figure title.
        xlim: optional x-axis limits passed to ``set_xlim``.
    """
    data = waveform.numpy()
    n_channels, _n_frames = data.shape
    figure, axes = plt.subplots(n_channels, 1)
    if n_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single row.
        axes = [axes]
    for ch, ax in enumerate(axes):
        ax.specgram(data[ch], Fs=sample_rate)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {ch+1}")
        if xlim:
            ax.set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)
def play_audio(waveform, sample_rate):
    """Return an IPython ``Audio`` widget for a mono or stereo waveform.

    Args:
        waveform: 2-D tensor of shape (channels, frames); only 1 or 2
            channels are supported.
        sample_rate: playback sample rate.

    Raises:
        ValueError: if the waveform has more than two channels.
    """
    waveform = waveform.numpy()
    # NOTE: the original unpacked num_frames too, but never used it.
    num_channels = waveform.shape[0]
    if num_channels == 1:
        return Audio(waveform[0], rate=sample_rate)
    if num_channels == 2:
        return Audio((waveform[0], waveform[1]), rate=sample_rate)
    raise ValueError("Waveform with more than 2 channels are not supported.")
from torchaudio.utils import download_asset
def get_rir_sample(*, resample=None, processed=False):
    """Load the room-impulse-response sample.

    Args:
        resample: optional target sample rate for the loader.
        processed: when True, extract the main impulse (1.01 s - 1.3 s),
            normalize its power, and time-reverse it along the frame axis.

    Returns:
        Tuple of (RIR tensor, sample rate).
    """
    rir_raw, sample_rate = _get_sample(SAMPLE_RIR_PATH, resample=resample)
    if not processed:
        return rir_raw, sample_rate
    start = int(sample_rate * 1.01)
    end = int(sample_rate * 1.3)
    rir = rir_raw[:, start:end]
    rir = rir / torch.norm(rir, p=2)
    return torch.flip(rir, [1]), sample_rate
def get_noise_sample(*, resample=None):
    """Return the background-noise sample as (waveform, sample_rate)."""
    # Delegate to the generic loader with the pre-configured path.
    return _get_sample(SAMPLE_NOISE_PATH, resample=resample)
SAMPLE_WAV = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav")
SAMPLE_RIR = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav")
SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
######################################################################
......@@ -210,9 +74,8 @@ def get_noise_sample(*, resample=None):
# **Note** This process is not differentiable.
#
# Load the data
waveform1, sample_rate1 = get_sample(resample=16000)
waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV)
# Define effects
effects = [
......@@ -227,8 +90,8 @@ effects = [
# Apply effects
waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects)
print_stats(waveform1, sample_rate=sample_rate1, src="Original")
print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied")
print(waveform1.shape, sample_rate1)
print(waveform2.shape, sample_rate2)
######################################################################
# Note that the number of frames and number of channels are different from
......@@ -236,6 +99,45 @@ print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied")
# audio.
#
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
    """Plot amplitude over time, one subplot per channel.

    Args:
        waveform: 2-D tensor of shape (channels, frames).
        sample_rate: sample rate used to convert frame index to seconds.
        title: figure title.
        xlim: optional x-axis limits passed to ``set_xlim``.
    """
    data = waveform.numpy()
    n_channels, n_frames = data.shape
    times = torch.arange(0, n_frames) / sample_rate
    figure, axes = plt.subplots(n_channels, 1)
    if n_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single row.
        axes = [axes]
    for ch, ax in enumerate(axes):
        ax.plot(times, data[ch], linewidth=1)
        ax.grid(True)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {ch+1}")
        if xlim:
            ax.set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)
######################################################################
#
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    """Plot a spectrogram, one subplot per channel.

    Args:
        waveform: 2-D tensor of shape (channels, frames).
        sample_rate: passed to matplotlib's ``specgram`` as ``Fs``.
        title: figure title.
        xlim: optional x-axis limits passed to ``set_xlim``.
    """
    data = waveform.numpy()
    n_channels = data.shape[0]
    figure, axes = plt.subplots(n_channels, 1)
    if n_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single row.
        axes = [axes]
    for ch, ax in enumerate(axes):
        ax.specgram(data[ch], Fs=sample_rate)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {ch+1}")
        if xlim:
            ax.set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)
######################################################################
# Original:
# ~~~~~~~~~
......@@ -243,7 +145,7 @@ print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied")
plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2))
plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04))
play_audio(waveform1, sample_rate1)
Audio(waveform1, rate=sample_rate1)
######################################################################
# Effects applied:
......@@ -252,7 +154,7 @@ play_audio(waveform1, sample_rate1)
plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2))
plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04))
play_audio(waveform2, sample_rate2)
Audio(waveform2, rate=sample_rate2)
######################################################################
# Doesn’t it sound more dramatic?
......@@ -275,14 +177,10 @@ play_audio(waveform2, sample_rate2)
# and clap your hands.
#
sample_rate = 8000
rir_raw, _ = get_rir_sample(resample=sample_rate)
plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)", ylim=None)
rir_raw, sample_rate = torchaudio.load(SAMPLE_RIR)
plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)")
plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
play_audio(rir_raw, sample_rate)
Audio(rir_raw, rate=sample_rate)
######################################################################
# First, we need to clean up the RIR. We extract the main impulse, normalize
......@@ -291,37 +189,36 @@ play_audio(rir_raw, sample_rate)
rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)]
rir = rir / torch.norm(rir, p=2)
rir = torch.flip(rir, [1])
RIR = torch.flip(rir, [1])
print_stats(rir)
plot_waveform(rir, sample_rate, title="Room Impulse Response", ylim=None)
plot_waveform(rir, sample_rate, title="Room Impulse Response")
######################################################################
# Then, we convolve the speech signal with the RIR filter.
#
speech, _ = get_speech_sample(resample=sample_rate)
speech, _ = torchaudio.load(SAMPLE_SPEECH)
speech_ = torch.nn.functional.pad(speech, (rir.shape[1] - 1, 0))
augmented = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0]
speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0))
augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
######################################################################
# Original:
# ~~~~~~~~~
#
plot_waveform(speech, sample_rate, title="Original", ylim=None)
plot_waveform(speech, sample_rate, title="Original")
plot_specgram(speech, sample_rate, title="Original")
play_audio(speech, sample_rate)
Audio(speech, rate=sample_rate)
######################################################################
# RIR applied:
# ~~~~~~~~~~~~
#
plot_waveform(augmented, sample_rate, title="RIR Applied", ylim=None)
plot_waveform(augmented, sample_rate, title="RIR Applied")
plot_specgram(augmented, sample_rate, title="RIR Applied")
play_audio(augmented, sample_rate)
Audio(augmented, rate=sample_rate)
######################################################################
......@@ -338,10 +235,8 @@ play_audio(augmented, sample_rate)
# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
#
sample_rate = 8000
speech, _ = get_speech_sample(resample=sample_rate)
noise, _ = get_noise_sample(resample=sample_rate)
speech, _ = torchaudio.load(SAMPLE_SPEECH)
noise, _ = torchaudio.load(SAMPLE_NOISE)
noise = noise[:, : speech.shape[1]]
speech_power = speech.norm(p=2)
......@@ -361,7 +256,7 @@ for snr_db in snr_dbs:
plot_waveform(noise, sample_rate, title="Background noise")
plot_specgram(noise, sample_rate, title="Background noise")
play_audio(noise, sample_rate)
Audio(noise, rate=sample_rate)
######################################################################
# SNR 20 dB:
......@@ -371,7 +266,7 @@ play_audio(noise, sample_rate)
snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
play_audio(noisy_speech, sample_rate)
Audio(noisy_speech, rate=sample_rate)
######################################################################
# SNR 10 dB:
......@@ -381,17 +276,18 @@ play_audio(noisy_speech, sample_rate)
snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
play_audio(noisy_speech, sample_rate)
Audio(noisy_speech, rate=sample_rate)
######################################################################
# SNR 3 dB:
# ~~~~~~~~~~
# ~~~~~~~~~
#
snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
play_audio(noisy_speech, sample_rate)
Audio(noisy_speech, rate=sample_rate)
######################################################################
# Applying codec to Tensor object
......@@ -404,19 +300,16 @@ play_audio(noisy_speech, sample_rate)
#
waveform, sample_rate = get_speech_sample(resample=8000)
plot_specgram(waveform, sample_rate, title="Original")
waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH)
configs = [
({"format": "wav", "encoding": "ULAW", "bits_per_sample": 8}, "8 bit mu-law"),
({"format": "gsm"}, "GSM-FR"),
({"format": "vorbis", "compression": -1}, "Vorbis"),
{"format": "wav", "encoding": "ULAW", "bits_per_sample": 8},
{"format": "gsm"},
{"format": "vorbis", "compression": -1},
]
waveforms = []
for param, title in configs:
for param in configs:
augmented = F.apply_codec(waveform, sample_rate, **param)
plot_specgram(augmented, sample_rate, title=title)
waveforms.append(augmented)
######################################################################
......@@ -424,28 +317,36 @@ for param, title in configs:
# ~~~~~~~~~
#
play_audio(waveform, sample_rate)
plot_waveform(waveform, sample_rate, title="Original")
plot_specgram(waveform, sample_rate, title="Original")
Audio(waveform, rate=sample_rate)
######################################################################
# 8 bit mu-law:
# ~~~~~~~~~~~~~
#
play_audio(waveforms[0], sample_rate)
plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law")
plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law")
Audio(waveforms[0], rate=sample_rate)
######################################################################
# GSM-FR:
# ~~~~~~~
#
play_audio(waveforms[1], sample_rate)
plot_waveform(waveforms[1], sample_rate, title="GSM-FR")
plot_specgram(waveforms[1], sample_rate, title="GSM-FR")
Audio(waveforms[1], rate=sample_rate)
######################################################################
# Vorbis:
# ~~~~~~~
#
play_audio(waveforms[2], sample_rate)
plot_waveform(waveforms[2], sample_rate, title="Vorbis")
plot_specgram(waveforms[2], sample_rate, title="Vorbis")
Audio(waveforms[2], rate=sample_rate)
######################################################################
# Simulating a phone recording
......@@ -457,14 +358,13 @@ play_audio(waveforms[2], sample_rate)
#
sample_rate = 16000
original_speech, _ = get_speech_sample(resample=sample_rate)
original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
plot_specgram(original_speech, sample_rate, title="Original")
# Apply RIR
rir, _ = get_rir_sample(resample=sample_rate, processed=True)
speech_ = torch.nn.functional.pad(original_speech, (rir.shape[1] - 1, 0))
rir_applied = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0]
speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0))
rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
plot_specgram(rir_applied, sample_rate, title="RIR Applied")
......@@ -472,7 +372,7 @@ plot_specgram(rir_applied, sample_rate, title="RIR Applied")
# Because the noise is recorded in the actual environment, we consider that
# the noise contains the acoustic feature of the environment. Therefore, we add
# the noise after RIR application.
noise, _ = get_noise_sample(resample=sample_rate)
noise, _ = torchaudio.load(SAMPLE_NOISE)
noise = noise[:, : rir_applied.shape[1]]
snr_db = 8
......@@ -512,32 +412,32 @@ plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied")
# ~~~~~~~~~~~~~~~~
#
play_audio(original_speech, sample_rate)
Audio(original_speech, rate=sample_rate)
######################################################################
# RIR applied:
# ~~~~~~~~~~~~
#
play_audio(rir_applied, sample_rate)
Audio(rir_applied, rate=sample_rate)
######################################################################
# Background noise added:
# ~~~~~~~~~~~~~~~~~~~~~~~
#
play_audio(bg_added, sample_rate)
Audio(bg_added, rate=sample_rate)
######################################################################
# Filtered:
# ~~~~~~~~~
#
play_audio(filtered, sample_rate2)
Audio(filtered, rate=sample_rate2)
######################################################################
# Codec applied:
# ~~~~~~~~~~~~~
#
play_audio(codec_applied, sample_rate2)
Audio(codec_applied, rate=sample_rate2)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment