"docs/vscode:/vscode.git/clone" did not exist on "1b456bd5d52faa0896d9ebceed38e1d4f298a56f"
Commit b9ef69d1 authored by hwangjeff, committed by Facebook GitHub Bot

Update data augmentation tutorial to use new operators (#3062)

Summary:
Updates tutorial "Audio Data Augmentation" to use two of the newly introduced data augmentation operators in beta: `torchaudio.functional.fftconvolve` and `torchaudio.functional.add_noise`.
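As a quick illustration (an editorial sketch using dummy tensors, not part of the commit or the tutorial), the two beta operators are called roughly like this:

import torch
import torchaudio.functional as F

# Stand-ins for the tutorial's loaded waveforms (shape: channels x time).
speech = torch.randn(1, 16000)
rir = torch.randn(1, 1600)
noise = torch.randn(1, 16000)

# Reverberation: convolve speech with the room impulse response.
reverberant = F.fftconvolve(speech, rir)     # shape (1, 16000 + 1600 - 1)

# Background noise: mix noise into speech at several SNRs (in dB) at once.
snr_dbs = torch.tensor([20.0, 10.0, 3.0])
noisy = F.add_noise(speech, noise, snr_dbs)  # shape (3, 16000), one row per SNR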

Pull Request resolved: https://github.com/pytorch/audio/pull/3062

Reviewed By: mthrok

Differential Revision: D43298120

Pulled By: hwangjeff

fbshipit-source-id: 09ca736a5c67242568515d600b7d31eab32c2df1
parent 12e8cb97
@@ -185,24 +185,22 @@ plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
 Audio(rir_raw, rate=sample_rate)
 ######################################################################
-# First, we need to clean up the RIR. We extract the main impulse, normalize
-# the signal power, then flip along the time axis.
+# First, we need to clean up the RIR. We extract the main impulse and normalize
+# it by its power.
 #
 rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)]
 rir = rir / torch.norm(rir, p=2)
-RIR = torch.flip(rir, [1])
 plot_waveform(rir, sample_rate, title="Room Impulse Response")
 ######################################################################
-# Then, we convolve the speech signal with the RIR filter.
+# Then, using :py:func:`torchaudio.functional.fftconvolve`,
+# we convolve the speech signal with the RIR.
 #
 speech, _ = torchaudio.load(SAMPLE_SPEECH)
-speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0))
-augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
+augmented = F.fftconvolve(speech, rir)
 ######################################################################
 # Original:
@@ -227,29 +225,27 @@ Audio(augmented, rate=sample_rate)
 # Adding background noise
 # -----------------------
 #
-# To add background noise to audio data, you can simply add a noise Tensor to
-# the Tensor representing the audio data. A common method to adjust the
-# intensity of noise is changing the Signal-to-Noise Ratio (SNR).
-# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__]
+# To introduce background noise to audio data, we can add a noise Tensor to
+# the Tensor representing the audio data according to some desired
+# signal-to-noise ratio (SNR)
+# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__],
+# which determines the intensity of the audio data relative to that of the noise
+# in the output.
 #
 # $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
 #
 # $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
 #
+# To add noise to audio data per SNRs, we
+# use :py:func:`torchaudio.functional.add_noise`.
 speech, _ = torchaudio.load(SAMPLE_SPEECH)
 noise, _ = torchaudio.load(SAMPLE_NOISE)
 noise = noise[:, : speech.shape[1]]
-speech_rms = speech.norm(p=2)
-noise_rms = noise.norm(p=2)
-snr_dbs = [20, 10, 3]
-noisy_speeches = []
-for snr_db in snr_dbs:
-    snr = 10 ** (snr_db / 20)
-    scale = snr * noise_rms / speech_rms
-    noisy_speeches.append((scale * speech + noise) / 2)
+snr_dbs = torch.tensor([20, 10, 3])
+noisy_speeches = F.add_noise(speech, noise, snr_dbs)
 ######################################################################
 # Background noise:
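For intuition, here is a rough manual equivalent of F.add_noise for a single SNR value, derived from the SNR formulas above (an editorial sketch, not torchaudio's exact implementation; add_noise_manual is a hypothetical name):

import torch

def add_noise_manual(speech: torch.Tensor, noise: torch.Tensor, snr_db: float) -> torch.Tensor:
    # Target power ratio implied by the SNR in dB: SNR = 10 ** (SNR_dB / 10).
    snr = 10 ** (snr_db / 10)
    # Scale the noise so that P_speech / P_scaled_noise equals the target SNR.
    scale = speech.norm(p=2) / (noise.norm(p=2) * snr ** 0.5)
    return speech + scale * noise

# e.g. noisy = add_noise_manual(speech, noise, 10.0)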
@@ -265,7 +261,7 @@ Audio(noise, rate=sample_rate)
 # ~~~~~~~~~~
 #
-snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0]
+snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
@@ -275,7 +271,7 @@ Audio(noisy_speech, rate=sample_rate)
 # ~~~~~~~~~~
 #
-snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1]
+snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
@@ -285,7 +281,7 @@ Audio(noisy_speech, rate=sample_rate)
 # ~~~~~~~~~
 #
-snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2]
+snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
@@ -365,8 +361,7 @@ original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
 plot_specgram(original_speech, sample_rate, title="Original")
 # Apply RIR
-speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0))
-rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
+rir_applied = F.fftconvolve(speech, rir)
 plot_specgram(rir_applied, sample_rate, title="RIR Applied")
@@ -377,9 +372,8 @@ plot_specgram(rir_applied, sample_rate, title="RIR Applied")
 noise, _ = torchaudio.load(SAMPLE_NOISE)
 noise = noise[:, : rir_applied.shape[1]]
-snr_db = 8
-scale = (10 ** (snr_db / 20)) * noise.norm(p=2) / rir_applied.norm(p=2)
-bg_added = (scale * rir_applied + noise) / 2
+snr_db = torch.tensor([8])
+bg_added = F.add_noise(rir_applied, noise, snr_db)
 plot_specgram(bg_added, sample_rate, title="BG noise added")