Commit 2ba36b47 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Update data augmentation tutorial (#3375)

Summary:
Replace sox_effects with `torchaudio.io.AudioEffector`

1. To showcase the new and better feature
2. To prepare for the upcoming removal of file-like object support

Pull Request resolved: https://github.com/pytorch/audio/pull/3375

Reviewed By: nateanl

Differential Revision: D46379016

Pulled By: mthrok

fbshipit-source-id: 70f24b62494204949f327f6ac6c49f315c9ee315
parent ab7a39f7
...@@ -27,8 +27,6 @@ print(torchaudio.__version__) ...@@ -27,8 +27,6 @@ print(torchaudio.__version__)
# First, we import the modules and download the audio assets we use in this tutorial. # First, we import the modules and download the audio assets we use in this tutorial.
# #
import math
from IPython.display import Audio from IPython.display import Audio
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -44,56 +42,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st ...@@ -44,56 +42,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st
# Applying effects and filtering # Applying effects and filtering
# ------------------------------ # ------------------------------
# #
# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to # :py:class:`torchaudio.io.AudioEffector` allows for directly applying
# those available in ``sox`` to Tensor objects and file object audio sources. # filters and codecs to Tensor objects, in a similar way as ``ffmpeg``
# # command
# There are two functions for this:
#
# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects
# to Tensor.
# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to
# other audio sources.
#
# Both functions accept effect definitions in the form
# ``List[List[str]]``.
# This is mostly consistent with how ``sox`` command works, but one caveat is
# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s
# implementation does not.
#
# For the list of available effects, please refer to `the sox
# documentation <http://sox.sourceforge.net/sox.html>`__.
# #
# **Tip** If you need to load and resample your audio data on the fly, # `AudioEffector Usages <./effector_tutorial.html>` explains how to use
# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file` # this class, so for the detail, please refer to the tutorial.
# with effect ``"rate"``.
#
# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a
# file-like object or path-like object.
# Similar to :py:func:`torchaudio.load`, when the audio format cannot be
# inferred from either the file extension or header, you can provide
# argument ``format`` to specify the format of the audio source.
#
# **Note** This process is not differentiable.
# #
# Load the data # Load the data
waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV) waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False)
# Define effects # Define effects
effects = [ effect = ",".join(
["lowpass", "-1", "300"], # apply single-pole lowpass filter [
["speed", "0.8"], # reduce the speed "lowpass=frequency=300:poles=1", # apply single-pole lowpass filter
# This only changes sample rate, so it is necessary to "atempo=0.8", # reduce the speed
# add `rate` effect with original sample rate after this. "aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3"
["rate", f"{sample_rate1}"], # Applying echo gives some dramatic feeling
["reverb", "-w"], # Reverbration gives some dramatic feeling ],
] )
# Apply effects # Apply effects
waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects) def apply_effect(waveform, sample_rate, effect):
effector = torchaudio.io.AudioEffector(effect=effect)
return effector.apply(waveform, sample_rate)
print(waveform1.shape, sample_rate1)
print(waveform2.shape, sample_rate2) waveform2 = apply_effect(waveform1, sample_rate, effect)
print(waveform1.shape, sample_rate)
print(waveform2.shape, sample_rate)
###################################################################### ######################################################################
# Note that the number of frames and number of channels are different from # Note that the number of frames and number of channels are different from
...@@ -101,6 +81,7 @@ print(waveform2.shape, sample_rate2) ...@@ -101,6 +81,7 @@ print(waveform2.shape, sample_rate2)
# audio. # audio.
# #
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
waveform = waveform.numpy() waveform = waveform.numpy()
...@@ -123,6 +104,7 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): ...@@ -123,6 +104,7 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
###################################################################### ######################################################################
# #
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
waveform = waveform.numpy() waveform = waveform.numpy()
...@@ -141,26 +123,23 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): ...@@ -141,26 +123,23 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
plt.show(block=False) plt.show(block=False)
###################################################################### ######################################################################
# Original: # Original
# ~~~~~~~~~ # ~~~~~~~~
# #
plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2)) plot_waveform(waveform1.T, sample_rate, title="Original", xlim=(-0.1, 3.2))
plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) plot_specgram(waveform1.T, sample_rate, title="Original", xlim=(0, 3.04))
Audio(waveform1, rate=sample_rate1) Audio(waveform1.T, rate=sample_rate)
###################################################################### ######################################################################
# Effects applied: # Effects applied
# ~~~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~~~
# #
plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2)) plot_waveform(waveform2.T, sample_rate, title="Effects Applied", xlim=(-0.1, 3.2))
plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) plot_specgram(waveform2.T, sample_rate, title="Effects Applied", xlim=(0, 3.04))
Audio(waveform2, rate=sample_rate2) Audio(waveform2.T, rate=sample_rate)
######################################################################
# Doesn’t it sound more dramatic?
#
###################################################################### ######################################################################
# Simulating room reverberation # Simulating room reverberation
...@@ -203,8 +182,8 @@ speech, _ = torchaudio.load(SAMPLE_SPEECH) ...@@ -203,8 +182,8 @@ speech, _ = torchaudio.load(SAMPLE_SPEECH)
augmented = F.fftconvolve(speech, rir) augmented = F.fftconvolve(speech, rir)
###################################################################### ######################################################################
# Original: # Original
# ~~~~~~~~~ # ~~~~~~~~
# #
plot_waveform(speech, sample_rate, title="Original") plot_waveform(speech, sample_rate, title="Original")
...@@ -212,8 +191,8 @@ plot_specgram(speech, sample_rate, title="Original") ...@@ -212,8 +191,8 @@ plot_specgram(speech, sample_rate, title="Original")
Audio(speech, rate=sample_rate) Audio(speech, rate=sample_rate)
###################################################################### ######################################################################
# RIR applied: # RIR applied
# ~~~~~~~~~~~~ # ~~~~~~~~~~~
# #
plot_waveform(augmented, sample_rate, title="RIR Applied") plot_waveform(augmented, sample_rate, title="RIR Applied")
...@@ -248,8 +227,8 @@ noisy_speeches = F.add_noise(speech, noise, snr_dbs) ...@@ -248,8 +227,8 @@ noisy_speeches = F.add_noise(speech, noise, snr_dbs)
###################################################################### ######################################################################
# Background noise: # Background noise
# ~~~~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~~~~
# #
plot_waveform(noise, sample_rate, title="Background noise") plot_waveform(noise, sample_rate, title="Background noise")
...@@ -257,8 +236,8 @@ plot_specgram(noise, sample_rate, title="Background noise") ...@@ -257,8 +236,8 @@ plot_specgram(noise, sample_rate, title="Background noise")
Audio(noise, rate=sample_rate) Audio(noise, rate=sample_rate)
###################################################################### ######################################################################
# SNR 20 dB: # SNR 20 dB
# ~~~~~~~~~~ # ~~~~~~~~~
# #
snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1] snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1]
...@@ -267,8 +246,8 @@ plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") ...@@ -267,8 +246,8 @@ plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
Audio(noisy_speech, rate=sample_rate) Audio(noisy_speech, rate=sample_rate)
###################################################################### ######################################################################
# SNR 10 dB: # SNR 10 dB
# ~~~~~~~~~~ # ~~~~~~~~~
# #
snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2] snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2]
...@@ -277,8 +256,8 @@ plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") ...@@ -277,8 +256,8 @@ plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
Audio(noisy_speech, rate=sample_rate) Audio(noisy_speech, rate=sample_rate)
###################################################################### ######################################################################
# SNR 3 dB: # SNR 3 dB
# ~~~~~~~~~ # ~~~~~~~~
# #
snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3] snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3]
...@@ -291,60 +270,56 @@ Audio(noisy_speech, rate=sample_rate) ...@@ -291,60 +270,56 @@ Audio(noisy_speech, rate=sample_rate)
# Applying codec to Tensor object # Applying codec to Tensor object
# ------------------------------- # -------------------------------
# #
# :py:func:`torchaudio.functional.apply_codec` can apply codecs to # :py:class:`torchaudio.io.AudioEffector` can also apply codecs to
# a Tensor object. # a Tensor object.
# #
# **Note** This process is not differentiable.
# waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False)
waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH) def apply_codec(waveform, sample_rate, format, encoder=None):
encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder)
return encoder.apply(waveform, sample_rate)
configs = [
{"format": "wav", "encoding": "ULAW", "bits_per_sample": 8},
{"format": "gsm"},
{"format": "vorbis", "compression": -1},
]
waveforms = []
for param in configs:
augmented = F.apply_codec(waveform, sample_rate, **param)
waveforms.append(augmented)
###################################################################### ######################################################################
# Original: # Original
# ~~~~~~~~~ # ~~~~~~~~
# #
plot_waveform(waveform, sample_rate, title="Original") plot_waveform(waveform.T, sample_rate, title="Original")
plot_specgram(waveform, sample_rate, title="Original") plot_specgram(waveform.T, sample_rate, title="Original")
Audio(waveform, rate=sample_rate) Audio(waveform.T, rate=sample_rate)
###################################################################### ######################################################################
# 8 bit mu-law: # 8 bit mu-law
# ~~~~~~~~~~~~~ # ~~~~~~~~~~~~
# #
plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law") mulaw = apply_codec(waveform, sample_rate, "wav", encoder="pcm_mulaw")
plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law") plot_waveform(mulaw.T, sample_rate, title="8 bit mu-law")
Audio(waveforms[0], rate=sample_rate) plot_specgram(mulaw.T, sample_rate, title="8 bit mu-law")
Audio(mulaw.T, rate=sample_rate)
###################################################################### ######################################################################
# GSM-FR: # G.722
# ~~~~~~~ # ~~~~~
# #
plot_waveform(waveforms[1], sample_rate, title="GSM-FR") g722 = apply_codec(waveform, sample_rate, "g722")
plot_specgram(waveforms[1], sample_rate, title="GSM-FR") plot_waveform(g722.T, sample_rate, title="G.722")
Audio(waveforms[1], rate=sample_rate) plot_specgram(g722.T, sample_rate, title="G.722")
Audio(g722.T, rate=sample_rate)
###################################################################### ######################################################################
# Vorbis: # Vorbis
# ~~~~~~~ # ~~~~~~
# #
plot_waveform(waveforms[2], sample_rate, title="Vorbis") vorbis = apply_codec(waveform, sample_rate, "ogg", encoder="vorbis")
plot_specgram(waveforms[2], sample_rate, title="Vorbis") plot_waveform(vorbis.T, sample_rate, title="Vorbis")
Audio(waveforms[2], rate=sample_rate) plot_specgram(vorbis.T, sample_rate, title="Vorbis")
Audio(vorbis.T, rate=sample_rate)
###################################################################### ######################################################################
# Simulating a phone recording # Simulating a phone recording
...@@ -378,62 +353,52 @@ bg_added = F.add_noise(rir_applied, noise, snr_db) ...@@ -378,62 +353,52 @@ bg_added = F.add_noise(rir_applied, noise, snr_db)
plot_specgram(bg_added, sample_rate, title="BG noise added") plot_specgram(bg_added, sample_rate, title="BG noise added")
# Apply filtering and change sample rate # Apply filtering and change sample rate
filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( effect = ",".join([
bg_added, "lowpass=frequency=4000:poles=1",
sample_rate, "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05",
effects=[ ])
["lowpass", "4000"],
[
"compand",
"0.02,0.05",
"-60,-60,-30,-10,-20,-8,-5,-8,-2,-8",
"-8",
"-7",
"0.05",
],
["rate", "8000"],
],
)
plot_specgram(filtered, sample_rate2, title="Filtered") filtered = apply_effect(bg_added.T, sample_rate, effect)
sample_rate2 = 8000
# Apply telephony codec plot_specgram(filtered.T, sample_rate2, title="Filtered")
codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm")
plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied") # Apply telephony codec
codec_applied = apply_codec(filtered, sample_rate2, "g722")
plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied")
###################################################################### ######################################################################
# Original speech: # Original speech
# ~~~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~~~
# #
Audio(original_speech, rate=sample_rate) Audio(original_speech, rate=sample_rate)
###################################################################### ######################################################################
# RIR applied: # RIR applied
# ~~~~~~~~~~~~ # ~~~~~~~~~~~
# #
Audio(rir_applied, rate=sample_rate) Audio(rir_applied, rate=sample_rate)
###################################################################### ######################################################################
# Background noise added: # Background noise added
# ~~~~~~~~~~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~~~~~~~~~~
# #
Audio(bg_added, rate=sample_rate) Audio(bg_added, rate=sample_rate)
###################################################################### ######################################################################
# Filtered: # Filtered
# ~~~~~~~~~ # ~~~~~~~~
# #
Audio(filtered, rate=sample_rate2) Audio(filtered.T, rate=sample_rate2)
###################################################################### ######################################################################
# Codec applied: # Codec applied
# ~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~
# #
Audio(codec_applied, rate=sample_rate2) Audio(codec_applied.T, rate=sample_rate2)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment