Update data augmentation tutorial (#3375)

Summary: Replace sox_effects with `torchaudio.io.AudioEffector` 1. To showcase the new and better feature 2. To prepare for the upcoming removal of file-like object support Pull Request resolved: https://github.com/pytorch/audio/pull/3375 Reviewed By: nateanl Differential Revision: D46379016 Pulled By: mthrok fbshipit-source-id: 70f24b62494204949f327f6ac6c49f315c9ee315

Update data augmentation tutorial (#3375)
Summary: Replace sox_effects with `torchaudio.io.AudioEffector` 1. To showcase the new and better feature 2. To prepare for the upcoming removal of file-like object support Pull Request resolved: https://github.com/pytorch/audio/pull/3375 Reviewed By: nateanl Differential Revision: D46379016 Pulled By: mthrok fbshipit-source-id: 70f24b62494204949f327f6ac6c49f315c9ee315
2ba36b47 · moto · Facebook GitHub Bot · ab7a39f7 · 2ba36b47
Commit 2ba36b47 authored Jun 02, 2023 by moto Committed by Facebook GitHub Bot Jun 02, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 98 additions and 133 deletions

examples/tutorials/audio_data_augmentation_tutorial.py examples/tutorials/audio_data_augmentation_tutorial.py +98 -133

No files found.
--- a/examples/tutorials/audio_data_augmentation_tutorial.py
+++ b/examples/tutorials/audio_data_augmentation_tutorial.py
@@ -27,8 +27,6 @@ print(torchaudio.__version__)
 # First, we import the modules and download the audio assets we use in this tutorial.
 #
-import math
 from IPython.display import Audio
 import matplotlib.pyplot as plt
@@ -44,56 +42,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st
 # Applying effects and filtering
 # ------------------------------
 #
-# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to
+# :py:class:`torchaudio.io.AudioEffector` allows for directly applying
-# those available in ``sox`` to Tensor objects and file object audio sources.
+# filters and codecs to Tensor objects, in a similar way to the ``ffmpeg``
-#
+# command.
-# There are two functions for this:
-#
-# -  :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects
-#    to Tensor.
-# -  :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to
-#    other audio sources.
-#
-# Both functions accept effect definitions in the form
-# ``List[List[str]]``.
-# This is mostly consistent with how ``sox`` command works, but one caveat is
-# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s
-# implementation does not.
-#
-# For the list of available effects, please refer to `the sox
-# documentation <http://sox.sourceforge.net/sox.html>`__.
 #
-# **Tip** If you need to load and resample your audio data on the fly,
+# `AudioEffector Usages <./effector_tutorial.html>`__ explains how to use
-# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file`
+# this class, so for details, please refer to that tutorial.
-# with effect ``"rate"``.
-#
-# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a
-# file-like object or path-like object.
-# Similar to :py:func:`torchaudio.load`, when the audio format cannot be
-# inferred from either the file extension or header, you can provide
-# argument ``format`` to specify the format of the audio source.
-#
-# **Note** This process is not differentiable.
 #
 # Load the data
-waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV)
+waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False)
 # Define effects
-effects = [
+effect = ",".join(
-    ["lowpass", "-1", "300"],  # apply single-pole lowpass filter
+    [
-    ["speed", "0.8"],  # reduce the speed
+        "lowpass=frequency=300:poles=1",  # apply single-pole lowpass filter
-    # This only changes sample rate, so it is necessary to
+        "atempo=0.8",  # reduce the speed
-    # add `rate` effect with original sample rate after this.
+        "aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3"
-    ["rate", f"{sample_rate1}"],
+        # Applying echo gives some dramatic feeling
-    ["reverb", "-w"],  # Reverbration gives some dramatic feeling
+    ],
-]
+)
 # Apply effects
-waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects)
+def apply_effect(waveform, sample_rate, effect):
+    effector = torchaudio.io.AudioEffector(effect=effect)
+    return effector.apply(waveform, sample_rate)
-print(waveform1.shape, sample_rate1)
-print(waveform2.shape, sample_rate2)
+waveform2 = apply_effect(waveform1, sample_rate, effect)
+print(waveform1.shape, sample_rate)
+print(waveform2.shape, sample_rate)
 ######################################################################
 # Note that the number of frames and number of channels are different from
@@ -101,6 +81,7 @@ print(waveform2.shape, sample_rate2)
 # audio.
 #
 def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
    waveform = waveform.numpy()
@@ -123,6 +104,7 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
 ######################################################################
 #
 def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    waveform = waveform.numpy()
@@ -141,26 +123,23 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    plt.show(block=False)
 ######################################################################
-# Original:
+# Original
-# ~~~~~~~~~
+# ~~~~~~~~
 #
-plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2))
+plot_waveform(waveform1.T, sample_rate, title="Original", xlim=(-0.1, 3.2))
-plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04))
+plot_specgram(waveform1.T, sample_rate, title="Original", xlim=(0, 3.04))
-Audio(waveform1, rate=sample_rate1)
+Audio(waveform1.T, rate=sample_rate)
 ######################################################################
-# Effects applied:
+# Effects applied
-# ~~~~~~~~~~~~~~~~
+# ~~~~~~~~~~~~~~~
 #
-plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2))
+plot_waveform(waveform2.T, sample_rate, title="Effects Applied", xlim=(-0.1, 3.2))
-plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04))
+plot_specgram(waveform2.T, sample_rate, title="Effects Applied", xlim=(0, 3.04))
-Audio(waveform2, rate=sample_rate2)
+Audio(waveform2.T, rate=sample_rate)
-######################################################################
-# Doesn’t it sound more dramatic?
-#
 ######################################################################
 # Simulating room reverberation
@@ -203,8 +182,8 @@ speech, _ = torchaudio.load(SAMPLE_SPEECH)
 augmented = F.fftconvolve(speech, rir)
 ######################################################################
-# Original:
+# Original
-# ~~~~~~~~~
+# ~~~~~~~~
 #
 plot_waveform(speech, sample_rate, title="Original")
@@ -212,8 +191,8 @@ plot_specgram(speech, sample_rate, title="Original")
 Audio(speech, rate=sample_rate)
 ######################################################################
-# RIR applied:
+# RIR applied
-# ~~~~~~~~~~~~
+# ~~~~~~~~~~~
 #
 plot_waveform(augmented, sample_rate, title="RIR Applied")
@@ -248,8 +227,8 @@ noisy_speeches = F.add_noise(speech, noise, snr_dbs)
 ######################################################################
-# Background noise:
+# Background noise
-# ~~~~~~~~~~~~~~~~~
+# ~~~~~~~~~~~~~~~~
 #
 plot_waveform(noise, sample_rate, title="Background noise")
@@ -257,8 +236,8 @@ plot_specgram(noise, sample_rate, title="Background noise")
 Audio(noise, rate=sample_rate)
 ######################################################################
-# SNR 20 dB:
+# SNR 20 dB
-# ~~~~~~~~~~
+# ~~~~~~~~~
 #
 snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1]
@@ -267,8 +246,8 @@ plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
 ######################################################################
-# SNR 10 dB:
+# SNR 10 dB
-# ~~~~~~~~~~
+# ~~~~~~~~~
 #
 snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2]
@@ -277,8 +256,8 @@ plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
 ######################################################################
-# SNR 3 dB:
+# SNR 3 dB
-# ~~~~~~~~~
+# ~~~~~~~~
 #
 snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3]
@@ -291,60 +270,56 @@ Audio(noisy_speech, rate=sample_rate)
 # Applying codec to Tensor object
 # -------------------------------
 #
-# :py:func:`torchaudio.functional.apply_codec` can apply codecs to
+# :py:class:`torchaudio.io.AudioEffector` can also apply codecs to
 # a Tensor object.
 #
-# **Note** This process is not differentiable.
-#
+waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False)
-waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH)
+def apply_codec(waveform, sample_rate, format, encoder=None):
+    encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder)
+    return encoder.apply(waveform, sample_rate)
-configs = [
-    {"format": "wav", "encoding": "ULAW", "bits_per_sample": 8},
-    {"format": "gsm"},
-    {"format": "vorbis", "compression": -1},
-]
-waveforms = []
-for param in configs:
-    augmented = F.apply_codec(waveform, sample_rate, **param)
-    waveforms.append(augmented)
 ######################################################################
-# Original:
+# Original
-# ~~~~~~~~~
+# ~~~~~~~~
 #
-plot_waveform(waveform, sample_rate, title="Original")
+plot_waveform(waveform.T, sample_rate, title="Original")
-plot_specgram(waveform, sample_rate, title="Original")
+plot_specgram(waveform.T, sample_rate, title="Original")
-Audio(waveform, rate=sample_rate)
+Audio(waveform.T, rate=sample_rate)
 ######################################################################
-# 8 bit mu-law:
+# 8 bit mu-law
-# ~~~~~~~~~~~~~
+# ~~~~~~~~~~~~
 #
-plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law")
+mulaw = apply_codec(waveform, sample_rate, "wav", encoder="pcm_mulaw")
-plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law")
+plot_waveform(mulaw.T, sample_rate, title="8 bit mu-law")
-Audio(waveforms[0], rate=sample_rate)
+plot_specgram(mulaw.T, sample_rate, title="8 bit mu-law")
+Audio(mulaw.T, rate=sample_rate)
 ######################################################################
-# GSM-FR:
+# G.722
-# ~~~~~~~
+# ~~~~~
 #
-plot_waveform(waveforms[1], sample_rate, title="GSM-FR")
+g722 = apply_codec(waveform, sample_rate, "g722")
-plot_specgram(waveforms[1], sample_rate, title="GSM-FR")
+plot_waveform(g722.T, sample_rate, title="G.722")
-Audio(waveforms[1], rate=sample_rate)
+plot_specgram(g722.T, sample_rate, title="G.722")
+Audio(g722.T, rate=sample_rate)
 ######################################################################
-# Vorbis:
+# Vorbis
-# ~~~~~~~
+# ~~~~~~
 #
-plot_waveform(waveforms[2], sample_rate, title="Vorbis")
+vorbis = apply_codec(waveform, sample_rate, "ogg", encoder="vorbis")
-plot_specgram(waveforms[2], sample_rate, title="Vorbis")
+plot_waveform(vorbis.T, sample_rate, title="Vorbis")
-Audio(waveforms[2], rate=sample_rate)
+plot_specgram(vorbis.T, sample_rate, title="Vorbis")
+Audio(vorbis.T, rate=sample_rate)
 ######################################################################
 # Simulating a phone recording
@@ -378,62 +353,52 @@ bg_added = F.add_noise(rir_applied, noise, snr_db)
 plot_specgram(bg_added, sample_rate, title="BG noise added")
 # Apply filtering and change sample rate
-filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(
+effect = ",".join([
-    bg_added,
+    "lowpass=frequency=4000:poles=1",
-    sample_rate,
+    "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05",
-    effects=[
+])
-        ["lowpass", "4000"],
-        [
-            "compand",
-            "0.02,0.05",
-            "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8",
-            "-8",
-            "-7",
-            "0.05",
-        ],
-        ["rate", "8000"],
-    ],
-)
-plot_specgram(filtered, sample_rate2, title="Filtered")
+filtered = apply_effect(bg_added.T, sample_rate, effect)
+sample_rate2 = 8000
-# Apply telephony codec
+plot_specgram(filtered.T, sample_rate2, title="Filtered")
-codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm")
-plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied")
+# Apply telephony codec
+codec_applied = apply_codec(filtered, sample_rate2, "g722")
+plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied")
 ######################################################################
-# Original speech:
+# Original speech
-# ~~~~~~~~~~~~~~~~
+# ~~~~~~~~~~~~~~~
 #
 Audio(original_speech, rate=sample_rate)
 ######################################################################
-# RIR applied:
+# RIR applied
-# ~~~~~~~~~~~~
+# ~~~~~~~~~~~
 #
 Audio(rir_applied, rate=sample_rate)
 ######################################################################
-# Background noise added:
+# Background noise added
-# ~~~~~~~~~~~~~~~~~~~~~~~
+# ~~~~~~~~~~~~~~~~~~~~~~
 #
 Audio(bg_added, rate=sample_rate)
 ######################################################################
-# Filtered:
+# Filtered
-# ~~~~~~~~~
+# ~~~~~~~~
 #
-Audio(filtered, rate=sample_rate2)
+Audio(filtered.T, rate=sample_rate2)
 ######################################################################
-# Codec applied:
+# Codec applied
-# ~~~~~~~~~~~~~~
+# ~~~~~~~~~~~~~
 #
-Audio(codec_applied, rate=sample_rate2)
+Audio(codec_applied.T, rate=sample_rate2)