Update audio I/O tutorials (#2385)

Summary: - Adopt `torchaudio.utils.download_asset` to simplify asset management. - Break down the first section about helper functions. - Use tempfile so that executing tutorial won't leave any artifacts on local file system. Example: https://output.circle-artifacts.com/output/job/b11a0087-8bf9-4999-a74f-b53798eaa77f/artifacts/0/docs/tutorials/audio_io_tutorial.html Pull Request resolved: https://github.com/pytorch/audio/pull/2385 Reviewed By: hwangjeff Differential Revision: D36404399 Pulled By: mthrok fbshipit-source-id: 106af34e8ddd22a061aa12767b444b32aef07bad

Update audio I/O tutorials (#2385)
Summary: - Adopt `torchaudio.utils.download_asset` to simplify asset management. - Break down the first section about helper functions. - Use tempfile so that executing tutorial won't leave any artifacts on local file system. Example: https://output.circle-artifacts.com/output/job/b11a0087-8bf9-4999-a74f-b53798eaa77f/artifacts/0/docs/tutorials/audio_io_tutorial.html Pull Request resolved: https://github.com/pytorch/audio/pull/2385 Reviewed By: hwangjeff Differential Revision: D36404399 Pulled By: mthrok fbshipit-source-id: 106af34e8ddd22a061aa12767b444b32aef07bad
4c19e2cb · moto · Facebook GitHub Bot · 550e6dcb · 4c19e2cb
Commit 4c19e2cb authored Jun 07, 2022 by moto Committed by Facebook GitHub Bot Jun 07, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 151 additions and 219 deletions

examples/tutorials/audio_io_tutorial.py examples/tutorials/audio_io_tutorial.py +151 -219

No files found.
--- a/examples/tutorials/audio_io_tutorial.py
+++ b/examples/tutorials/audio_io_tutorial.py
@@ -3,13 +3,10 @@
 Audio I/O
 =========

-``torchaudio`` integrates ``libsox`` and provides a rich set of audio I/O.
+This tutorial shows how to use TorchAudio's basic I/O API to load audio files
+into PyTorch's Tensor object, and save Tensor objects to audio files.
 """

-# When running this tutorial in Google Colab, install the required packages
-# with the following.
-# !pip install torchaudio boto3
-
 import torch
 import torchaudio

@@ -17,162 +14,37 @@ print(torch.__version__)
 print(torchaudio.__version__)

 ######################################################################
-# Preparing data and utility functions (skip this section)
-# --------------------------------------------------------
+# Preparation
+# -----------
 #
-
-# @title Prepare data and utility functions. {display-mode: "form"}
-# @markdown
-# @markdown You do not need to look into this cell.
-# @markdown Just execute once and you are good to go.
-# @markdown
-# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/),
-# @markdown which is licensed under Creative Commos BY 4.0.
-
+# First, we import the modules and download the audio assets we use in this tutorial.
+#
+# .. note::
+#    When running this tutorial in Google Colab, install the required packages
+#    with the following:
+#
+#    .. code::
+#
+#       !pip install boto3

 import io
 import os
 import tarfile
+import tempfile

 import boto3
 import matplotlib.pyplot as plt
 import requests
 from botocore import UNSIGNED
 from botocore.config import Config
-from IPython.display import Audio, display
-
-
-_SAMPLE_DIR = "_assets"
-SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav"
-SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav")
-
-SAMPLE_MP3_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.mp3"
-SAMPLE_MP3_PATH = os.path.join(_SAMPLE_DIR, "steam.mp3")
-
-SAMPLE_GSM_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.gsm"
-SAMPLE_GSM_PATH = os.path.join(_SAMPLE_DIR, "steam.gsm")
-
-SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"  # noqa: E501
-SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav")
-
-SAMPLE_TAR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit.tar.gz"
-SAMPLE_TAR_PATH = os.path.join(_SAMPLE_DIR, "sample.tar.gz")
-SAMPLE_TAR_ITEM = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
-
-S3_BUCKET = "pytorch-tutorial-assets"
-S3_KEY = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
-
-
-def _fetch_data():
-    os.makedirs(_SAMPLE_DIR, exist_ok=True)
-    uri = [
-        (SAMPLE_WAV_URL, SAMPLE_WAV_PATH),
-        (SAMPLE_MP3_URL, SAMPLE_MP3_PATH),
-        (SAMPLE_GSM_URL, SAMPLE_GSM_PATH),
-        (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH),
-        (SAMPLE_TAR_URL, SAMPLE_TAR_PATH),
-    ]
-    for url, path in uri:
-        with open(path, "wb") as file_:
-            file_.write(requests.get(url).content)
-
-
-_fetch_data()
-
-
-def print_stats(waveform, sample_rate=None, src=None):
-    if src:
-        print("-" * 10)
-        print("Source:", src)
-        print("-" * 10)
-    if sample_rate:
-        print("Sample Rate:", sample_rate)
-    print("Shape:", tuple(waveform.shape))
-    print("Dtype:", waveform.dtype)
-    print(f" - Max:     {waveform.max().item():6.3f}")
-    print(f" - Min:     {waveform.min().item():6.3f}")
-    print(f" - Mean:    {waveform.mean().item():6.3f}")
-    print(f" - Std Dev: {waveform.std().item():6.3f}")
-    print()
-    print(waveform)
-    print()
-
-
-def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None):
-    waveform = waveform.numpy()
-
-    num_channels, num_frames = waveform.shape
-    time_axis = torch.arange(0, num_frames) / sample_rate
-
-    figure, axes = plt.subplots(num_channels, 1)
-    if num_channels == 1:
-        axes = [axes]
-    for c in range(num_channels):
-        axes[c].plot(time_axis, waveform[c], linewidth=1)
-        axes[c].grid(True)
-        if num_channels > 1:
-            axes[c].set_ylabel(f"Channel {c+1}")
-        if xlim:
-            axes[c].set_xlim(xlim)
-        if ylim:
-            axes[c].set_ylim(ylim)
-    figure.suptitle(title)
-    plt.show(block=False)
-
-
-def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
-    waveform = waveform.numpy()
-
-    num_channels, num_frames = waveform.shape
-
-    figure, axes = plt.subplots(num_channels, 1)
-    if num_channels == 1:
-        axes = [axes]
-    for c in range(num_channels):
-        axes[c].specgram(waveform[c], Fs=sample_rate)
-        if num_channels > 1:
-            axes[c].set_ylabel(f"Channel {c+1}")
-        if xlim:
-            axes[c].set_xlim(xlim)
-    figure.suptitle(title)
-    plt.show(block=False)
-
-
-def play_audio(waveform, sample_rate):
-    waveform = waveform.numpy()
-
-    num_channels, num_frames = waveform.shape
-    if num_channels == 1:
-        display(Audio(waveform[0], rate=sample_rate))
-    elif num_channels == 2:
-        display(Audio((waveform[0], waveform[1]), rate=sample_rate))
-    else:
-        raise ValueError("Waveform with more than 2 channels are not supported.")
-
+from IPython.display import Audio
+from torchaudio.utils import download_asset

-def _get_sample(path, resample=None):
-    effects = [["remix", "1"]]
-    if resample:
-        effects.extend(
-            [
-                ["lowpass", f"{resample // 2}"],
-                ["rate", f"{resample}"],
-            ]
-        )
-    return torchaudio.sox_effects.apply_effects_file(path, effects=effects)
+SAMPLE_GSM = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.gsm")
+SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
+SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav")


-def get_sample(*, resample=None):
-    return _get_sample(SAMPLE_WAV_PATH, resample=resample)
-
-
-def inspect_file(path):
-    print("-" * 10)
-    print("Source:", path)
-    print("-" * 10)
-    print(f" - File size: {os.path.getsize(path)} bytes")
-    print(f" - {torchaudio.info(path)}")
-

 ######################################################################
 # Querying audio metadata
@@ -182,8 +54,7 @@ def inspect_file(path):
 # You can provide a path-like object or file-like object.
 #

-
-metadata = torchaudio.info(SAMPLE_WAV_PATH)
+metadata = torchaudio.info(SAMPLE_WAV)
 print(metadata)

 ######################################################################
@@ -215,6 +86,7 @@ print(metadata)
 # -  ``"OPUS"``: Opus [`opus-codec.org <https://opus-codec.org/>`__]
 # -  ``"GSM"``: GSM-FR
 #    [`wikipedia <https://en.wikipedia.org/wiki/Full_Rate>`__]
+# -  ``"HTK"``: Single channel 16-bit PCM
 # -  ``"UNKNOWN"`` None of above
 #

@@ -225,56 +97,36 @@ print(metadata)
 #    variable bit rate (such as MP3).
 # -  ``num_frames`` can be ``0`` for GSM-FR format.
 #
-# .. code::
-#
-#    metadata = torchaudio.info(SAMPLE_MP3_PATH)
-#    print(metadata)
-#
-#    metadata = torchaudio.info(SAMPLE_GSM_PATH)
-#    print(metadata)
-#
-#    >>> AudioMetaData(sample_rate=44100, num_frames=110559, num_channels=2, bits_per_sample=0, encoding=MP3)
-#    >>> AudioMetaData(sample_rate=8000, num_frames=0, num_channels=1, bits_per_sample=0, encoding=GSM)
+
+metadata = torchaudio.info(SAMPLE_GSM)
+print(metadata)
+

 ######################################################################
 # Querying file-like object
-# ~~~~~~~~~~~~~~~~~~~~~~~~~
+# -------------------------
 #
 # :py:func:`torchaudio.info` works on file-like objects.
 #

-print("Source:", SAMPLE_WAV_URL)
-with requests.get(SAMPLE_WAV_URL, stream=True) as response:
+url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav"
+with requests.get(url, stream=True) as response:
    metadata = torchaudio.info(response.raw)
 print(metadata)

 ######################################################################
-# **Note** When passing a file-like object, ``info`` does not read
-# all of the underlying data; rather, it reads only a portion
-# of the data from the beginning.
-# Therefore, for a given audio format, it may not be able to retrieve the
-# correct metadata, including the format itself.
-# The following example illustrates this.
-#
-# -  Use argument ``format`` to specify the audio format of the input.
-# -  The returned metadata has ``num_frames = 0``
-#
-# .. code::
-#
-#    print("Source:", SAMPLE_MP3_URL)
-#    with requests.get(SAMPLE_MP3_URL, stream=True) as response:
-#        metadata = torchaudio.info(response.raw, format="mp3")
+# .. note::
 #
-#        print(f"Fetched {response.raw.tell()} bytes.")
-#    print(metadata)
-#
-#    >>> Source: https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.mp3
-#    >>> Fetched 8192 bytes.
-#    >>> AudioMetaData(sample_rate=44100, num_frames=0, num_channels=2, bits_per_sample=0, encoding=MP3)
+#    When passing a file-like object, ``info`` does not read
+#    all of the underlying data; rather, it reads only a portion
+#    of the data from the beginning.
+#    Therefore, for a given audio format, it may not be able to retrieve the
+#    correct metadata, including the format itself. In such case, you
+#    can pass ``format`` argument to specify the format of the audio.

 ######################################################################
-# Loading audio data into Tensor
-# ------------------------------
+# Loading audio data
+# ------------------
 #
 # To load audio data, you can use :py:func:`torchaudio.load`.
 #
@@ -284,51 +136,112 @@ print(metadata)
 # (``int``).
 #
 # By default, the resulting tensor object has ``dtype=torch.float32`` and
-# its value range is normalized within ``[-1.0, 1.0]``.
+# its value range is ``[-1.0, 1.0]``.
 #
 # For the list of supported format, please refer to `the torchaudio
 # documentation <https://pytorch.org/audio>`__.
 #

-waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)
+waveform, sample_rate = torchaudio.load(SAMPLE_WAV)
+
+
+######################################################################
+#
+def plot_waveform(waveform, sample_rate):
+    waveform = waveform.numpy()
+
+    num_channels, num_frames = waveform.shape
+    time_axis = torch.arange(0, num_frames) / sample_rate
+
+    figure, axes = plt.subplots(num_channels, 1)
+    if num_channels == 1:
+        axes = [axes]
+    for c in range(num_channels):
+        axes[c].plot(time_axis, waveform[c], linewidth=1)
+        axes[c].grid(True)
+        if num_channels > 1:
+            axes[c].set_ylabel(f"Channel {c+1}")
+    figure.suptitle("waveform")
+    plt.show(block=False)
+

-print_stats(waveform, sample_rate=sample_rate)
+######################################################################
+#
 plot_waveform(waveform, sample_rate)
+
+
+######################################################################
+#
+def plot_specgram(waveform, sample_rate, title="Spectrogram"):
+    waveform = waveform.numpy()
+
+    num_channels, num_frames = waveform.shape
+
+    figure, axes = plt.subplots(num_channels, 1)
+    if num_channels == 1:
+        axes = [axes]
+    for c in range(num_channels):
+        axes[c].specgram(waveform[c], Fs=sample_rate)
+        if num_channels > 1:
+            axes[c].set_ylabel(f"Channel {c+1}")
+    figure.suptitle(title)
+    plt.show(block=False)
+
+
+######################################################################
+#
 plot_specgram(waveform, sample_rate)
-play_audio(waveform, sample_rate)


+######################################################################
+#
+Audio(waveform.numpy()[0], rate=sample_rate)
+
 ######################################################################
 # Loading from file-like object
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# -----------------------------
 #
-# ``torchaudio``\ ’s I/O functions now support file-like objects. This
-# allows for fetching and decoding audio data from locations
+# The I/O functions support file-like objects.
+# This allows for fetching and decoding audio data from locations
 # within and beyond the local file system.
 # The following examples illustrate this.
 #

+######################################################################
+#
+
 # Load audio data as HTTP request
-with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response:
+url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
+with requests.get(url, stream=True) as response:
    waveform, sample_rate = torchaudio.load(response.raw)
 plot_specgram(waveform, sample_rate, title="HTTP datasource")

+######################################################################
+#
+
 # Load audio from tar file
-with tarfile.open(SAMPLE_TAR_PATH, mode="r") as tarfile_:
-    fileobj = tarfile_.extractfile(SAMPLE_TAR_ITEM)
+tar_path = download_asset("tutorial-assets/VOiCES_devkit.tar.gz")
+tar_item = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
+with tarfile.open(tar_path, mode="r") as tarfile_:
+    fileobj = tarfile_.extractfile(tar_item)
    waveform, sample_rate = torchaudio.load(fileobj)
 plot_specgram(waveform, sample_rate, title="TAR file")

+######################################################################
+#
+
 # Load audio from S3
+bucket = "pytorch-tutorial-assets"
+key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
 client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
-response = client.get_object(Bucket=S3_BUCKET, Key=S3_KEY)
+response = client.get_object(Bucket=bucket, Key=key)
 waveform, sample_rate = torchaudio.load(response["Body"])
 plot_specgram(waveform, sample_rate, title="From S3")


 ######################################################################
 # Tips on slicing
-# ~~~~~~~~~~~~~~~
+# ---------------
 #
 # Providing ``num_frames`` and ``frame_offset`` arguments restricts
 # decoding to the corresponding segment of the input.
@@ -353,14 +266,15 @@ plot_specgram(waveform, sample_rate, title="From S3")

 frame_offset, num_frames = 16000, 16000  # Fetch and decode the 1 - 2 seconds

+url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
 print("Fetching all the data...")
-with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response:
+with requests.get(url, stream=True) as response:
    waveform1, sample_rate1 = torchaudio.load(response.raw)
    waveform1 = waveform1[:, frame_offset : frame_offset + num_frames]
    print(f" - Fetched {response.raw.tell()} bytes")

 print("Fetching until the requested frames are available...")
-with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response:
+with requests.get(url, stream=True) as response:
    waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames)
    print(f" - Fetched {response.raw.tell()} bytes")

@@ -368,7 +282,6 @@ print("Checking the resulting waveform ... ", end="")
 assert (waveform1 == waveform2).all()
 print("matched!")

-
 ######################################################################
 # Saving audio to file
 # --------------------
@@ -389,35 +302,51 @@ print("matched!")
 # ``bits_per_sample`` to change this behavior. For example, to save data
 # in 16-bit signed integer PCM, you can do the following.
 #
-# **Note** Saving data in encodings with lower bit depth reduces the
+# .. note::
+#
+# Saving data in encodings with a lower bit depth reduces the
 # resulting file size but also precision.
 #

+waveform, sample_rate = torchaudio.load(SAMPLE_WAV)
+

-waveform, sample_rate = get_sample()
-print_stats(waveform, sample_rate=sample_rate)
+######################################################################
+#

+def inspect_file(path):
+    print("-" * 10)
+    print("Source:", path)
+    print("-" * 10)
+    print(f" - File size: {os.path.getsize(path)} bytes")
+    print(f" - {torchaudio.info(path)}")
+    print()
+
+######################################################################
+#
 # Save without any encoding option.
 # The function will pick up the encoding which
 # the provided data fit
-path = f"{_SAMPLE_DIR}/save_example_default.wav"
-torchaudio.save(path, waveform, sample_rate)
-inspect_file(path)
+with tempfile.TemporaryDirectory() as tempdir:
+    path = f"{tempdir}/save_example_default.wav"
+    torchaudio.save(path, waveform, sample_rate)
+    inspect_file(path)

+######################################################################
+#
 # Save as 16-bit signed integer Linear PCM
 # The resulting file occupies half the storage but loses precision
-path = f"{_SAMPLE_DIR}/save_example_PCM_S16.wav"
-torchaudio.save(path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
-inspect_file(path)
+with tempfile.TemporaryDirectory() as tempdir:
+    path = f"{tempdir}/save_example_PCM_S16.wav"
+    torchaudio.save(path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
+    inspect_file(path)


 ######################################################################
-# :py:func`torchaudio.save` can also handle other formats.
+# :py:func:`torchaudio.save` can also handle other formats.
 # To name a few:
 #

-waveform, sample_rate = get_sample(resample=8000)
-
 formats = [
    "flac",
    "vorbis",
@@ -427,15 +356,18 @@ formats = [
    "gsm",
 ]

-for format in formats:
-    path = f"{_SAMPLE_DIR}/save_example.{format}"
-    torchaudio.save(path, waveform, sample_rate, format=format)
-    inspect_file(path)
-
+######################################################################
+#
+waveform, sample_rate = torchaudio.load(SAMPLE_WAV_8000)
+with tempfile.TemporaryDirectory() as tempdir:
+    for format in formats:
+        path = f"{tempdir}/save_example.{format}"
+        torchaudio.save(path, waveform, sample_rate, format=format)
+        inspect_file(path)

 ######################################################################
 # Saving to file-like object
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~
+# --------------------------
 #
 # Similar to the other I/O functions, you can save audio to file-like
 # objects. When saving to a file-like object, argument ``format`` is
@@ -443,7 +375,7 @@ for format in formats:
 #


-waveform, sample_rate = get_sample()
+waveform, sample_rate = torchaudio.load(SAMPLE_WAV)

 # Saving to bytes buffer
 buffer_ = io.BytesIO()