OpenDAS / Torchaudio / Commits

Commit ffeba11a, authored Sep 02, 2024 by mayp777

    UPDATE

Parent: 29deb085
Changes: 337 files. Showing 20 changed files with 1986 additions and 435 deletions (+1986, -435).
examples/tutorials/forced_alignment_tutorial.py              +135  -99
examples/tutorials/hybrid_demucs_tutorial.py                 +27   -50
examples/tutorials/mvdr_tutorial.py                          +8    -10
examples/tutorials/nvdec_tutorial.py                         +791  -0
examples/tutorials/nvenc_tutorial.py                         +388  -0
examples/tutorials/online_asr_tutorial.py                    +60   -32
examples/tutorials/speech_recognition_pipeline_tutorial.py   +2    -3
examples/tutorials/squim_tutorial.py                         +390  -0
examples/tutorials/streamreader_advanced_tutorial.py         +7    -24
examples/tutorials/streamreader_basic_tutorial.py            +4    -29
examples/tutorials/streamwriter_advanced.py                  +18   -20
examples/tutorials/streamwriter_basic_tutorial.py            +20   -79
examples/tutorials/tacotron2_pipeline_tutorial.py            +40   -26
packaging/torchaudio/meta.yaml                               +1    -6
packaging/vs2019/conda_build_config.yaml                     +1    -3
packaging/windows/internal/cuda_install.bat                  +13   -13
pyproject.toml                                               +4    -1
setup.py                                                     +18   -39
test/integration_tests/conftest.py                           +1    -1
test/integration_tests/prototype/hifi_gan_pipeline_test.py   +58   -0
examples/tutorials/forced_alignment_tutorial.py
...
@@ -9,6 +9,25 @@ This tutorial shows how to align transcript to speech with
 `CTC-Segmentation of Large Corpora for German End-to-end Speech
 Recognition <https://arxiv.org/abs/2007.09127>`__.
+
+.. note::
+
+   This tutorial was originally written to illustrate a use case
+   for the Wav2Vec2 pretrained model.
+
+   TorchAudio now has a set of APIs designed for forced alignment.
+   The `CTC forced alignment API tutorial
+   <./ctc_forced_alignment_api_tutorial.html>`__ illustrates the
+   usage of :py:func:`torchaudio.functional.forced_align`, which is
+   the core API.
+
+   If you are looking to align your corpus, we recommend using
+   :py:class:`torchaudio.pipelines.Wav2Vec2FABundle`, which combines
+   :py:func:`~torchaudio.functional.forced_align` and other support
+   functions with a pre-trained model specifically trained for
+   forced alignment. Please refer to the
+   `Forced alignment for multilingual data
+   <forced_alignment_for_multilingual_data_tutorial.html>`__ tutorial,
+   which illustrates its usage.
 """

 import torch
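As context for the note above, the core API it recommends can be exercised in a few lines. The following is a minimal sketch and not part of this commit; the emission is a random placeholder for real acoustic-model output, and the exact signature of forced_align should be checked against the installed torchaudio version.

    import torch
    import torchaudio.functional as F

    # Placeholder emission: (batch=1, num_frames, num_labels) log-probabilities.
    # In practice this comes from an acoustic model such as Wav2Vec2.
    emission = torch.randn(1, 169, 29).log_softmax(dim=-1)
    # Placeholder token ids of the transcript; blank is assumed to be index 0.
    targets = torch.tensor([[7, 3, 12, 5]], dtype=torch.int32)

    # Returns the frame-level token path and per-frame scores.
    aligned_tokens, scores = F.forced_align(emission, targets, blank=0)
    print(aligned_tokens.shape)  # one token id per frame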
...
@@ -45,16 +64,11 @@ print(device)
 # First we import the necessary packages, and fetch data that we work on.
 #

-# %matplotlib inline
 from dataclasses import dataclass

 import IPython
-import matplotlib
 import matplotlib.pyplot as plt

-matplotlib.rcParams["figure.figsize"] = [16.0, 4.8]
-
 torch.random.manual_seed(0)

 SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
...
@@ -64,7 +78,7 @@ SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-
 # Generate frame-wise label probability
 # -------------------------------------
 #
-# The first step is to generate the label class porbability of each aduio
+# The first step is to generate the label class porbability of each audio
 # frame. We can use a Wav2Vec2 model that is trained for ASR. Here we use
 # :py:func:`torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H`.
 #
...
@@ -88,17 +102,24 @@ with torch.inference_mode():
 emission = emissions[0].cpu().detach()

+print(labels)
+
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~

-print(labels)
-plt.imshow(emission.T)
-plt.colorbar()
-plt.title("Frame-wise class probability")
-plt.xlabel("Time")
-plt.ylabel("Labels")
-plt.show()
+
+def plot():
+    fig, ax = plt.subplots()
+    img = ax.imshow(emission.T)
+    ax.set_title("Frame-wise class probability")
+    ax.set_xlabel("Time")
+    ax.set_ylabel("Labels")
+    fig.colorbar(img, ax=ax, shrink=0.6, location="bottom")
+    fig.tight_layout()
+
+
+plot()

 ######################################################################
 # Generate alignment probability (trellis)
...
@@ -138,7 +159,9 @@ plt.show()
 # [`distill.pub <https://distill.pub/2017/ctc/>`__])
 #

-transcript = "I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT"
+# We enclose the transcript with space tokens, which represent SOS and EOS.
+transcript = "|I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|"
 dictionary = {c: i for i, c in enumerate(labels)}

 tokens = [dictionary[c] for c in transcript]
...
@@ -149,21 +172,17 @@ def get_trellis(emission, tokens, blank_id=0):
     num_frame = emission.size(0)
     num_tokens = len(tokens)

-    # Trellis has extra diemsions for both time axis and tokens.
-    # The extra dim for tokens represents <SoS> (start-of-sentence)
-    # The extra dim for time axis is for simplification of the code.
-    trellis = torch.empty((num_frame + 1, num_tokens + 1))
-    trellis[0, 0] = 0
-    trellis[1:, 0] = torch.cumsum(emission[:, 0], 0)
-    trellis[0, -num_tokens:] = -float("inf")
-    trellis[-num_tokens:, 0] = float("inf")
+    trellis = torch.zeros((num_frame, num_tokens))
+    trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
+    trellis[0, 1:] = -float("inf")
+    trellis[-num_tokens + 1:, 0] = float("inf")

-    for t in range(num_frame):
+    for t in range(num_frame - 1):
         trellis[t + 1, 1:] = torch.maximum(
             # Score for staying at the same token
             trellis[t, 1:] + emission[t, blank_id],
             # Score for changing to the next token
-            trellis[t, :-1] + emission[t, tokens],
+            trellis[t, :-1] + emission[t, tokens[1:]],
         )
     return trellis
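The update inside the loop (in both versions) is an elementwise Viterbi step: each cell keeps the better of "stay on the current token while emitting blank" and "advance from the previous token". A toy numeric check with made-up scores, independent of the tutorial data:

    import torch

    # Scores after frame t for tokens 0..2, and emissions at frame t.
    prev = torch.tensor([0.0, -1.0, -5.0])
    blank_score = torch.tensor(-0.1)            # emission[t, blank_id]
    token_scores = torch.tensor([-0.5, -0.2])   # emission[t, tokens[1:]]

    stay = prev[1:] + blank_score      # keep the same token, emit blank
    change = prev[:-1] + token_scores  # move from the previous token to this one
    print(torch.maximum(stay, change))  # tensor([-0.5000, -1.2000])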
...
@@ -172,11 +191,19 @@ trellis = get_trellis(emission, tokens)
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~

-plt.imshow(trellis[1:, 1:].T, origin="lower")
-plt.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5))
-plt.colorbar()
-plt.show()
+
+def plot():
+    fig, ax = plt.subplots()
+    img = ax.imshow(trellis.T, origin="lower")
+    ax.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5))
+    ax.annotate("+ Inf", (trellis.size(0) - trellis.size(1) / 5, trellis.size(1) / 3))
+    fig.colorbar(img, ax=ax, shrink=0.6, location="bottom")
+    fig.tight_layout()
+
+
+plot()

 ######################################################################
 # In the above visualization, we can see that there is a trace of high
@@ -214,38 +241,38 @@ class Point:
...
@@ -214,38 +241,38 @@ class Point:
def
backtrack
(
trellis
,
emission
,
tokens
,
blank_id
=
0
):
def
backtrack
(
trellis
,
emission
,
tokens
,
blank_id
=
0
):
# Note:
t
,
j
=
trellis
.
size
(
0
)
-
1
,
trellis
.
size
(
1
)
-
1
# j and t are indices for trellis, which has extra dimensions
# for time and tokens at the beginning.
path
=
[
Point
(
j
,
t
,
emission
[
t
,
blank_id
].
exp
().
item
())]
# When referring to time frame index `T` in trellis,
while
j
>
0
:
# the corresponding index in emission is `T-1`.
# Should not happen but just in case
# Similarly, when referring to token index `J` in trellis,
assert
t
>
0
# the corresponding index in transcript is `J-1`.
j
=
trellis
.
size
(
1
)
-
1
t_start
=
torch
.
argmax
(
trellis
[:,
j
]).
item
()
path
=
[]
for
t
in
range
(
t_start
,
0
,
-
1
):
# 1. Figure out if the current position was stay or change
# 1. Figure out if the current position was stay or change
# Note (again):
# Frame-wise score of stay vs change
# `emission[J-1]` is the emission at time frame `J` of trellis dimension.
p_stay
=
emission
[
t
-
1
,
blank_id
]
# Score for token staying the same from time frame J-1 to T.
p_change
=
emission
[
t
-
1
,
tokens
[
j
]]
stayed
=
trellis
[
t
-
1
,
j
]
+
emission
[
t
-
1
,
blank_id
]
# Score for token changing from C-1 at T-1 to J at T.
# Context-aware score for stay vs change
changed
=
trellis
[
t
-
1
,
j
-
1
]
+
emission
[
t
-
1
,
tokens
[
j
-
1
]]
stayed
=
trellis
[
t
-
1
,
j
]
+
p_stay
changed
=
trellis
[
t
-
1
,
j
-
1
]
+
p_change
# 2. Store the path with frame-wise probability.
prob
=
emission
[
t
-
1
,
tokens
[
j
-
1
]
if
changed
>
stayed
else
0
].
exp
().
item
()
# Update position
# Return token index and time index in non-trellis coordinate.
t
-=
1
path
.
append
(
Point
(
j
-
1
,
t
-
1
,
prob
))
# 3. Update the token
if
changed
>
stayed
:
if
changed
>
stayed
:
j
-=
1
j
-=
1
if
j
==
0
:
break
# Store the path with frame-wise probability.
else
:
prob
=
(
p_change
if
changed
>
stayed
else
p_stay
).
exp
().
item
()
raise
ValueError
(
"Failed to align"
)
path
.
append
(
Point
(
j
,
t
,
prob
))
# Now j == 0, which means, it reached the SoS.
# Fill up the rest for the sake of visualization
while
t
>
0
:
prob
=
emission
[
t
-
1
,
blank_id
].
exp
().
item
()
path
.
append
(
Point
(
j
,
t
-
1
,
prob
))
t
-=
1
return
path
[::
-
1
]
return
path
[::
-
1
]
...
@@ -256,21 +283,28 @@ for p in path:
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~


 def plot_trellis_with_path(trellis, path):
     # To plot trellis with path, we take advantage of 'nan' value
     trellis_with_path = trellis.clone()
     for _, p in enumerate(path):
         trellis_with_path[p.time_index, p.token_index] = float("nan")
-    plt.imshow(trellis_with_path[1:, 1:].T, origin="lower")
+    plt.imshow(trellis_with_path.T, origin="lower")
+    plt.title("The path found by backtracking")
+    plt.tight_layout()


 plot_trellis_with_path(trellis, path)
-plt.title("The path found by backtracking")
-plt.show()

 ######################################################################
-# Looking good. Now this path contains repetations for the same labels, so
+# Looking good.
+
+######################################################################
+# Segment the path
+# ----------------
+#
+# Now this path contains repetations for the same labels, so
 # let’s merge them to make it close to the original transcript.
 #
 # When merging the multiple path points, we simply take the average
...
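The merge code itself is collapsed in this view. For reference, the upstream tutorial's merge step looks roughly like the sketch below: runs of identical token indices are collapsed into one segment whose score is the average frame probability. Treat this as a reconstruction from context, not the exact committed code.

    from dataclasses import dataclass


    @dataclass
    class Segment:
        label: str
        start: int  # first frame of the segment
        end: int    # one past the last frame
        score: float


    def merge_repeats(path, transcript):
        i1, i2 = 0, 0
        segments = []
        while i1 < len(path):
            # Extend i2 over the run of points sharing the same token index.
            while i2 < len(path) and path[i1].token_index == path[i2].token_index:
                i2 += 1
            score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
            segments.append(
                Segment(transcript[path[i1].token_index], path[i1].time_index, path[i2 - 1].time_index + 1, score)
            )
            i1 = i2
        return segments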
@@ -320,23 +354,24 @@ for seg in segments:
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~


 def plot_trellis_with_segments(trellis, segments, transcript):
     # To plot trellis with path, we take advantage of 'nan' value
     trellis_with_path = trellis.clone()
     for i, seg in enumerate(segments):
         if seg.label != "|":
-            trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan")
+            trellis_with_path[seg.start : seg.end, i] = float("nan")

-    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5))
+    fig, [ax1, ax2] = plt.subplots(2, 1, sharex=True)
     ax1.set_title("Path, label and probability for each label")
-    ax1.imshow(trellis_with_path.T, origin="lower")
-    ax1.set_xticks([])
+    ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")

     for i, seg in enumerate(segments):
         if seg.label != "|":
-            ax1.annotate(seg.label, (seg.start + 0.7, i + 0.3), weight="bold")
-            ax1.annotate(f"{seg.score:.2f}", (seg.start - 0.3, i + 4.3))
+            ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
+            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")

     ax2.set_title("Label probability with and without repetation")
     xs, hs, ws = [], [], []
...
@@ -345,7 +380,7 @@ def plot_trellis_with_segments(trellis, segments, transcript):
         xs.append((seg.end + seg.start) / 2 + 0.4)
         hs.append(seg.score)
         ws.append(seg.end - seg.start)
-        ax2.annotate(seg.label, (seg.start + 0.8, -0.07), weight="bold")
+        ax2.annotate(seg.label, (seg.start + 0.8, -0.07))
     ax2.bar(xs, hs, width=ws, color="gray", alpha=0.5, edgecolor="black")

     xs, hs = [], []
...
@@ -357,17 +392,21 @@ def plot_trellis_with_segments(trellis, segments, transcript):
     ax2.bar(xs, hs, width=0.5, alpha=0.5)
     ax2.axhline(0, color="black")
-    ax2.set_xlim(ax1.get_xlim())
+    ax2.grid(True, axis="y")
     ax2.set_ylim(-0.1, 1.1)
+    fig.tight_layout()


 plot_trellis_with_segments(trellis, segments, transcript)
-plt.tight_layout()
-plt.show()

 ######################################################################
-# Looks good. Now let’s merge the words. The Wav2Vec2 model uses ``'|'``
+# Looks good.
+
+######################################################################
+# Merge the segments into words
+# -----------------------------
+#
+# Now let’s merge the words. The Wav2Vec2 model uses ``'|'``
 # as the word boundary, so we merge the segments before each occurance of
 # ``'|'``.
 #
...
@@ -400,46 +439,43 @@ for word in word_segments:
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~


-def plot_alignments(trellis, segments, word_segments, waveform):
+def plot_alignments(trellis, segments, word_segments, waveform, sample_rate=bundle.sample_rate):
     trellis_with_path = trellis.clone()
     for i, seg in enumerate(segments):
         if seg.label != "|":
-            trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan")
+            trellis_with_path[seg.start : seg.end, i] = float("nan")

-    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5))
+    fig, [ax1, ax2] = plt.subplots(2, 1)

-    ax1.imshow(trellis_with_path[1:, 1:].T, origin="lower")
+    ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")
+    ax1.set_facecolor("lightgray")
     ax1.set_xticks([])
     ax1.set_yticks([])

     for word in word_segments:
-        ax1.axvline(word.start - 0.5)
-        ax1.axvline(word.end - 0.5)
+        ax1.axvspan(word.start - 0.5, word.end - 0.5, edgecolor="white", facecolor="none")

     for i, seg in enumerate(segments):
         if seg.label != "|":
-            ax1.annotate(seg.label, (seg.start, i + 0.3))
-            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 4), fontsize=8)
+            ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
+            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")

     # The original waveform
-    ratio = waveform.size(0) / (trellis.size(0) - 1)
-    ax2.plot(waveform)
+    ratio = waveform.size(0) / sample_rate / trellis.size(0)
+    ax2.specgram(waveform, Fs=sample_rate)
     for word in word_segments:
         x0 = ratio * word.start
         x1 = ratio * word.end
-        ax2.axvspan(x0, x1, alpha=0.1, color="red")
-        ax2.annotate(f"{word.score:.2f}", (x0, 0.8))
+        ax2.axvspan(x0, x1, facecolor="none", edgecolor="white", hatch="/")
+        ax2.annotate(f"{word.score:.2f}", (x0, sample_rate * 0.51), annotation_clip=False)

     for seg in segments:
         if seg.label != "|":
-            ax2.annotate(seg.label, (seg.start * ratio, 0.9))
+            ax2.annotate(seg.label, (seg.start * ratio, sample_rate * 0.55), annotation_clip=False)

-    xticks = ax2.get_xticks()
-    plt.xticks(xticks, xticks / bundle.sample_rate)
     ax2.set_xlabel("time [second]")
     ax2.set_yticks([])
-    ax2.set_ylim(-1.0, 1.0)
-    fig.tight_layout()
+    ax2.set_xlim(0, waveform.size(-1))


 plot_alignments(
...
@@ -448,16 +484,16 @@ plot_alignments(
     word_segments,
     waveform[0],
 )
-plt.show()

 ################################################################################
+# Audio Samples
+# -------------
 #
-# A trick to embed the resulting audio to the generated file.
-# `IPython.display.Audio` has to be the last call in a cell,
-# and there should be only one call par cell.


 def display_segment(i):
-    ratio = waveform.size(1) / (trellis.size(0) - 1)
+    ratio = waveform.size(1) / trellis.size(0)
     word = word_segments[i]
     x0 = int(ratio * word.start)
     x1 = int(ratio * word.end)
...
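The rest of display_segment is collapsed here. Judging from the surrounding code, it slices the waveform at the word boundary and returns an audio widget, roughly as sketched below; the print format is illustrative rather than the committed text.

    def display_segment(i):
        ratio = waveform.size(1) / trellis.size(0)
        word = word_segments[i]
        x0 = int(ratio * word.start)
        x1 = int(ratio * word.end)
        print(f"{word.label}: {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec")
        segment = waveform[:, x0:x1]
        # IPython.display.Audio must be the last expression in a notebook
        # cell for the player to render.
        return IPython.display.Audio(segment.numpy(), rate=bundle.sample_rate)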
examples/tutorials/hybrid_demucs_tutorial.py

...
@@ -45,6 +45,8 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)

+import matplotlib.pyplot as plt
+
 ######################################################################
 # In addition to ``torchaudio``, ``mir_eval`` is required to perform
 # signal-to-distortion ratio (SDR) calculations. To install ``mir_eval``
...
@@ -52,30 +54,9 @@ print(torchaudio.__version__)
 #

 from IPython.display import Audio
+from mir_eval import separation
+from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
 from torchaudio.utils import download_asset
-import matplotlib.pyplot as plt
-
-try:
-    from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
-    from mir_eval import separation
-except ModuleNotFoundError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install nightly
-            torch and torchaudio builds by adding the following code block to the top
-            of the notebook before running it:
-
-            !pip3 uninstall -y torch torchvision torchaudio
-            !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-            !pip3 install mir_eval
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise

 ######################################################################
 # 3. Construct the pipeline
...
@@ -130,11 +111,11 @@ from torchaudio.transforms import Fade

 def separate_sources(
     model,
     mix,
-    segment=10.,
+    segment=10.0,
     overlap=0.1,
     device=None,
 ):
     """
     Apply model to a given mixture. Use fade, and add segments together in order to add model segment by segment.
...
@@ -157,7 +138,7 @@ def separate_sources(
     start = 0
     end = chunk_len
     overlap_frames = overlap * sample_rate
-    fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape='linear')
+    fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape="linear")

     final = torch.zeros(batch, len(model.sources), channels, length, device=device)
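The loop that applies the fade is collapsed below; the cross-fade idea can be shown in isolation. A minimal sketch assuming stereo audio and the 10% overlap configured above (the tutorial's actual loop also adjusts the fade lengths between iterations):

    import torch
    from torchaudio.transforms import Fade

    sample_rate = 44100
    overlap_frames = int(0.1 * sample_rate)
    fade = Fade(fade_in_len=0, fade_out_len=overlap_frames, fade_shape="linear")

    chunk = torch.randn(1, 2, 2 * sample_rate)  # placeholder (batch, channels, time)
    faded = fade(chunk)  # the last overlap_frames samples ramp linearly to zero
    # Summing this faded tail with the faded-in head of the next chunk
    # cross-fades the segments and avoids clicks at the boundaries.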
...
@@ -181,11 +162,10 @@ def separate_sources(

 def plot_spectrogram(stft, title="Spectrogram"):
     magnitude = stft.abs()
     spectrogram = 20 * torch.log10(magnitude + 1e-8).numpy()
-    figure, axis = plt.subplots(1, 1)
-    img = axis.imshow(spectrogram, cmap="viridis", vmin=-60, vmax=0, origin="lower", aspect="auto")
-    figure.suptitle(title)
-    plt.colorbar(img, ax=axis)
-    plt.show()
+    _, axis = plt.subplots(1, 1)
+    axis.imshow(spectrogram, cmap="viridis", vmin=-60, vmax=0, origin="lower", aspect="auto")
+    axis.set_title(title)
+    plt.tight_layout()

 ######################################################################
...
@@ -208,7 +188,7 @@ def plot_spectrogram(stft, title="Spectrogram"):
 # We download the audio file from our storage. Feel free to download another file and use audio from a specific path

 SAMPLE_SONG = download_asset("tutorial-assets/hdemucs_mix.wav")
 waveform, sample_rate = torchaudio.load(SAMPLE_SONG)  # replace SAMPLE_SONG with desired path for different song
-waveform.to(device)
+waveform = waveform.to(device)
 mixture = waveform

 # parameters
...
@@ -265,12 +245,13 @@ stft = torchaudio.transforms.Spectrogram(
 # scores.
 #


 def output_results(original_source: torch.Tensor, predicted_source: torch.Tensor, source: str):
-    print("SDR score is:",
-          separation.bss_eval_sources(
-              original_source.detach().numpy(),
-              predicted_source.detach().numpy())[0].mean())
-    plot_spectrogram(stft(predicted_source)[0], f'Spectrogram {source}')
+    print(
+        "SDR score is:",
+        separation.bss_eval_sources(original_source.detach().numpy(), predicted_source.detach().numpy())[0].mean(),
+    )
+    plot_spectrogram(stft(predicted_source)[0], f"Spectrogram - {source}")
     return Audio(predicted_source, rate=sample_rate)
...
@@ -285,23 +266,19 @@ bass_original = download_asset("tutorial-assets/hdemucs_bass_segment.wav")
 vocals_original = download_asset("tutorial-assets/hdemucs_vocals_segment.wav")
 other_original = download_asset("tutorial-assets/hdemucs_other_segment.wav")

-drums_spec = audios["drums"][:, frame_start:frame_end]
+drums_spec = audios["drums"][:, frame_start:frame_end].cpu()
 drums, sample_rate = torchaudio.load(drums_original)
-drums.to(device)

-bass_spec = audios["bass"][:, frame_start:frame_end]
+bass_spec = audios["bass"][:, frame_start:frame_end].cpu()
 bass, sample_rate = torchaudio.load(bass_original)
-bass.to(device)

-vocals_spec = audios["vocals"][:, frame_start:frame_end]
+vocals_spec = audios["vocals"][:, frame_start:frame_end].cpu()
 vocals, sample_rate = torchaudio.load(vocals_original)
-vocals.to(device)

-other_spec = audios["other"][:, frame_start:frame_end]
+other_spec = audios["other"][:, frame_start:frame_end].cpu()
 other, sample_rate = torchaudio.load(other_original)
-other.to(device)

-mix_spec = mixture[:, frame_start:frame_end]
+mix_spec = mixture[:, frame_start:frame_end].cpu()

 ######################################################################
...
@@ -316,7 +293,7 @@ mix_spec = mixture[:, frame_start:frame_end]
 #
 # Mixture Clip

-plot_spectrogram(stft(mix_spec)[0], "Spectrogram Mixture")
+plot_spectrogram(stft(mix_spec)[0], "Spectrogram - Mixture")
 Audio(mix_spec, rate=sample_rate)

 ######################################################################
...
examples/tutorials/mvdr_tutorial.py
...
@@ -37,6 +37,10 @@ print(torch.__version__)
 print(torchaudio.__version__)

+import matplotlib.pyplot as plt
+import mir_eval
+from IPython.display import Audio
+
 ######################################################################
 # 2. Preparation
 # --------------
...
@@ -59,10 +63,6 @@ print(torchaudio.__version__)
 from pesq import pesq
 from pystoi import stoi
-import mir_eval
-import matplotlib.pyplot as plt
-from IPython.display import Audio
 from torchaudio.utils import download_asset

 ######################################################################
...
@@ -98,23 +98,21 @@ SAMPLE_NOISE = download_asset("tutorial-assets/mvdr/noise.wav")
 #


-def plot_spectrogram(stft, title="Spectrogram", xlim=None):
+def plot_spectrogram(stft, title="Spectrogram"):
     magnitude = stft.abs()
     spectrogram = 20 * torch.log10(magnitude + 1e-8).numpy()
     figure, axis = plt.subplots(1, 1)
     img = axis.imshow(spectrogram, cmap="viridis", vmin=-100, vmax=0, origin="lower", aspect="auto")
-    figure.suptitle(title)
+    axis.set_title(title)
     plt.colorbar(img, ax=axis)
-    plt.show()


-def plot_mask(mask, title="Mask", xlim=None):
+def plot_mask(mask, title="Mask"):
     mask = mask.numpy()
     figure, axis = plt.subplots(1, 1)
     img = axis.imshow(mask, cmap="viridis", origin="lower", aspect="auto")
-    figure.suptitle(title)
+    axis.set_title(title)
     plt.colorbar(img, ax=axis)
-    plt.show()


 def si_snr(estimate, reference, epsilon=1e-8):
...
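The body of si_snr is collapsed in this view. For reference, scale-invariant SNR is conventionally computed as below; this is a sketch of the standard formula rather than the tutorial's exact code (some implementations, possibly including this one, also mean-center both signals first).

    import torch


    def si_snr_sketch(estimate, reference, epsilon=1e-8):
        # Project the estimate onto the reference to find the optimal scale,
        # then compare the energy of the scaled reference with the residual.
        reference_energy = torch.sum(reference**2, dim=-1, keepdim=True) + epsilon
        scale = torch.sum(estimate * reference, dim=-1, keepdim=True) / reference_energy
        target = scale * reference
        residual = estimate - target
        ratio = (torch.sum(target**2, dim=-1) + epsilon) / (torch.sum(residual**2, dim=-1) + epsilon)
        return 10 * torch.log10(ratio)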
examples/tutorials/nvdec_tutorial.py (new file, mode 100644)
"""
Accelerated video decoding with NVDEC
=====================================
.. _nvdec_tutorial:
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC)
with TorchAudio, and how it improves the performance of video decoding.
"""
######################################################################
#
# .. note::
#
# This tutorial requires FFmpeg libraries compiled with HW
# acceleration enabled.
#
# Please refer to
# :ref:`Enabling GPU video decoder/encoder <enabling_hw_decoder>`
# for how to build FFmpeg with HW acceleration.
#
import
torch
import
torchaudio
print
(
torch
.
__version__
)
print
(
torchaudio
.
__version__
)
######################################################################
#
import
os
import
time
import
matplotlib.pyplot
as
plt
from
torchaudio.io
import
StreamReader
######################################################################
#
# Check the prerequisites
# -----------------------
#
# First, we check that TorchAudio correctly detects FFmpeg libraries
# that support HW decoder/encoder.
#
from
torchaudio.utils
import
ffmpeg_utils
######################################################################
#
print
(
"FFmpeg Library versions:"
)
for
k
,
ver
in
ffmpeg_utils
.
get_versions
().
items
():
print
(
f
"
{
k
}
:
\t
{
'.'
.
join
(
str
(
v
)
for
v
in
ver
)
}
"
)
######################################################################
#
print
(
"Available NVDEC Decoders:"
)
for
k
in
ffmpeg_utils
.
get_video_decoders
().
keys
():
if
"cuvid"
in
k
:
print
(
f
" -
{
k
}
"
)
######################################################################
#
print
(
"Avaialbe GPU:"
)
print
(
torch
.
cuda
.
get_device_properties
(
0
))
######################################################################
#
# We will use the following video, which has these properties:
#
# - Codec: H.264
# - Resolution: 960x540
# - FPS: 29.97
# - Pixel format: YUV420P
#
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" type="video/mp4">
#    </video>

######################################################################
#
src = torchaudio.utils.download_asset(
    "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
)

######################################################################
# Decoding videos with NVDEC
# --------------------------
#
# To use the HW video decoder, you need to specify the HW decoder when
# defining the output video stream, by passing the ``decoder`` option to
# the :py:meth:`~torchaudio.io.StreamReader.add_video_stream` method.
#

s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid")
s.fill_buffer()
(video,) = s.pop_chunks()

######################################################################
#
# The video frames are decoded and returned as a tensor in NCHW format.

print(video.shape, video.dtype)

######################################################################
#
# By default, the decoded frames are sent back to CPU memory, and
# CPU tensors are created.

print(video.device)

######################################################################
#
# By specifying the ``hw_accel`` option, you can convert the decoded frames
# to CUDA tensors. The ``hw_accel`` option takes a string value, which is
# passed to :py:class:`torch.device`.
#
# .. note::
#
#    Currently, the ``hw_accel`` option and
#    :py:meth:`~torchaudio.io.StreamReader.add_basic_video_stream`
#    are not compatible. ``add_basic_video_stream`` adds a post-decoding
#    process, which is designed for frames in CPU memory.
#    Please use :py:meth:`~torchaudio.io.StreamReader.add_video_stream`.
#

s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid", hw_accel="cuda:0")
s.fill_buffer()
(video,) = s.pop_chunks()

print(video.shape, video.dtype, video.device)

######################################################################
# .. note::
#
#    When there are multiple GPUs available, ``StreamReader`` by
#    default uses the first GPU. You can change this by providing the
#    ``"gpu"`` option.
#
#    .. code::
#
#       # Video data is sent to CUDA device 0, decoded and
#       # converted on the same device.
#       s.add_video_stream(
#           ...,
#           decoder="h264_cuvid",
#           decoder_option={"gpu": "0"},
#           hw_accel="cuda:0",
#       )
#
# .. note::
#
#    The ``"gpu"`` option and the ``hw_accel`` option can be specified
#    independently. If they do not match, decoded frames are
#    transferred to the device specified by ``hw_accel``
#    automatically.
#
#    .. code::
#
#       # Video data is sent to CUDA device 0, and decoded there.
#       # Then it is transferred to CUDA device 1, and converted to
#       # CUDA tensor.
#       s.add_video_stream(
#           ...,
#           decoder="h264_cuvid",
#           decoder_option={"gpu": "0"},
#           hw_accel="cuda:1",
#       )
######################################################################
# Visualization
# -------------
#
# Let's look at the frames decoded by the HW decoder and compare them
# against equivalent results from software decoders.
#
# The following function seeks to the given timestamp and decodes one
# frame with the specified decoder.


def test_decode(decoder: str, seek: float):
    s = StreamReader(src)
    s.seek(seek)
    s.add_video_stream(1, decoder=decoder)
    s.fill_buffer()
    (video,) = s.pop_chunks()
    return video[0]


######################################################################
#
timestamps = [12, 19, 45, 131, 180]

cpu_frames = [test_decode(decoder="h264", seek=ts) for ts in timestamps]
cuda_frames = [test_decode(decoder="h264_cuvid", seek=ts) for ts in timestamps]

######################################################################
#
# .. note::
#
#    Currently, the HW decoder does not support colorspace conversion.
#    Decoded frames are in YUV format.
#    The following function performs YUV to RGB conversion
#    (and axis shuffling for plotting).


def yuv_to_rgb(frames):
    frames = frames.cpu().to(torch.float)
    y = frames[..., 0, :, :]
    u = frames[..., 1, :, :]
    v = frames[..., 2, :, :]

    y /= 255
    u = u / 255 - 0.5
    v = v / 255 - 0.5

    r = y + 1.14 * v
    g = y + -0.396 * u - 0.581 * v
    b = y + 2.029 * u

    rgb = torch.stack([r, g, b], -1)
    rgb = (rgb * 255).clamp(0, 255).to(torch.uint8)
    return rgb.numpy()
######################################################################
#
# Now we visualize the results.
#


def plot():
    n_rows = len(timestamps)
    fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0])
    for i in range(n_rows):
        axes[i][0].imshow(yuv_to_rgb(cpu_frames[i]))
        axes[i][1].imshow(yuv_to_rgb(cuda_frames[i]))
    axes[0][0].set_title("Software decoder")
    axes[0][1].set_title("HW decoder")
    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()


plot()

######################################################################
#
# They are indistinguishable to the eyes of the author.
# Feel free to let us know if you spot something. :)
#

######################################################################
# HW resizing and cropping
# ------------------------
#
# You can use the ``decoder_option`` argument to provide decoder-specific
# options.
#
# The following options are often relevant in preprocessing.
#
# - ``resize``: Resize the frame into ``(width)x(height)``.
# - ``crop``: Crop the frame ``(top)x(bottom)x(left)x(right)``.
#   Note that the specified values are the number of rows/columns removed.
#   The final image size is ``(width - left - right)x(height - top - bottom)``.
#   For example, cropping the 960x540 test video with ``135x135x240x240``
#   removes 135 rows from the top and bottom and 240 columns from each
#   side, leaving a 480x270 frame.
#   If the ``crop`` and ``resize`` options are used together,
#   ``crop`` is performed first.
#
# For other available options, please run
# ``ffmpeg -h decoder=h264_cuvid``.
#


def test_options(option):
    s = StreamReader(src)
    s.seek(87)
    s.add_video_stream(1, decoder="h264_cuvid", hw_accel="cuda:0", decoder_option=option)
    s.fill_buffer()
    (video,) = s.pop_chunks()
    print(f"Option: {option}:\t{video.shape}")
    return video[0]


######################################################################
#
original = test_options(option=None)
resized = test_options(option={"resize": "480x270"})
cropped = test_options(option={"crop": "135x135x240x240"})
cropped_and_resized = test_options(option={"crop": "135x135x240x240", "resize": "640x360"})
######################################################################
#
def plot():
    fig, axes = plt.subplots(2, 2, figsize=[12.8, 9.6])
    axes[0][0].imshow(yuv_to_rgb(original))
    axes[0][1].imshow(yuv_to_rgb(resized))
    axes[1][0].imshow(yuv_to_rgb(cropped))
    axes[1][1].imshow(yuv_to_rgb(cropped_and_resized))
    axes[0][0].set_title("Original")
    axes[0][1].set_title("Resized")
    axes[1][0].set_title("Cropped")
    axes[1][1].set_title("Cropped and resized")
    plt.tight_layout()
    return fig


plot()

######################################################################
# Comparing resizing methods
# --------------------------
#
# Unlike software scaling, NVDEC does not provide an option to choose
# the scaling algorithm.
# In ML applications, it is often necessary to construct a
# preprocessing pipeline with similar numerical properties.
# So here we compare the result of hardware resizing with software
# resizing of different algorithms.
#
# We will use the following video, which contains a test pattern
# generated with the following command.
#
# .. code::
#
#    ffmpeg -y -f lavfi -t 12.05 -i mptestsrc -movflags +faststart mptestsrc.mp4
#
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/mptestsrc.mp4" type="video/mp4">
#    </video>

######################################################################
#
test_src = torchaudio.utils.download_asset("tutorial-assets/mptestsrc.mp4")
######################################################################
# The following function decodes the video and
# applies the specified scaling algorithm.
#


def decode_resize_ffmpeg(mode, height, width, seek):
    filter_desc = None if mode is None else f"scale={width}:{height}:sws_flags={mode}"
    s = StreamReader(test_src)
    s.add_video_stream(1, filter_desc=filter_desc)
    s.seek(seek)
    s.fill_buffer()
    (chunk,) = s.pop_chunks()
    return chunk


######################################################################
# The following function uses the HW decoder to decode the video and resize it.
#


def decode_resize_cuvid(height, width, seek):
    s = StreamReader(test_src)
    s.add_video_stream(1, decoder="h264_cuvid", decoder_option={"resize": f"{width}x{height}"}, hw_accel="cuda:0")
    s.seek(seek)
    s.fill_buffer()
    (chunk,) = s.pop_chunks()
    return chunk.cpu()


######################################################################
# Now we execute them and visualize the resulting frames.

params = {"height": 224, "width": 224, "seek": 3}

frames = [
    decode_resize_ffmpeg(None, **params),
    decode_resize_ffmpeg("neighbor", **params),
    decode_resize_ffmpeg("bilinear", **params),
    decode_resize_ffmpeg("bicubic", **params),
    decode_resize_cuvid(**params),
    decode_resize_ffmpeg("spline", **params),
    decode_resize_ffmpeg("lanczos:param0=1", **params),
    decode_resize_ffmpeg("lanczos:param0=3", **params),
    decode_resize_ffmpeg("lanczos:param0=5", **params),
]
######################################################################
#
def plot():
    fig, axes = plt.subplots(3, 3, figsize=[12.8, 15.2])
    for i, f in enumerate(frames):
        h, w = f.shape[2:4]
        f = f[..., : h // 4, : w // 4]
        axes[i // 3][i % 3].imshow(yuv_to_rgb(f[0]))
    axes[0][0].set_title("Original")
    axes[0][1].set_title("nearest neighbor")
    axes[0][2].set_title("bilinear")
    axes[1][0].set_title("bicubic")
    axes[1][1].set_title("NVDEC")
    axes[1][2].set_title("spline")
    axes[2][0].set_title("lanczos(1)")
    axes[2][1].set_title("lanczos(3)")
    axes[2][2].set_title("lanczos(5)")
    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()


plot()

######################################################################
# None of them is exactly the same. To the eyes of the authors, lanczos(1)
# appears to be most similar to NVDEC.
# The bicubic looks close as well.

######################################################################
#
# Benchmark NVDEC with StreamReader
# ---------------------------------
#
# In this section, we compare the performance of software video
# decoding and HW video decoding.
#

######################################################################
# Decode as CUDA frames
# ---------------------
#
# First, we compare the time it takes for the software decoder and
# the hardware decoder to decode the same video.
# To make the result comparable, when using the software decoder, we move
# the resulting tensor to CUDA.
#
# The procedures to test look like the following:
#
# - Use the hardware decoder and place the data on CUDA directly.
# - Use the software decoder, generate CPU tensors and move them to CUDA.
#
# .. note:
#
#    Because the HW decoder currently only supports reading videos in
#    YUV444P format, we decode frames into YUV444P format for the
#    software decoder as well.
#
######################################################################
# The following function implements the hardware decoder test case.


def test_decode_cuda(src, decoder, hw_accel="cuda", frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder=decoder, hw_accel=hw_accel)

    num_frames = 0
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
    elapsed = time.monotonic() - t0

    print(f" - Shape: {chunk.shape}")
    fps = num_frames / elapsed
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# The following function implements the software decoder test case.


def test_decode_cpu(src, threads, decoder=None, frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder=decoder, decoder_option={"threads": f"{threads}"})

    num_frames = 0
    device = torch.device("cuda")
    t0 = time.monotonic()
    for i, (chunk,) in enumerate(s.stream()):
        if i == 0:
            print(f" - Shape: {chunk.shape}")
        num_frames += chunk.shape[0]
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0

    fps = num_frames / elapsed
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# For each video resolution, we run multiple software decoder test
# cases with different numbers of threads.


def run_decode_tests(src, frames_per_chunk=5):
    fps = []
    print(f"Testing: {os.path.basename(src)}")
    for threads in [1, 4, 8, 16]:
        print(f"* Software decoding (num_threads={threads})")
        fps.append(test_decode_cpu(src, threads))
    print("* Hardware decoding")
    fps.append(test_decode_cuda(src, decoder="h264_cuvid"))
    return fps


######################################################################
# Now we run the tests with videos of different resolutions.
#
# QVGA
# ----

src_qvga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_qvga.h264.mp4")
fps_qvga = run_decode_tests(src_qvga)

######################################################################
# VGA
# ---

src_vga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_vga.h264.mp4")
fps_vga = run_decode_tests(src_vga)

######################################################################
# XGA
# ---

src_xga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_xga.h264.mp4")
fps_xga = run_decode_tests(src_xga)
######################################################################
# Result
# ------
#
# Now we plot the result.


def plot():
    fig, ax = plt.subplots(figsize=[9.6, 6.4])
    for items in zip(fps_qvga, fps_vga, fps_xga, "ov^sx"):
        ax.plot(items[:-1], marker=items[-1])
    ax.grid(axis="both")
    ax.set_xticks([0, 1, 2], ["QVGA (320x240)", "VGA (640x480)", "XGA (1024x768)"])
    ax.legend(
        [
            "Software Decoding (threads=1)",
            "Software Decoding (threads=4)",
            "Software Decoding (threads=8)",
            "Software Decoding (threads=16)",
            "Hardware Decoding (CUDA Tensor)",
        ]
    )
    ax.set_title("Speed of processing video frames")
    ax.set_ylabel("Frames per second")
    plt.tight_layout()


plot()

######################################################################
#
# We observe a couple of things:
#
# - Increasing the number of threads in software decoding makes the
#   pipeline faster, but the performance saturates around 8 threads.
# - The performance gain from using the hardware decoder depends on the
#   resolution of the video.
#
#   - At lower resolutions like QVGA, hardware decoding is slower than
#     software decoding.
#   - At higher resolutions like XGA, hardware decoding is faster
#     than software decoding.
#
# It is worth noting that the performance gain also depends on the
# type of GPU.
# We observed that when decoding VGA videos using V100 or A100 GPUs,
# hardware decoders are slower than software decoders. But using an A10
# GPU, the hardware decoder is faster than the software decoder.
#

######################################################################
# Decode and resize
# -----------------
#
# Next, we add a resize operation to the pipeline.
# We will compare the following pipelines.
#
# 1. Decode video using the software decoder and read the frames as
#    PyTorch Tensors. Resize the tensors using
#    :py:func:`torch.nn.functional.interpolate`, then send
#    the resulting tensors to the CUDA device.
# 2. Decode video using the software decoder, resize the frames with
#    FFmpeg's filter graph, read the resized frames as PyTorch tensors,
#    then send them to the CUDA device.
# 3. Decode and resize the video simultaneously with the HW decoder, and
#    read the resulting frames as CUDA tensors.
#
# Pipeline 1 represents common video loading implementations.
#
# Pipeline 2 uses FFmpeg's filter graph, which allows manipulating
# raw frames before converting them to Tensors.
#
# Pipeline 3 has the minimum amount of data transfer from CPU to
# CUDA, which significantly contributes to performant data loading.
#
######################################################################
# The following function implements pipeline 1. It uses PyTorch's
# :py:func:`torch.nn.functional.interpolate`.
# We use ``bicubic`` mode, as we saw that the resulting frames are
# closest to NVDEC resizing.
#


def test_decode_then_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder_option={"threads": "8"})

    num_frames = 0
    device = torch.device("cuda")
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
        chunk = torch.nn.functional.interpolate(chunk, [height, width], mode=mode, antialias=True)
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0

    fps = num_frames / elapsed
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# The following function implements pipeline 2. Frames are resized
# as part of the decoding process, then sent to the CUDA device.
#
# We use ``bicubic`` mode, to make the result comparable with the
# PyTorch-based implementation above.
#


def test_decode_and_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(
        frames_per_chunk, filter_desc=f"scale={width}:{height}:sws_flags={mode}", decoder_option={"threads": "8"}
    )

    num_frames = 0
    device = torch.device("cuda")
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0

    fps = num_frames / elapsed
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# The following function implements pipeline 3. Resizing is
# performed by NVDEC and the resulting tensor is placed in CUDA memory.


def test_hw_decode_and_resize(src, decoder, decoder_option, hw_accel="cuda", frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(5, decoder=decoder, decoder_option=decoder_option, hw_accel=hw_accel)

    num_frames = 0
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
    elapsed = time.monotonic() - t0

    fps = num_frames / elapsed
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps
######################################################################
#
# The following function run the benchmark functions on given sources.
#
def
run_resize_tests
(
src
):
print
(
f
"Testing:
{
os
.
path
.
basename
(
src
)
}
"
)
height
,
width
=
224
,
224
print
(
"* Software decoding with PyTorch interpolate"
)
cpu_resize1
=
test_decode_then_resize
(
src
,
height
=
height
,
width
=
width
)
print
(
"* Software decoding with FFmpeg scale"
)
cpu_resize2
=
test_decode_and_resize
(
src
,
height
=
height
,
width
=
width
)
print
(
"* Hardware decoding with resize"
)
cuda_resize
=
test_hw_decode_and_resize
(
src
,
decoder
=
"h264_cuvid"
,
decoder_option
=
{
"resize"
:
f
"
{
width
}
x
{
height
}
"
})
return
[
cpu_resize1
,
cpu_resize2
,
cuda_resize
]
######################################################################
#
# Now we run the tests.
######################################################################
# QVGA
# ----
fps_qvga
=
run_resize_tests
(
src_qvga
)
######################################################################
# VGA
# ---
fps_vga
=
run_resize_tests
(
src_vga
)
######################################################################
# XGA
# ---
fps_xga
=
run_resize_tests
(
src_xga
)
######################################################################
# Result
# ------
# Now we plot the result.
#
def
plot
():
fig
,
ax
=
plt
.
subplots
(
figsize
=
[
9.6
,
6.4
])
for
items
in
zip
(
fps_qvga
,
fps_vga
,
fps_xga
,
"ov^sx"
):
ax
.
plot
(
items
[:
-
1
],
marker
=
items
[
-
1
])
ax
.
grid
(
axis
=
"both"
)
ax
.
set_xticks
([
0
,
1
,
2
],
[
"QVGA (320x240)"
,
"VGA (640x480)"
,
"XGA (1024x768)"
])
ax
.
legend
(
[
"Software decoding
\n
with resize
\n
(PyTorch interpolate)"
,
"Software decoding
\n
with resize
\n
(FFmpeg scale)"
,
"NVDEC
\n
with resizing"
,
]
)
ax
.
set_title
(
"Speed of processing video frames"
)
ax
.
set_xlabel
(
"Input video resolution"
)
ax
.
set_ylabel
(
"Frames per second"
)
plt
.
tight_layout
()
plot
()
######################################################################
#
# The hardware decoder shows a similar trend to the previous experiment.
# In fact, the performance is almost the same: hardware resizing has
# almost zero overhead for scaling down the frames.
#
# Software decoding also shows a similar trend. Performing resizing as
# part of decoding is faster. One possible explanation is that video
# frames are internally stored as YUV420P, which has half as many
# pixels as RGB24 or YUV444P. This means that if resizing is done
# before the frame data are copied to a PyTorch tensor, the number of
# pixels manipulated and copied is smaller than when resizing is
# applied after the frames are converted to a tensor. The quick
# arithmetic check below illustrates the pixel counts involved.
#
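
######################################################################
# As a rough, illustrative sanity check of that pixel-count argument
# (numbers only, not part of the benchmark): a WxH frame in YUV420P
# carries 1.5 samples per pixel (a full-size luma plane plus two
# quarter-size chroma planes), while RGB24 and YUV444P carry 3 per
# pixel.

w, h = 1024, 768  # XGA, the largest resolution tested above
samples_yuv420p = int(w * h * 1.5)  # 1,179,648 samples
samples_rgb24 = w * h * 3  # 2,359,296 samples
print(samples_rgb24 / samples_yuv420p)  # 2.0 -- twice the data to touch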
######################################################################
#
# Tag: :obj:`torchaudio.io`
examples/tutorials/nvenc_tutorial.py
0 → 100644
View file @
ffeba11a
"""
Accelerated video encoding with NVENC
=====================================
.. _nvenc_tutorial:
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use NVIDIA’s hardware video encoder (NVENC)
with TorchAudio, and how it improves the performance of video encoding.
"""
######################################################################
# .. note::
#
# This tutorial requires FFmpeg libraries compiled with HW
# acceleration enabled.
#
# Please refer to
# :ref:`Enabling GPU video decoder/encoder <enabling_hw_decoder>`
# for how to build FFmpeg with HW acceleration.
#
# .. note::
#
#    Most modern GPUs have both HW decoder and encoder, but some
#    high-end GPUs like A100 and H100 do not have HW encoder.
# Please refer to the following for the availability and
# format coverage.
# https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new
#
# Attempting to use HW encoder on these GPUs fails with an error
# message like ``Generic error in an external library``.
# You can enable debug log with
# :py:func:`torchaudio.utils.ffmpeg_utils.set_log_level` to see more
# detailed error messages issued along the way.
#
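#
#    For example, a minimal sketch (assuming the integer follows
#    FFmpeg's log-level convention, where larger values are more
#    verbose and 48 corresponds to ``AV_LOG_DEBUG``; pick a level that
#    suits your needs)::
#
#       from torchaudio.utils import ffmpeg_utils
#
#       ffmpeg_utils.set_log_level(48)
#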
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

import io
import time

import matplotlib.pyplot as plt
from IPython.display import Video
from torchaudio.io import StreamReader, StreamWriter
######################################################################
#
# Check the prerequisites
# -----------------------
#
# First, we check that TorchAudio correctly detects FFmpeg libraries
# that support HW decoder/encoder.
#
from torchaudio.utils import ffmpeg_utils
######################################################################
#
print("FFmpeg Library versions:")
for k, ver in ffmpeg_utils.get_versions().items():
    print(f"{k}:\t{'.'.join(str(v) for v in ver)}")
######################################################################
#
print("Available NVENC Encoders:")
for k in ffmpeg_utils.get_video_encoders().keys():
    if "nvenc" in k:
        print(f" - {k}")
######################################################################
#
print("Available GPU:")
print(torch.cuda.get_device_properties(0))
######################################################################
# We use the following helper function to generate test frame data.
# For the details of synthetic video generation, please refer to
# :ref:`StreamReader Advanced Usage <lavfi>`.
def get_data(height, width, format="yuv444p", frame_rate=30000 / 1001, duration=4):
    src = f"testsrc2=rate={frame_rate}:size={width}x{height}:duration={duration}"
    s = StreamReader(src=src, format="lavfi")
    s.add_basic_video_stream(-1, format=format)
    s.process_all_packets()
    (video,) = s.pop_chunks()
    return video
######################################################################
# Encoding videos with NVENC
# --------------------------
#
# To use the HW video encoder, you need to specify the HW encoder when
# defining the output video stream, by providing the ``encoder`` option
# to :py:meth:`~torchaudio.io.StreamWriter.add_video_stream`.
#
######################################################################
#
pict_config = {
    "height": 360,
    "width": 640,
    "frame_rate": 30000 / 1001,
    "format": "yuv444p",
}

frame_data = get_data(**pict_config)

######################################################################
#
w = StreamWriter(io.BytesIO(), format="mp4")
w.add_video_stream(**pict_config, encoder="h264_nvenc", encoder_format="yuv444p")
with w.open():
    w.write_video_chunk(0, frame_data)
######################################################################
# Similar to the HW decoder, by default, the encoder expects the frame
# data to be on CPU memory. To send data from CUDA memory, you need to
# specify the ``hw_accel`` option.
#
buffer = io.BytesIO()
w = StreamWriter(buffer, format="mp4")
w.add_video_stream(**pict_config, encoder="h264_nvenc", encoder_format="yuv444p", hw_accel="cuda:0")
with w.open():
    w.write_video_chunk(0, frame_data.to(torch.device("cuda:0")))
buffer.seek(0)
video_cuda = buffer.read()
######################################################################
#
Video(video_cuda, embed=True, mimetype="video/mp4")
######################################################################
# Benchmark NVENC with StreamWriter
# ---------------------------------
#
# Now we compare the performance of the software encoder and the
# hardware encoder.
#
# Similar to the NVDEC benchmark, we process videos of different
# resolutions and measure the time it takes to encode them.
#
# We also measure the size of the resulting video file.
######################################################################
# The following function encodes the given frames and measures the
# time it takes to encode them, along with the size of the resulting
# video data.
#


def test_encode(data, encoder, width, height, hw_accel=None, **config):
    assert data.is_cuda

    buffer = io.BytesIO()
    s = StreamWriter(buffer, format="mp4")
    s.add_video_stream(encoder=encoder, width=width, height=height, hw_accel=hw_accel, **config)
    with s.open():
        t0 = time.monotonic()
        if hw_accel is None:
            data = data.to("cpu")
        s.write_video_chunk(0, data)
        elapsed = time.monotonic() - t0
    size = buffer.tell()
    fps = len(data) / elapsed
    print(f" - Processed {len(data)} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    print(f" - Encoded data size: {size} bytes")
    return elapsed, size
######################################################################
# We conduct the tests for the following configurations:
#
# - Software encoder with 1, 4 and 8 threads
# - Hardware encoder, with and without the ``hw_accel`` option
#
def run_tests(height, width, duration=4):
    # Generate the test data
    print(f"Testing resolution: {width}x{height}")
    pict_config = {
        "height": height,
        "width": width,
        "frame_rate": 30000 / 1001,
        "format": "yuv444p",
    }

    data = get_data(**pict_config, duration=duration)
    data = data.to(torch.device("cuda:0"))

    times = []
    sizes = []

    # Test software encoding
    encoder_config = {
        "encoder": "libx264",
        "encoder_format": "yuv444p",
    }
    for i, num_threads in enumerate([1, 4, 8]):
        print(f"* Software Encoder (num_threads={num_threads})")
        time_, size = test_encode(
            data,
            encoder_option={"threads": str(num_threads)},
            **pict_config,
            **encoder_config,
        )
        times.append(time_)
        if i == 0:
            sizes.append(size)

    # Test hardware encoding
    encoder_config = {
        "encoder": "h264_nvenc",
        "encoder_format": "yuv444p",
        "encoder_option": {"gpu": "0"},
    }
    for i, hw_accel in enumerate([None, "cuda"]):
        print(f"* Hardware Encoder {'(CUDA frames)' if hw_accel else ''}")
        time_, size = test_encode(
            data,
            **pict_config,
            **encoder_config,
            hw_accel=hw_accel,
        )
        times.append(time_)
        if i == 0:
            sizes.append(size)
    return times, sizes
######################################################################
# And we change the resolution of videos to see how these measurements
# change.
#
# 360P
# ----
#
time_360, size_360 = run_tests(360, 640)

######################################################################
# 720P
# ----
#
time_720, size_720 = run_tests(720, 1280)

######################################################################
# 1080P
# -----
#
time_1080, size_1080 = run_tests(1080, 1920)
######################################################################
# Now we plot the result.
#


def plot():
    fig, axes = plt.subplots(2, 1, sharex=True, figsize=[9.6, 7.2])

    for items in zip(time_360, time_720, time_1080, "ov^X+"):
        axes[0].plot(items[:-1], marker=items[-1])
    axes[0].grid(axis="both")
    axes[0].set_xticks([0, 1, 2], ["360p", "720p", "1080p"], visible=True)
    axes[0].tick_params(labeltop=False)
    axes[0].legend(
        [
            "Software Encoding (threads=1)",
            "Software Encoding (threads=4)",
            "Software Encoding (threads=8)",
            "Hardware Encoding (CPU Tensor)",
            "Hardware Encoding (CUDA Tensor)",
        ]
    )
    axes[0].set_title("Time to encode videos with different resolutions")
    axes[0].set_ylabel("Time [s]")

    for items in zip(size_360, size_720, size_1080, "v^"):
        axes[1].plot(items[:-1], marker=items[-1])
    axes[1].grid(axis="both")
    axes[1].set_xticks([0, 1, 2], ["360p", "720p", "1080p"])
    axes[1].set_ylabel("The encoded size [bytes]")
    axes[1].set_title("The size of encoded videos")
    axes[1].legend(
        [
            "Software Encoding",
            "Hardware Encoding",
        ]
    )
    plt.tight_layout()


plot()
######################################################################
# Result
# ------
#
# We observe a couple of things:
#
# - The time to encode video grows as the resolution becomes larger.
# - In the case of software encoding, increasing the number of threads
#   helps reduce the encoding time.
# - The gain from extra threads diminishes around 8.
# - Hardware encoding is faster than software encoding in general.
# - Using ``hw_accel`` does not improve the speed of encoding itself
#   as much.
# - The size of the resulting videos grows as the resolution becomes
#   larger.
# - The hardware encoder produces smaller video files at larger
#   resolutions.
#
# The last point is somewhat strange to the author (who is not an
# expert in video production).
# It is often said that hardware encoders produce larger videos
# compared to software encoders.
# Some say that software encoders allow fine-grained control over the
# encoding configuration, so the resulting video is more optimal.
# Meanwhile, hardware encoders are optimized for performance, and thus
# do not provide as much control over quality and binary size.
#
######################################################################
# Quality Spotcheck
# -----------------
#
# So, what is the quality of the videos produced with the hardware
# encoder? A quick spot check reveals that they have more noticeable
# artifacts at higher resolutions, which might explain the smaller
# binary size (that is, the encoder is not allocating enough bits to
# produce quality output).
#
# The following images are raw frames of videos encoded with the
# hardware encoder.
#

######################################################################
# 360P
# ----
#
# .. raw:: html
#
#    <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_360_097.png" alt="NVENC sample 360P">

######################################################################
# 720P
# ----
#
# .. raw:: html
#
#    <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_720_097.png" alt="NVENC sample 720P">

######################################################################
# 1080P
# -----
#
# .. raw:: html
#
#    <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_1080_097.png" alt="NVENC sample 1080P">

######################################################################
#
# The artifacts are clearly more noticeable at higher resolutions.
#
# One might be able to reduce them with the ``encoder_option``
# argument, as sketched below.
# We did not try, but if you try that and find a better quality
# setting, feel free to let us know. ;)
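#
# As a starting point, one might pass quality-oriented options to the
# encoder. This is a minimal, unvalidated sketch: ``preset``, ``rc``
# and ``cq`` are options of FFmpeg's ``h264_nvenc`` encoder (see
# ``ffmpeg -h encoder=h264_nvenc``), and the particular values here
# are assumptions for illustration, not settings we have verified::
#
#    w = StreamWriter(io.BytesIO(), format="mp4")
#    w.add_video_stream(
#        **pict_config,
#        encoder="h264_nvenc",
#        encoder_format="yuv444p",
#        encoder_option={"preset": "slow", "rc": "vbr", "cq": "19"},
#    )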
######################################################################
#
# Tag: :obj:`torchaudio.io`
examples/tutorials/online_asr_tutorial.py
View file @
ffeba11a
...
@@ -13,14 +13,11 @@ to perform online speech recognition.
 #
 # .. note::
 #
-#    This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece.
+#    This tutorial requires FFmpeg libraries and SentencePiece.
 #
-#    There are multiple ways to install FFmpeg libraries.
-#    If you are using Anaconda Python distribution,
-#    ``conda install 'ffmpeg<4.4'`` will install
-#    the required FFmpeg libraries.
+#    Please refer to :ref:`Optional Dependencies <optional_dependencies>`
+#    for the detail.
 #
-#    You can install SentencePiece by running ``pip install sentencepiece``.
 
 ######################################################################
 # 1. Overview
...
@@ -45,29 +42,9 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)
 
-try:
-    from torchaudio.io import StreamReader
-except ModuleNotFoundError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install the requisite
-            third party libraries by running the following code block:
-
-            !add-apt-repository -y ppa:savoury1/ffmpeg4
-            !apt-get -qq install -y ffmpeg
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise
-
+######################################################################
+#
 import IPython
+import matplotlib.pyplot as plt
 
+from torchaudio.io import StreamReader
 
 ######################################################################
 # 3. Construct the pipeline
...
@@ -195,22 +172,43 @@ state, hypothesis = None, None
 stream_iterator = streamer.stream()
 
 
+def _plot(feats, num_iter, unit=25):
+    unit_dur = segment_length / sample_rate * unit
+    num_plots = num_iter // unit + (1 if num_iter % unit else 0)
+    fig, axes = plt.subplots(num_plots, 1)
+    t0 = 0
+    for i, ax in enumerate(axes):
+        feats_ = feats[i * unit : (i + 1) * unit]
+        t1 = t0 + segment_length / sample_rate * len(feats_)
+        feats_ = torch.cat([f[2:-2] for f in feats_])  # remove boundary effect and overlap
+        ax.imshow(feats_.T, extent=[t0, t1, 0, 1], aspect="auto", origin="lower")
+        ax.tick_params(which="both", left=False, labelleft=False)
+        ax.set_xlim(t0, t0 + unit_dur)
+        t0 = t1
+    fig.suptitle("MelSpectrogram Feature")
+    plt.tight_layout()
+
+
 @torch.inference_mode()
-def run_inference(num_iter=200):
+def run_inference(num_iter=100):
     global state, hypothesis
     chunks = []
+    feats = []
     for i, (chunk,) in enumerate(stream_iterator, start=1):
         segment = cacher(chunk[:, 0])
         features, length = feature_extractor(segment)
         hypos, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
-        hypothesis = hypos[0]
-        transcript = token_processor(hypothesis[0], lstrip=False)
-        print(transcript, end="", flush=True)
+        hypothesis = hypos
+        transcript = token_processor(hypos[0][0], lstrip=False)
+        print(transcript, end="\r", flush=True)
 
         chunks.append(chunk)
+        feats.append(features)
         if i == num_iter:
             break
+
+    # Plot the features
+    _plot(feats, num_iter)
     return IPython.display.Audio(torch.cat(chunks).T.numpy(), rate=bundle.sample_rate)
...
@@ -249,6 +247,36 @@ run_inference()
 run_inference()
 
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
 ######################################################################
 #
 # Tag: :obj:`torchaudio.io`
examples/tutorials/speech_recognition_pipeline_tutorial.py
View file @
ffeba11a
...
@@ -160,8 +160,7 @@ for i, feats in enumerate(features):
     ax[i].set_title(f"Feature from transformer layer {i + 1}")
     ax[i].set_xlabel("Feature dimension")
     ax[i].set_ylabel("Frame (time-axis)")
-plt.tight_layout()
-plt.show()
+fig.tight_layout()
 
 ######################################################################
...
@@ -190,7 +189,7 @@ plt.imshow(emission[0].cpu().T, interpolation="nearest")
 plt.title("Classification result")
 plt.xlabel("Frame (time-axis)")
 plt.ylabel("Class")
-plt.show()
+plt.tight_layout()
 
 print("Class labels:", bundle.get_labels())
...
examples/tutorials/squim_tutorial.py
0 → 100644
View file @
ffeba11a
"""
Torchaudio-Squim: Non-intrusive Speech Assessment in TorchAudio
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
######################################################################
# Author: `Anurag Kumar <anuragkr90@meta.com>`__, `Zhaoheng
# Ni <zni@meta.com>`__
#
######################################################################
# 1. Overview
# ^^^^^^^^^^^
#
######################################################################
# This tutorial shows how to use Torchaudio-Squim to estimate objective and
# subjective metrics for assessment of speech quality and intelligibility.
#
# TorchAudio-Squim enables speech assessment in Torchaudio. It provides
# interfaces and pre-trained models to estimate various speech quality and
# intelligibility metrics. Currently, Torchaudio-Squim [1] supports
# reference-free estimation of 3 widely used objective metrics:
#
# - Wideband Perceptual Estimation of Speech Quality (PESQ) [2]
#
# - Short-Time Objective Intelligibility (STOI) [3]
#
# - Scale-Invariant Signal-to-Distortion Ratio (SI-SDR) [4]
#
# It also supports estimation of subjective Mean Opinion Score (MOS) for a
# given audio waveform using Non-Matching References [1, 5].
#
# **References**
#
# [1] Kumar, Anurag, et al. “TorchAudio-Squim: Reference-less Speech
# Quality and Intelligibility measures in TorchAudio.” ICASSP 2023-2023
# IEEE International Conference on Acoustics, Speech and Signal Processing
# (ICASSP). IEEE, 2023.
#
# [2] I. Rec, “P.862.2: Wideband extension to recommendation P.862 for the
# assessment of wideband telephone networks and speech codecs,”
# International Telecommunication Union, CH–Geneva, 2005.
#
# [3] Taal, C. H., Hendriks, R. C., Heusdens, R., & Jensen, J. (2010,
# March). A short-time objective intelligibility measure for
# time-frequency weighted noisy speech. In 2010 IEEE international
# conference on acoustics, speech and signal processing (pp. 4214-4217).
# IEEE.
#
# [4] Le Roux, Jonathan, et al. “SDR–half-baked or well done?.” ICASSP
# 2019-2019 IEEE International Conference on Acoustics, Speech and Signal
# Processing (ICASSP). IEEE, 2019.
#
# [5] Manocha, Pranay, and Anurag Kumar. “Speech quality assessment
# through MOS using non-matching references.” Interspeech, 2022.
#
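#
# As a point of reference for the SI-SDR estimates used below, a
# standard definition (following [4]) is
#
# .. math::
#
#    \text{SI-SDR} = 10 \log_{10}
#    \frac{\| \alpha s \|^2}{\| \alpha s - \hat{s} \|^2},
#    \qquad
#    \alpha = \frac{\langle \hat{s}, s \rangle}{\| s \|^2}
#
# where :math:`s` is the reference signal and :math:`\hat{s}` is the
# estimate; the ``si_snr`` helper defined later implements this.
#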
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)
######################################################################
# 2. Preparation
# ^^^^^^^^^^^^^^
#
# First import the modules and define the helper functions.
#
# We will need torch and torchaudio to use Torchaudio-Squim, Matplotlib
# to plot data, and pystoi and pesq for computing reference metrics.
#
try:
    from pesq import pesq
    from pystoi import stoi
    from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
except ImportError:
    try:
        import google.colab  # noqa: F401

        print(
            """
            To enable running this notebook in Google Colab, install nightly
            torch and torchaudio builds by adding the following code block to the top
            of the notebook before running it:

            !pip3 uninstall -y torch torchvision torchaudio
            !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
            !pip3 install pesq
            !pip3 install pystoi
            """
        )
    except Exception:
        pass
    raise

import matplotlib.pyplot as plt
######################################################################
#
#
import torchaudio.functional as F
from IPython.display import Audio
from torchaudio.utils import download_asset


def si_snr(estimate, reference, epsilon=1e-8):
    estimate = estimate - estimate.mean()
    reference = reference - reference.mean()
    reference_pow = reference.pow(2).mean(axis=1, keepdim=True)
    mix_pow = (estimate * reference).mean(axis=1, keepdim=True)
    scale = mix_pow / (reference_pow + epsilon)

    reference = scale * reference
    error = estimate - reference

    reference_pow = reference.pow(2)
    error_pow = error.pow(2)

    reference_pow = reference_pow.mean(axis=1)
    error_pow = error_pow.mean(axis=1)

    si_snr = 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow)
    return si_snr.item()
def plot(waveform, title, sample_rate=16000):
    wav_numpy = waveform.numpy()

    sample_size = waveform.shape[1]
    time_axis = torch.arange(0, sample_size) / sample_rate

    figure, axes = plt.subplots(2, 1)
    axes[0].plot(time_axis, wav_numpy[0], linewidth=1)
    axes[0].grid(True)
    axes[1].specgram(wav_numpy[0], Fs=sample_rate)
    figure.suptitle(title)
######################################################################
# 3. Load Speech and Noise Sample
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav")

######################################################################
#
#
WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(SAMPLE_SPEECH)
WAVEFORM_NOISE, SAMPLE_RATE_NOISE = torchaudio.load(SAMPLE_NOISE)
WAVEFORM_NOISE = WAVEFORM_NOISE[0:1, :]
######################################################################
# Currently, the Torchaudio-Squim model only supports a 16000 Hz
# sampling rate. Resample the waveforms if necessary.
#
if SAMPLE_RATE_SPEECH != 16000:
    WAVEFORM_SPEECH = F.resample(WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH, 16000)

if SAMPLE_RATE_NOISE != 16000:
    WAVEFORM_NOISE = F.resample(WAVEFORM_NOISE, SAMPLE_RATE_NOISE, 16000)
######################################################################
# Trim waveforms so that they have the same number of frames.
#
if WAVEFORM_SPEECH.shape[1] < WAVEFORM_NOISE.shape[1]:
    WAVEFORM_NOISE = WAVEFORM_NOISE[:, : WAVEFORM_SPEECH.shape[1]]
else:
    WAVEFORM_SPEECH = WAVEFORM_SPEECH[:, : WAVEFORM_NOISE.shape[1]]
######################################################################
# Play speech sample
#
Audio(WAVEFORM_SPEECH.numpy()[0], rate=16000)

######################################################################
# Play noise sample
#
Audio(WAVEFORM_NOISE.numpy()[0], rate=16000)
######################################################################
# 4. Create distorted (noisy) speech samples
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
snr_dbs = torch.tensor([20, -5])
WAVEFORM_DISTORTED = F.add_noise(WAVEFORM_SPEECH, WAVEFORM_NOISE, snr_dbs)

######################################################################
# Play distorted speech with 20dB SNR
#
Audio(WAVEFORM_DISTORTED.numpy()[0], rate=16000)

######################################################################
# Play distorted speech with -5dB SNR
#
Audio(WAVEFORM_DISTORTED.numpy()[1], rate=16000)
######################################################################
# 5. Visualize the waveforms
# ^^^^^^^^^^^^^^^^^^^^^^^^^^
#

######################################################################
# Visualize speech sample
#
plot(WAVEFORM_SPEECH, "Clean Speech")

######################################################################
# Visualize noise sample
#
plot(WAVEFORM_NOISE, "Noise")

######################################################################
# Visualize distorted speech with 20dB SNR
#
plot(WAVEFORM_DISTORTED[0:1], f"Distorted Speech with {snr_dbs[0]}dB SNR")

######################################################################
# Visualize distorted speech with -5dB SNR
#
plot(WAVEFORM_DISTORTED[1:2], f"Distorted Speech with {snr_dbs[1]}dB SNR")
######################################################################
# 6. Predict Objective Metrics
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#

######################################################################
# Get the pre-trained ``SquimObjective`` model.
#
objective_model = SQUIM_OBJECTIVE.get_model()

######################################################################
# Compare model outputs with ground truths for distorted speech with
# 20dB SNR
#
stoi_hyp, pesq_hyp, si_sdr_hyp = objective_model(WAVEFORM_DISTORTED[0:1, :])
print(f"Estimated metrics for distorted speech at {snr_dbs[0]}dB are\n")
print(f"STOI: {stoi_hyp[0]}")
print(f"PESQ: {pesq_hyp[0]}")
print(f"SI-SDR: {si_sdr_hyp[0]}\n")

pesq_ref = pesq(16000, WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[0].numpy(), mode="wb")
stoi_ref = stoi(WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[0].numpy(), 16000, extended=False)
si_sdr_ref = si_snr(WAVEFORM_DISTORTED[0:1], WAVEFORM_SPEECH)
print(f"Reference metrics for distorted speech at {snr_dbs[0]}dB are\n")
print(f"STOI: {stoi_ref}")
print(f"PESQ: {pesq_ref}")
print(f"SI-SDR: {si_sdr_ref}")
######################################################################
# Compare model outputs with ground truths for distorted speech with
# -5dB SNR
#
stoi_hyp, pesq_hyp, si_sdr_hyp = objective_model(WAVEFORM_DISTORTED[1:2, :])
print(f"Estimated metrics for distorted speech at {snr_dbs[1]}dB are\n")
print(f"STOI: {stoi_hyp[0]}")
print(f"PESQ: {pesq_hyp[0]}")
print(f"SI-SDR: {si_sdr_hyp[0]}\n")

pesq_ref = pesq(16000, WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[1].numpy(), mode="wb")
stoi_ref = stoi(WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[1].numpy(), 16000, extended=False)
si_sdr_ref = si_snr(WAVEFORM_DISTORTED[1:2], WAVEFORM_SPEECH)
print(f"Reference metrics for distorted speech at {snr_dbs[1]}dB are\n")
print(f"STOI: {stoi_ref}")
print(f"PESQ: {pesq_ref}")
print(f"SI-SDR: {si_sdr_ref}")
######################################################################
# 7. Predict Mean Opinion Scores (Subjective) Metric
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#

######################################################################
# Get the pre-trained ``SquimSubjective`` model.
#
subjective_model = SQUIM_SUBJECTIVE.get_model()

######################################################################
# Load a non-matching reference (NMR)
#
NMR_SPEECH = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")

WAVEFORM_NMR, SAMPLE_RATE_NMR = torchaudio.load(NMR_SPEECH)
if SAMPLE_RATE_NMR != 16000:
    WAVEFORM_NMR = F.resample(WAVEFORM_NMR, SAMPLE_RATE_NMR, 16000)

######################################################################
# Compute MOS metric for distorted speech with 20dB SNR
#
mos = subjective_model(WAVEFORM_DISTORTED[0:1, :], WAVEFORM_NMR)
print(f"Estimated MOS for distorted speech at {snr_dbs[0]}dB is MOS: {mos[0]}")

######################################################################
# Compute MOS metric for distorted speech with -5dB SNR
#
mos = subjective_model(WAVEFORM_DISTORTED[1:2, :], WAVEFORM_NMR)
print(f"Estimated MOS for distorted speech at {snr_dbs[1]}dB is MOS: {mos[0]}")
######################################################################
# 8. Comparison with ground truths and baselines
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Visualizing the metrics estimated by the ``SquimObjective`` and
# ``SquimSubjective`` models can help users better understand how the
# models can be applied in real scenarios. The graph below shows scatter
# plots of three different systems: MOSA-Net [1], AMSA [2], and the
# ``SquimObjective`` model, where y axis represents the estimated STOI,
# PESQ, and Si-SDR scores, and x axis represents the corresponding ground
# truth.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/objective_plot.png
# :width: 500px
# :align: center
#
# [1] Zezario, Ryandhimas E., Szu-Wei Fu, Fei Chen, Chiou-Shann Fuh,
# Hsin-Min Wang, and Yu Tsao. “Deep learning-based non-intrusive
# multi-objective speech assessment model with cross-domain features.”
# IEEE/ACM Transactions on Audio, Speech, and Language Processing 31
# (2022): 54-70.
#
# [2] Dong, Xuan, and Donald S. Williamson. “An attention enhanced
# multi-task model for objective speech assessment in real-world
# environments.” In ICASSP 2020-2020 IEEE International Conference on
# Acoustics, Speech and Signal Processing (ICASSP), pp. 911-915. IEEE,
# 2020.
#
######################################################################
# The graph below shows scatter plot of the ``SquimSubjective`` model,
# where y axis represents the estimated MOS metric score, and x axis
# represents the corresponding ground truth.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/subjective_plot.png
# :width: 500px
# :align: center
#
examples/tutorials/streamreader_advanced_tutorial.py
View file @
ffeba11a
...
@@ -20,35 +20,15 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)
 
-try:
-    from torchaudio.io import StreamReader
-except ModuleNotFoundError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install the requisite
-            third party libraries by running the following code:
-
-            !add-apt-repository -y ppa:savoury1/ffmpeg4
-            !apt-get -qq install -y ffmpeg
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise
-
+######################################################################
+#
 import IPython
 import matplotlib.pyplot as plt
 
+from torchaudio.io import StreamReader
+
 base_url = "https://download.pytorch.org/torchaudio/tutorial-assets"
 AUDIO_URL = f"{base_url}/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
 VIDEO_URL = f"{base_url}/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4.mp4"
 
 ######################################################################
 # Audio / Video device input
 # --------------------------
...
@@ -122,6 +102,9 @@ VIDEO_URL = f"{base_url}/stream-api/NASAs_Most_Scientifically_Complex_Space_Obse
 #
 
 ######################################################################
+#
+# .. _lavfi:
+#
 # Synthetic source streams
 # ------------------------
 #
...
@@ -372,13 +355,14 @@ chunks = next(streamer.stream())
 def _display(i):
     print("filter_desc:", streamer.get_out_stream_info(i).filter_description)
-    _, axs = plt.subplots(2, 1)
+    fig, axs = plt.subplots(2, 1)
     waveform = chunks[i][:, 0]
     axs[0].plot(waveform)
     axs[0].grid(True)
     axs[0].set_ylim([-1, 1])
     plt.setp(axs[0].get_xticklabels(), visible=False)
     axs[1].specgram(waveform, Fs=sample_rate)
+    fig.tight_layout()
     return IPython.display.Audio(chunks[i].T, rate=sample_rate)
...
@@ -457,7 +441,6 @@ def _display(i):
         axs[j].imshow(chunk[10 * j + 1].permute(1, 2, 0))
         axs[j].set_axis_off()
     plt.tight_layout()
-    plt.show(block=False)
 
 ######################################################################
...
examples/tutorials/streamreader_basic_tutorial.py
View file @
ffeba11a
...
@@ -14,12 +14,9 @@ libavfilter provides.
 #
 # .. note::
 #
-#    This tutorial requires FFmpeg libraries (>=4.1, <4.4).
+#    This tutorial requires FFmpeg libraries.
 #
-#    There are multiple ways to install FFmpeg libraries.
-#    If you are using Anaconda Python distribution,
-#    ``conda install -c anaconda 'ffmpeg<4.4'`` will install
-#    the required libraries.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
 #
 
 ######################################################################
...
@@ -65,29 +62,8 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)
 
-try:
-    from torchaudio.io import StreamReader
-except ModuleNotFoundError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install the requisite
-            third party libraries by running the following code:
-
-            !add-apt-repository -y ppa:savoury1/ffmpeg4
-            !apt-get -qq install -y ffmpeg
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise
-
+######################################################################
+#
 import matplotlib.pyplot as plt
 
+from torchaudio.io import StreamReader
+
 base_url = "https://download.pytorch.org/torchaudio/tutorial-assets"
 AUDIO_URL = f"{base_url}/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
...
@@ -613,7 +589,6 @@ for i, vid in enumerate(vids2):
         if i == 0 and j == 0:
             ax.set_ylabel("Stream 2")
 plt.tight_layout()
-plt.show(block=False)
 
 ######################################################################
 #
...
examples/tutorials/streamwriter_advanced.py
View file @
ffeba11a
...
@@ -23,17 +23,9 @@ play audio and video.
 #
 # .. note::
 #
-#    This tutorial requires torchaudio nightly build and FFmpeg libraries (>=4.1, <4.4).
+#    This tutorial requires FFmpeg libraries.
 #
-#    To install torchaudio nightly build, please refer to
-#    https://pytorch.org/get-started/locally/ .
-#
-#    There are multiple ways to install FFmpeg libraries.
-#    If you are using Anaconda Python distribution,
-#    ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries,
-#    however, this distribution does not have SDL plugin, so it cannot play
-#    video.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
 #
 
 ######################################################################
...
@@ -74,7 +66,9 @@ from torchaudio.io import StreamWriter
 from torchaudio.utils import download_asset
 
 AUDIO_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
-VIDEO_PATH = download_asset("tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4")
+VIDEO_PATH = download_asset(
+    "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
+)
 
 ######################################################################
 #
...
@@ -140,7 +134,7 @@ s.add_audio_stream(sample_rate, num_channels, format="s16")
 # Write audio to the device
 with s.open():
     for i in range(0, num_frames, 256):
-        s.write_audio_chunk(0, waveform[i:i + 256])
+        s.write_audio_chunk(0, waveform[i : i + 256])
 
 ######################################################################
 #
...
@@ -186,8 +180,12 @@ width, height = 640, 360
 # a background thread and give chunks
 
 running = True
 
 
 def video_streamer(path, frames_per_chunk):
-    import queue, threading
+    import queue
+    import threading
 
     from torchaudio.io import StreamReader
 
     q = queue.Queue()
...
@@ -196,9 +194,9 @@ def video_streamer(path, frames_per_chunk):
     def _streamer():
         streamer = StreamReader(path)
         streamer.add_basic_video_stream(
             frames_per_chunk, format="rgb24",
             frame_rate=frame_rate, width=width, height=height
         )
-        for (chunk_, ) in streamer.stream():
+        for (chunk_,) in streamer.stream():
             q.put(chunk_)
             if not running:
                 break
...
@@ -246,7 +244,7 @@ with s.open():
 #    <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-sdl-demo.mp4">
 #    </video>
 #
-# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/sdl.py>`_]
+# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/sdl.py>`__]
 #
 
 ######################################################################
...
@@ -292,7 +290,7 @@ with s.open():
 #    <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-rtmp-demo.mp4">
 #    </video>
 #
-# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/rtmp.py>`_]
+# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/rtmp.py>`__]
 #
...
@@ -324,7 +322,7 @@ with s.open():
 #    <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-udp-demo.mp4">
 #    </video>
 #
-# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/udp.py>`_]
+# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/udp.py>`__]
 #
 
 ######################################################################
...
examples/tutorials/streamwriter_basic_tutorial.py
View file @
ffeba11a
...
@@ -13,14 +13,9 @@ encode and save audio/video data into various formats/destinations.
 #
 # .. note::
 #
-#    This tutorial requires torchaudio nightly build and FFmpeg libraries (>=4.1, <4.4).
+#    This tutorial requires FFmpeg libraries.
 #
-#    To install torchaudio nightly build, please refer to
-#    https://pytorch.org/get-started/locally/ .
-#
-#    There are multiple ways to install FFmpeg libraries.
-#    If you are using Anaconda Python distribution,
-#    ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
 #
 
 ######################################################################
...
@@ -51,27 +46,7 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)
 
-try:
-    from torchaudio.io import StreamWriter
-except ImportError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install nightly
-            torch and torchaudio builds by adding the following code block to the top
-            of the notebook before running it:
-
-            !pip3 uninstall -y torch torchvision torchaudio
-            !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise
+######################################################################
+#
+from torchaudio.io import StreamWriter
 
 print("FFmpeg library versions")
 for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items():
...
@@ -84,9 +59,10 @@ import io
 import os
 import tempfile
 
-from torchaudio.utils import download_asset
 from IPython.display import Audio, Video
+from torchaudio.utils import download_asset
 
 SAMPLE_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
 WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_PATH, channels_first=False)
 NUM_FRAMES, NUM_CHANNELS = WAVEFORM.shape
...
@@ -503,47 +479,7 @@ print(f"{bytes2[:10]}...{bytes2[-10:]}\n")
 assert bytes1 == bytes2
 
 ######################################################################
-# Note on slicing and AAC
-# ~~~~~~~~~~~~~~~~~~~~~~~
-#
-# .. warning::
-#
-#    FFmpeg's native AAC encoder (which is used by default when
-#    saving video with MP4 format) has a bug that affects the audibility.
-#
-#    Please refer to the examples below.
-#
-
-
-def test_slice(audio_encoder, slice_size, ext="mp4"):
-    path = get_path(f"slice_{slice_size}.{ext}")
-    s = StreamWriter(dst=path)
-    s.add_audio_stream(SAMPLE_RATE, NUM_CHANNELS, encoder=audio_encoder)
-    with s.open():
-        for start in range(0, NUM_FRAMES, slice_size):
-            end = start + slice_size
-            s.write_audio_chunk(0, WAVEFORM[start:end, ...])
-    return path
-
-
-######################################################################
-#
-# This causes some artifacts.
-# note:
-# Chrome does not support playing AAC audio directly while Safari does.
-# Using MP4 container and specifying AAC allows Chrome to play it.
-Video(test_slice(audio_encoder="aac", slice_size=8000, ext="mp4"), embed=True)
-
-######################################################################
-#
-# It is more noticeable when using smaller slice.
-Video(test_slice(audio_encoder="aac", slice_size=512, ext="mp4"), embed=True)
-
-######################################################################
-#
-# Lame MP3 encoder works fine for the same slice size.
-Audio(test_slice(audio_encoder="libmp3lame", slice_size=512, ext="mp3"))
+import matplotlib.pyplot as plt
 
 ######################################################################
 #
...
@@ -559,7 +495,6 @@ Audio(test_slice(audio_encoder="libmp3lame", slice_size=512, ext="mp3"))
 # then use StreamWriter to convert them to video with the original audio.
 
 import torchaudio.transforms as T
-import matplotlib.pyplot as plt
 
 ######################################################################
 #
...
@@ -590,7 +525,7 @@ specs = trans(WAVEFORM.T)[0].T
 #
 spec_db = T.AmplitudeToDB(stype="magnitude", top_db=80)(specs.T)
-_ = plt.imshow(spec_db, aspect="auto", origin='lower')
+_ = plt.imshow(spec_db, aspect="auto", origin="lower")
 
 ######################################################################
 #
...
@@ -611,21 +546,27 @@ ncols, nrows = fig.canvas.get_width_height()
 def _plot(data):
     ax.clear()
     x = list(range(len(data)))
     R, G, B = 238 / 255, 76 / 255, 44 / 255
     for coeff, alpha in [(0.8, 0.7), (1, 1)]:
         d = data**coeff
         ax.fill_between(x, d, -d, color=[R, G, B, alpha])
     xlim = n_fft // 2 + 1
     ax.set_xlim([-1, n_fft // 2 + 1])
     ax.set_ylim([-1, 1])
     ax.text(
         xlim,
         0.95,
         f"Created with TorchAudio\n{torchaudio.__version__}",
-        color="white", ha="right", va="top", backgroundcolor="black")
+        color="white",
+        ha="right",
+        va="top",
+        backgroundcolor="black",
+    )
     fig.canvas.draw()
     frame = torch.frombuffer(fig.canvas.tostring_rgb(), dtype=torch.uint8)
     return frame.reshape(nrows, ncols, 3).permute(2, 0, 1)
 
 
 # sphinx_gallery_defer_figures

######################################################################
...
@@ -646,10 +587,10 @@ with s.open():
     # Process by second
     for t in range(0, NUM_FRAMES, SAMPLE_RATE):
         # Write audio chunk
         s.write_audio_chunk(0, WAVEFORM[t : t + SAMPLE_RATE, :])
 
         # write 1 second of video chunk
-        frames = [_plot(spec) for spec in specs[i:i + frame_rate]]
+        frames = [_plot(spec) for spec in specs[i : i + frame_rate]]
         if frames:
             s.write_video_chunk(1, torch.stack(frames))
         i += frame_rate
...
examples/tutorials/tacotron2_pipeline_tutorial.py
View file @
ffeba11a
...
@@ -7,10 +7,6 @@ Text-to-Speech with Tacotron2
 """
 
-import IPython
-import matplotlib
-import matplotlib.pyplot as plt
-
 ######################################################################
 # Overview
 # --------
...
@@ -65,8 +61,6 @@ import matplotlib.pyplot as plt
 import torch
 import torchaudio
 
-matplotlib.rcParams["figure.figsize"] = [16.0, 4.8]
-
 torch.random.manual_seed(0)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
...
@@ -75,6 +69,13 @@ print(torchaudio.__version__)
 print(device)
 
+
+######################################################################
+#
+import IPython
+import matplotlib.pyplot as plt
+
 ######################################################################
 # Text Processing
 # ---------------
...
@@ -218,7 +219,7 @@ with torch.inference_mode():
     spec, _, _ = tacotron2.infer(processed, lengths)
 
-_ = plt.imshow(spec[0].cpu().detach())
+_ = plt.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
 
 ######################################################################
...
@@ -226,13 +227,17 @@ _ = plt.imshow(spec[0].cpu().detach())
 # therefore, the process of generating the spectrogram incurs randomness.
 #
 
-fig, ax = plt.subplots(3, 1, figsize=(16, 4.3 * 3))
-for i in range(3):
-    with torch.inference_mode():
-        spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
-    print(spec[0].shape)
-    ax[i].imshow(spec[0].cpu().detach())
-plt.show()
+
+def plot():
+    fig, ax = plt.subplots(3, 1)
+    for i in range(3):
+        with torch.inference_mode():
+            spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
+        print(spec[0].shape)
+        ax[i].imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
+
+
+plot()
 
 ######################################################################
...
@@ -270,11 +275,22 @@ with torch.inference_mode():
     spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
     waveforms, lengths = vocoder(spec, spec_lengths)
 
-fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
-ax1.imshow(spec[0].cpu().detach())
-ax2.plot(waveforms[0].cpu().detach())
-
-IPython.display.Audio(waveforms[0:1].cpu(), rate=vocoder.sample_rate)
+
+######################################################################
+#
+def plot(waveforms, spec, sample_rate):
+    waveforms = waveforms.cpu().detach()
+
+    fig, [ax1, ax2] = plt.subplots(2, 1)
+    ax1.plot(waveforms[0])
+    ax1.set_xlim(0, waveforms.size(-1))
+    ax1.grid(True)
+    ax2.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
+    return IPython.display.Audio(waveforms[0:1], rate=sample_rate)
+
+
+plot(waveforms, spec, vocoder.sample_rate)
 
 ######################################################################
...
@@ -300,11 +316,10 @@ with torch.inference_mode():
     spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
     waveforms, lengths = vocoder(spec, spec_lengths)
 
-fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
-ax1.imshow(spec[0].cpu().detach())
-ax2.plot(waveforms[0].cpu().detach())
-IPython.display.Audio(waveforms[0:1].cpu(), rate=vocoder.sample_rate)
+######################################################################
+#
+plot(waveforms, spec, vocoder.sample_rate)
 
 ######################################################################
...
@@ -339,8 +354,7 @@ waveglow.eval()
 with torch.no_grad():
     waveforms = waveglow.infer(spec)
 
-fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
-ax1.imshow(spec[0].cpu().detach())
-ax2.plot(waveforms[0].cpu().detach())
-IPython.display.Audio(waveforms[0:1].cpu(), rate=22050)
+######################################################################
+#
+plot(waveforms, spec, 22050)
packaging/torchaudio/meta.yaml
View file @
ffeba11a
...
@@ -14,11 +14,8 @@ requirements:
   host:
     - python
     - setuptools
-    - pkg-config  # [not win]
     - cmake
    - ninja
-    - numpy>=1.11    # [py <= 39]
-    - numpy>=1.21.2  # [py >= 310]
     - pytorch-mutex 1.0 {{ build_variant }}  # [not osx ]
     {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT', 'pytorch') }}
     {{ environ.get('CONDA_EXTRA_BUILD_CONSTRAINT', '') }}
...
@@ -26,8 +23,7 @@ requirements:
   run:
     - python
-    - numpy>=1.11    # [py <= 39]
-    - numpy>=1.21.2  # [py >= 310]
+    - numpy
     - pytorch-mutex 1.0 {{ build_variant }}  # [not osx ]
     {{ environ.get('CONDA_PYTORCH_CONSTRAINT', 'pytorch') }}
     {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }}
...
@@ -49,7 +45,6 @@ build:
     - TORCH_CUDA_ARCH_LIST
     - USE_FFMPEG
     - USE_OPENMP
-    - FFMPEG_ROOT
     - MACOSX_DEPLOYMENT_TARGET
 
 test:
...
packaging/vs2019/conda_build_config.yaml
View file @ ffeba11a

 blas_impl:
   - mkl        # [x86_64]
 c_compiler:
   - vs2019     # [win]
 cxx_compiler:
   - vs2019     # [win]
 python:
-  - 3.7
+  - 3.8
 # This differs from target_platform in that it determines what subdir the compiler
 # will target, not what subdir the compiler package will be itself.
 # For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
...
packaging/windows/internal/cuda_install.bat
View file @ ffeba11a

@@ -23,25 +23,25 @@ set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR%
 set CUDNN_FOLDER="cuda"
 set CUDNN_LIB_FOLDER="lib\x64"
 
-if %CUDA_VER% EQU 116 goto cuda116
+if %CUDA_VER% EQU 118 goto cuda118
-if %CUDA_VER% EQU 117 goto cuda117
+if %CUDA_VER% EQU 121 goto cuda121
 
 echo CUDA %CUDA_VERSION_STR% is not supported
 exit /b 1
 
-:cuda116
+:cuda118
-set CUDA_INSTALL_EXE=cuda_11.6.0_511.23_windows.exe
+set CUDA_INSTALL_EXE=cuda_11.8.0_522.06_windows.exe
 if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
     if errorlevel 1 exit /b 1
     set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 nvml_dev_11.6"
+    set "ARGS=cuda_profiler_api_11.8 thrust_11.8 nvcc_11.8 cuobjdump_11.8 nvprune_11.8 nvprof_11.8 cupti_11.8 cublas_11.8 cublas_dev_11.8 cudart_11.8 cufft_11.8 cufft_dev_11.8 curand_11.8 curand_dev_11.8 cusolver_11.8 cusolver_dev_11.8 cusparse_11.8 cusparse_dev_11.8 npp_11.8 npp_dev_11.8 nvrtc_11.8 nvrtc_dev_11.8 nvml_dev_11.8"
 )
 
-set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip
-set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive
+set CUDNN_FOLDER=cudnn-windows-x86_64-8.7.0.84_cuda11-archive
 set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
     curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
     if errorlevel 1 exit /b 1
...
@@ -55,23 +55,23 @@ if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
 goto cuda_common
 
-:cuda117
+:cuda121
-set CUDA_INSTALL_EXE=cuda_11.7.0_516.01_windows.exe
+set CUDA_INSTALL_EXE=cuda_12.1.1_531.14_windows.exe
 if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
     if errorlevel 1 exit /b 1
     set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=thrust_11.7 nvcc_11.7 cuobjdump_11.7 nvprune_11.7 nvprof_11.7 cupti_11.7 cublas_11.7 cublas_dev_11.7 cudart_11.7 cufft_11.7 cufft_dev_11.7 curand_11.7 curand_dev_11.7 cusolver_11.7 cusolver_dev_11.7 cusparse_11.7 cusparse_dev_11.7 npp_11.7 npp_dev_11.7 nvrtc_11.7 nvrtc_dev_11.7 nvml_dev_11.7"
+    set "ARGS=cuda_profiler_api_12.1 thrust_12.1 nvcc_12.1 cuobjdump_12.1 nvprune_12.1 nvprof_12.1 cupti_12.1 cublas_12.1 cublas_dev_12.1 cudart_12.1 cufft_12.1 cufft_dev_12.1 curand_12.1 curand_dev_12.1 cusolver_12.1 cusolver_dev_12.1 cusparse_12.1 cusparse_dev_12.1 npp_12.1 npp_dev_12.1 nvrtc_12.1 nvrtc_dev_12.1 nvml_dev_12.1"
 )
 
-set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip
-set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive
+set CUDNN_FOLDER=cudnn-windows-x86_64-8.8.1.3_cuda12-archive
 set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
     curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
     if errorlevel 1 exit /b 1
 set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
 
 rem Make sure windows path contains zlib dll
 curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
...
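
These hunks move the Windows CI toolchain from CUDA 11.6/11.7 with cuDNN 8.3.2 to CUDA 11.8/12.1 with cuDNN 8.7.0/8.8.1 (note that 11.8 also needs the new `cuda_profiler_api` component). As a hedged sanity check, a build produced against the upgraded toolkit can be inspected from Python; the printed values in the comments are what one would expect after this change, not captured output:

import torch

# CUDA toolkit the wheel was compiled against, e.g. "11.8" or "12.1"
print(torch.version.cuda)
# cuDNN version as an integer, e.g. 8700 for 8.7.0 or 8801 for 8.8.1
print(torch.backends.cudnn.version())
# Whether a GPU is usable at runtime; requires a driver at least as new as
# the ones bundled in the installers above (522.06 / 531.14)
print(torch.cuda.is_available())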
pyproject.toml
View file @ ffeba11a

@@ -5,6 +5,9 @@ first_party_detection = false
 [tool.black]
 line-length = 120
-target-version = ["py37"]
+target-version = ["py38"]
 
 [tool.ufmt]
+excludes = [
+    "examples/tutorials/",
+]
setup.py
View file @ ffeba11a

@@ -4,7 +4,6 @@ import os
 import re
 import shutil
 import subprocess
-import sys
 from pathlib import Path
 
 import torch
...
@@ -22,13 +21,14 @@ def _run_cmd(cmd, shell=False):
     return None
 
 
 def _get_version(sha):
     with open(ROOT_DIR / "version.txt", "r") as f:
         version = f.read().strip()
     if os.getenv("BUILD_VERSION"):
         version = os.getenv("BUILD_VERSION")
     elif sha is not None:
-        version += "+" + sha[:7]
+        version += "+das" + "." + "opt1"
     return version
...
@@ -36,12 +36,14 @@ def _make_version_file(version, sha):
     sha = "Unknown" if sha is None else sha
     abi = _run_cmd(["echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI | awk '{print $3}'"], shell=True)
     dtk = _run_cmd(["cat", os.path.join(ROCM_HOME, '.info/rocm_version')])
-    dtk = ''.join(dtk.split('.')[:2])
+    dtk = ''.join(dtk.split('.')[:2]) + "2"
     torch_version = torch.__version__
-    dcu_version = f"{version}.abi{abi}.dtk{dtk}.torch{torch_version}"
+    dcu_version = f"{version}.dtk{dtk}"
     version_path = ROOT_DIR / "torchaudio" / "version.py"
+    version_write = version[:-9]
     with open(version_path, "w") as f:
-        f.write(f"__version__ = '{version}'\n")
+        f.write(f"__version__ = '{version_write}'\n")
         f.write(f"git_version = '{sha}'\n")
         f.write(f"abi = 'abi{abi}'\n")
         f.write(f"dtk = '{dtk}'\n")
...
@@ -50,7 +52,6 @@ def _make_version_file(version, sha):
     return dcu_version
 
 
 def _get_pytorch_version():
     if "PYTORCH_VERSION" in os.environ:
         return f"torch=={os.environ['PYTORCH_VERSION']}"
...
@@ -95,18 +96,6 @@ def _get_packages(branch_name, tag):
     return find_packages(exclude=exclude)
 
 
-def _init_submodule():
-    print(" --- Initializing submodules")
-    try:
-        subprocess.check_call(["git", "submodule", "init"])
-        subprocess.check_call(["git", "submodule", "update"])
-    except Exception:
-        print(" --- Submodule initalization failed")
-        print("Please run:\n\tgit submodule update --init --recursive")
-        sys.exit(1)
-    print(" --- Initialized submodule")
-
-
 def _parse_url(path):
     with open(path, "r") as file_:
         for line in file_:
...
@@ -116,18 +105,6 @@ def _parse_url(path):
             yield url
 
 
-def _parse_sources():
-    third_party_dir = ROOT_DIR / "third_party"
-    libs = ["zlib", "bzip2", "lzma", "sox"]
-    archive_dir = third_party_dir / "archives"
-    archive_dir.mkdir(exist_ok=True)
-    for lib in libs:
-        cmake_file = third_party_dir / lib / "CMakeLists.txt"
-        for url in _parse_url(cmake_file):
-            path = archive_dir / os.path.basename(url)
-            yield path, url
-
-
 def _fetch_archives(src):
     for dest, url in src:
         if not dest.exists():
...
@@ -135,12 +112,6 @@ def _fetch_archives(src):
             torch.hub.download_url_to_file(url, dest, progress=False)
 
 
-def _fetch_third_party_libraries():
-    _init_submodule()
-    if os.name != "nt":
-        _fetch_archives(_parse_sources())
-
-
 def _main():
     sha = _run_cmd(["git", "rev-parse", "HEAD"])
     branch = _run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])
...
@@ -154,14 +125,21 @@ def _main():
     print("-- Building version", version)
     dcu_version = _make_version_file(version, sha)
-    _fetch_third_party_libraries()
+
+    with open("README.md") as f:
+        long_description = f.read()
 
     setup(
         name="torchaudio",
         version=dcu_version,
         description="An audio package for PyTorch",
+        long_description=long_description,
+        long_description_content_type="text/markdown",
         url="https://github.com/pytorch/audio",
-        author="Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang",
+        author=(
+            "Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, "
+            "Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang"
+        ),
         author_email="soumith@pytorch.org",
         maintainer="Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang",
         maintainer_email="moto@meta.com",
...
@@ -174,9 +152,10 @@ def _main():
             "Operating System :: Microsoft :: Windows",
             "Operating System :: POSIX",
             "Programming Language :: C++",
-            "Programming Language :: Python :: 3.7",
             "Programming Language :: Python :: 3.8",
             "Programming Language :: Python :: 3.9",
+            "Programming Language :: Python :: 3.10",
+            "Programming Language :: Python :: 3.11",
             "Programming Language :: Python :: Implementation :: CPython",
             "Topic :: Multimedia :: Sound/Audio",
             "Topic :: Scientific/Engineering :: Artificial Intelligence",
...
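
The net effect of the `_get_version` / `_make_version_file` changes on the version strings is easier to see in isolation. A minimal sketch mirroring the string logic of this fork; the `version.txt` contents and ROCm version are made-up examples:

# Hypothetical inputs
version_txt = "2.0.1"        # example contents of version.txt
rocm_version = "24.04.1"     # example contents of ROCM_HOME/.info/rocm_version

# _get_version: the local suffix is now the fixed "+das.opt1" instead of
# "+" plus the 7-character git SHA prefix.
version = version_txt + "+das" + "." + "opt1"      # -> "2.0.1+das.opt1"

# _make_version_file: dtk keeps the first two dotted fields plus a literal "2".
dtk = "".join(rocm_version.split(".")[:2]) + "2"   # -> "24042"

# The package version handed to setup() drops the abi/torch components.
dcu_version = f"{version}.dtk{dtk}"                # -> "2.0.1+das.opt1.dtk24042"

# torchaudio/version.py gets the base version back: the suffix added in
# _get_version is exactly 9 characters, which is what version[:-9] strips.
version_write = version[:-9]                       # -> "2.0.1"

Note that `version[:-9]` silently depends on the suffix staying exactly nine characters long; if `BUILD_VERSION` is set instead, the slice would truncate the real version.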
test/integration_tests/conftest.py
View file @ ffeba11a

@@ -102,7 +102,7 @@ def pytest_addoption(parser):
 @pytest.fixture(autouse=True)
 def temp_hub_dir(tmp_path, pytestconfig):
-    if not pytestconfig.getoption("use_tmp_hub_dir"):
+    if not pytestconfig.getoption("use_tmp_hub_dir", default=False):
         yield
     else:
         org_dir = torch.hub.get_dir()
...
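
The one-line change matters when this autouse fixture runs in a session where `pytest_addoption` never registered `--use-tmp-hub-dir`: pytest's `Config.getoption` raises `ValueError` for an unknown option unless a default is supplied. A sketch of the pattern, with the restore step an assumption about the body elided above:

import pytest
import torch


@pytest.fixture(autouse=True)
def temp_hub_dir(tmp_path, pytestconfig):
    # default=False degrades gracefully instead of raising ValueError
    # when the option was never registered.
    if not pytestconfig.getoption("use_tmp_hub_dir", default=False):
        yield
    else:
        # Redirect torch.hub downloads to a per-test directory, then restore.
        org_dir = torch.hub.get_dir()
        torch.hub.set_dir(str(tmp_path))
        yield
        torch.hub.set_dir(org_dir)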
test/integration_tests/prototype/hifi_gan_pipeline_test.py
0 → 100644
View file @ ffeba11a

import math

import torch
import torchaudio
from torchaudio.prototype.functional import oscillator_bank
from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH


def test_hifi_gan_pretrained_weights():
    """Test that a waveform reconstructed from a mel spectrogram by the HiFiGAN bundle is close enough to the original.

    The main transformations performed in this test can be represented as
        - audio -> reference log mel spectrogram
        - audio -> mel spectrogram -> audio -> estimated log mel spectrogram
    In the end, we compare the estimated log mel spectrogram to the reference one. See comments in code for details.
    """
    bundle = HIFIGAN_VOCODER_V3_LJSPEECH
    # Get HiFiGAN-compatible transformation from waveform to mel spectrogram
    mel_transform = bundle.get_mel_transform()
    # Get HiFiGAN vocoder
    vocoder = bundle.get_vocoder()
    # Create a synthetic waveform, trimmed to a whole number of hops
    ref_waveform = get_sin_sweep(sample_rate=bundle.sample_rate, length=100000)
    ref_waveform = ref_waveform[:, : -(ref_waveform.shape[1] % mel_transform.hop_size)]
    # Generate mel spectrogram from waveform
    mel_spectrogram = mel_transform(ref_waveform)

    with torch.no_grad():
        # Generate waveform from mel spectrogram
        estimated_waveform = vocoder(mel_spectrogram).squeeze(0)

    # Measure the reconstruction error.
    # Even though the reconstructed audio is perceptually very close to the original, it doesn't score well on
    # metrics like Si-SNR. It might be that HiFiGAN introduces non-uniform shifts to the reconstructed waveforms.
    # So to evaluate the reconstruction error, we compute mel spectrograms of the reference and reconstructed
    # waveforms, and compare the relative mean squared error of their logarithms.
    final_spec = torchaudio.transforms.MelSpectrogram(sample_rate=bundle.sample_rate, normalized=True)
    # Log mel spectrogram of the estimated waveform
    estimated_spectrogram = final_spec(estimated_waveform)
    estimated_spectrogram = torch.log(torch.clamp(estimated_spectrogram, min=1e-5))
    # Log mel spectrogram of the reference waveform
    ref_spectrogram = final_spec(ref_waveform)
    ref_spectrogram = torch.log(torch.clamp(ref_spectrogram, min=1e-5))
    # Check that the relative MSE is below 4%
    mse = ((estimated_spectrogram - ref_spectrogram) ** 2).mean()
    mean_ref = (ref_spectrogram ** 2).mean()
    print(mse / mean_ref)
    assert mse / mean_ref < 0.04


def get_sin_sweep(sample_rate, length):
    """Create a waveform which sweeps in frequency up to the Nyquist frequency (half of the sample rate)."""
    nyquist_freq = sample_rate / 2
    freq = torch.logspace(0, math.log(0.99 * nyquist_freq, 10), length).unsqueeze(-1)
    amp = torch.ones((length, 1))

    waveform = oscillator_bank(freq, amp, sample_rate=sample_rate)
    return waveform.unsqueeze(0)
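
Outside the assertion, the same bundle round trip can be auditioned on real audio. A minimal sketch, not part of the test; the input filename and the resampling step are assumptions:

import torch
import torchaudio
from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH

bundle = HIFIGAN_VOCODER_V3_LJSPEECH
mel_transform = bundle.get_mel_transform()
vocoder = bundle.get_vocoder()

waveform, sr = torchaudio.load("sample.wav")  # hypothetical input file
waveform = torchaudio.functional.resample(waveform, sr, bundle.sample_rate)

# Trim to a whole number of hops, guarding the zero-remainder case that the
# test's fixed-length slice never hits.
trim = waveform.shape[1] % mel_transform.hop_size
if trim:
    waveform = waveform[:, :-trim]

with torch.no_grad():
    reconstructed = vocoder(mel_transform(waveform)).squeeze(0)

torchaudio.save("reconstructed.wav", reconstructed.cpu(), bundle.sample_rate)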