Commit ffeba11a authored by mayp777

UPDATE

parent 29deb085
......@@ -9,6 +9,25 @@ This tutorial shows how to align transcript to speech with
`CTC-Segmentation of Large Corpora for German End-to-end Speech
Recognition <https://arxiv.org/abs/2007.09127>`__.
.. note::
This tutorial was originally written to illustrate a use case
for the pretrained Wav2Vec2 model.
TorchAudio now has a set of APIs designed for forced alignment.
The `CTC forced alignment API tutorial
<./ctc_forced_alignment_api_tutorial.html>`__ illustrates the
usage of :py:func:`torchaudio.functional.forced_align`, which is
the core API.
If you are looking to align your corpus, we recommend using
:py:class:`torchaudio.pipelines.Wav2Vec2FABundle`, which combines
:py:func:`~torchaudio.functional.forced_align` and other support
functions with a pre-trained model specifically trained for
forced alignment. Please refer to the
`Forced alignment for multilingual data
<forced_alignment_for_multilingual_data_tutorial.html>`__ tutorial,
which illustrates its usage.
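
A minimal sketch of that recommended pipeline, assuming the pre-trained
``MMS_FA`` bundle (see the linked tutorial for the authoritative usage):

.. code::

   bundle = torchaudio.pipelines.MMS_FA
   model = bundle.get_model()
   tokenizer = bundle.get_tokenizer()
   aligner = bundle.get_aligner()

   # Sketch only: ``waveform`` is a 16 kHz Tensor and ``transcript``
   # is a list of words.
   emission, _ = model(waveform)
   token_spans = aligner(emission[0], tokenizer(transcript))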
"""
import torch
......@@ -45,16 +64,11 @@ print(device)
# First, we import the necessary packages and fetch the data that we work on.
#
# %matplotlib inline
from dataclasses import dataclass
import IPython
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams["figure.figsize"] = [16.0, 4.8]
torch.random.manual_seed(0)
SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
......@@ -64,7 +78,7 @@ SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-
# Generate frame-wise label probability
# -------------------------------------
#
# The first step is to generate the label class porbability of each aduio
# The first step is to generate the label class probability of each audio
# frame. We can use a Wav2Vec2 model that is trained for ASR. Here we use
# :py:func:`torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H`.
#
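# A minimal sketch of how the emission matrix is obtained with this bundle
# (the names mirror those used in the rest of this tutorial; see the full
# tutorial source for the exact code):
#
# .. code::
#
#    bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
#    model = bundle.get_model().to(device)
#    labels = bundle.get_labels()
#    with torch.inference_mode():
#        waveform, _ = torchaudio.load(SPEECH_FILE)
#        emissions, _ = model(waveform.to(device))
#        emissions = torch.log_softmax(emissions, dim=-1)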
......@@ -88,17 +102,24 @@ with torch.inference_mode():
emission = emissions[0].cpu().detach()
print(labels)
################################################################################
# Visualization
################################################################################
print(labels)
plt.imshow(emission.T)
plt.colorbar()
plt.title("Frame-wise class probability")
plt.xlabel("Time")
plt.ylabel("Labels")
plt.show()
# ~~~~~~~~~~~~~
def plot():
fig, ax = plt.subplots()
img = ax.imshow(emission.T)
ax.set_title("Frame-wise class probability")
ax.set_xlabel("Time")
ax.set_ylabel("Labels")
fig.colorbar(img, ax=ax, shrink=0.6, location="bottom")
fig.tight_layout()
plot()
######################################################################
# Generate alignment probability (trellis)
......@@ -138,7 +159,9 @@ plt.show()
# [`distill.pub <https://distill.pub/2017/ctc/>`__])
#
transcript = "I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT"
# We enclose the transcript with space tokens, which represent SOS and EOS.
transcript = "|I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|"
dictionary = {c: i for i, c in enumerate(labels)}
tokens = [dictionary[c] for c in transcript]
......@@ -149,21 +172,17 @@ def get_trellis(emission, tokens, blank_id=0):
num_frame = emission.size(0)
num_tokens = len(tokens)
# Trellis has extra diemsions for both time axis and tokens.
# The extra dim for tokens represents <SoS> (start-of-sentence)
# The extra dim for time axis is for simplification of the code.
trellis = torch.empty((num_frame + 1, num_tokens + 1))
trellis[0, 0] = 0
trellis[1:, 0] = torch.cumsum(emission[:, 0], 0)
trellis[0, -num_tokens:] = -float("inf")
trellis[-num_tokens:, 0] = float("inf")
trellis = torch.zeros((num_frame, num_tokens))
trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
trellis[0, 1:] = -float("inf")
trellis[-num_tokens + 1 :, 0] = float("inf")
for t in range(num_frame):
for t in range(num_frame - 1):
trellis[t + 1, 1:] = torch.maximum(
# Score for staying at the same token
trellis[t, 1:] + emission[t, blank_id],
# Score for changing to the next token
trellis[t, :-1] + emission[t, tokens],
trellis[t, :-1] + emission[t, tokens[1:]],
)
return trellis
......@@ -172,11 +191,19 @@ trellis = get_trellis(emission, tokens)
################################################################################
# Visualization
################################################################################
plt.imshow(trellis[1:, 1:].T, origin="lower")
plt.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5))
plt.colorbar()
plt.show()
# ~~~~~~~~~~~~~
def plot():
fig, ax = plt.subplots()
img = ax.imshow(trellis.T, origin="lower")
ax.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5))
ax.annotate("+ Inf", (trellis.size(0) - trellis.size(1) / 5, trellis.size(1) / 3))
fig.colorbar(img, ax=ax, shrink=0.6, location="bottom")
fig.tight_layout()
plot()
######################################################################
# In the above visualization, we can see that there is a trace of high
......@@ -214,38 +241,38 @@ class Point:
def backtrack(trellis, emission, tokens, blank_id=0):
# Note:
# j and t are indices for trellis, which has extra dimensions
# for time and tokens at the beginning.
# When referring to time frame index `T` in trellis,
# the corresponding index in emission is `T-1`.
# Similarly, when referring to token index `J` in trellis,
# the corresponding index in transcript is `J-1`.
j = trellis.size(1) - 1
t_start = torch.argmax(trellis[:, j]).item()
path = []
for t in range(t_start, 0, -1):
t, j = trellis.size(0) - 1, trellis.size(1) - 1
path = [Point(j, t, emission[t, blank_id].exp().item())]
while j > 0:
# Should not happen but just in case
assert t > 0
# 1. Figure out if the current position was stay or change
# Note (again):
# `emission[J-1]` is the emission at time frame `J` of trellis dimension.
# Score for token staying the same from time frame J-1 to T.
stayed = trellis[t - 1, j] + emission[t - 1, blank_id]
# Score for token changing from C-1 at T-1 to J at T.
changed = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
# 2. Store the path with frame-wise probability.
prob = emission[t - 1, tokens[j - 1] if changed > stayed else 0].exp().item()
# Return token index and time index in non-trellis coordinate.
path.append(Point(j - 1, t - 1, prob))
# 3. Update the token
# Frame-wise score of stay vs change
p_stay = emission[t - 1, blank_id]
p_change = emission[t - 1, tokens[j]]
# Context-aware score for stay vs change
stayed = trellis[t - 1, j] + p_stay
changed = trellis[t - 1, j - 1] + p_change
# Update position
t -= 1
if changed > stayed:
j -= 1
if j == 0:
break
else:
raise ValueError("Failed to align")
# Store the path with frame-wise probability.
prob = (p_change if changed > stayed else p_stay).exp().item()
path.append(Point(j, t, prob))
# Now j == 0, which means it reached the SoS.
# Fill up the rest for the sake of visualization
while t > 0:
prob = emission[t - 1, blank_id].exp().item()
path.append(Point(j, t - 1, prob))
t -= 1
return path[::-1]
......@@ -256,21 +283,28 @@ for p in path:
################################################################################
# Visualization
################################################################################
# ~~~~~~~~~~~~~
def plot_trellis_with_path(trellis, path):
# To plot trellis with path, we take advantage of 'nan' value
trellis_with_path = trellis.clone()
for _, p in enumerate(path):
trellis_with_path[p.time_index, p.token_index] = float("nan")
plt.imshow(trellis_with_path[1:, 1:].T, origin="lower")
plt.imshow(trellis_with_path.T, origin="lower")
plt.title("The path found by backtracking")
plt.tight_layout()
plot_trellis_with_path(trellis, path)
plt.title("The path found by backtracking")
plt.show()
######################################################################
# Looking good. Now this path contains repetations for the same labels, so
# Looking good.
######################################################################
# Segment the path
# ----------------
# Now this path contains repetitions of the same labels, so
# let’s merge them to make it close to the original transcript.
#
# When merging the multiple path points, we simply take the average
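#
# A minimal sketch of that merging step (assuming a small ``Segment``
# container with the ``label`` / ``start`` / ``end`` / ``score`` fields
# that the plotting code below relies on) could look like:
#
# .. code::
#
#    @dataclass
#    class Segment:
#        label: str
#        start: int  # first frame (inclusive)
#        end: int  # last frame (exclusive)
#        score: float
#
#    def merge_repeats(path, transcript):
#        segments = []
#        i1 = 0
#        while i1 < len(path):
#            i2 = i1
#            # Group consecutive path points that share the same token index.
#            while i2 < len(path) and path[i1].token_index == path[i2].token_index:
#                i2 += 1
#            # Average the frame-wise probabilities over the group.
#            score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
#            segments.append(
#                Segment(
#                    transcript[path[i1].token_index],
#                    path[i1].time_index,
#                    path[i2 - 1].time_index + 1,
#                    score,
#                )
#            )
#            i1 = i2
#        return segments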
......@@ -320,23 +354,24 @@ for seg in segments:
################################################################################
# Visualization
################################################################################
# ~~~~~~~~~~~~~
def plot_trellis_with_segments(trellis, segments, transcript):
# To plot trellis with path, we take advantage of 'nan' value
trellis_with_path = trellis.clone()
for i, seg in enumerate(segments):
if seg.label != "|":
trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan")
trellis_with_path[seg.start : seg.end, i] = float("nan")
fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5))
fig, [ax1, ax2] = plt.subplots(2, 1, sharex=True)
ax1.set_title("Path, label and probability for each label")
ax1.imshow(trellis_with_path.T, origin="lower")
ax1.set_xticks([])
ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")
for i, seg in enumerate(segments):
if seg.label != "|":
ax1.annotate(seg.label, (seg.start + 0.7, i + 0.3), weight="bold")
ax1.annotate(f"{seg.score:.2f}", (seg.start - 0.3, i + 4.3))
ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")
ax2.set_title("Label probability with and without repetition")
xs, hs, ws = [], [], []
......@@ -345,7 +380,7 @@ def plot_trellis_with_segments(trellis, segments, transcript):
xs.append((seg.end + seg.start) / 2 + 0.4)
hs.append(seg.score)
ws.append(seg.end - seg.start)
ax2.annotate(seg.label, (seg.start + 0.8, -0.07), weight="bold")
ax2.annotate(seg.label, (seg.start + 0.8, -0.07))
ax2.bar(xs, hs, width=ws, color="gray", alpha=0.5, edgecolor="black")
xs, hs = [], []
......@@ -357,17 +392,21 @@ def plot_trellis_with_segments(trellis, segments, transcript):
ax2.bar(xs, hs, width=0.5, alpha=0.5)
ax2.axhline(0, color="black")
ax2.set_xlim(ax1.get_xlim())
ax2.grid(True, axis="y")
ax2.set_ylim(-0.1, 1.1)
fig.tight_layout()
plot_trellis_with_segments(trellis, segments, transcript)
plt.tight_layout()
plt.show()
######################################################################
# Looks good. Now let’s merge the words. The Wav2Vec2 model uses ``'|'``
# Looks good.
######################################################################
# Merge the segments into words
# -----------------------------
# Now let’s merge the words. The Wav2Vec2 model uses ``'|'``
# as the word boundary, so we merge the segments before each occurrence of
# ``'|'``.
#
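#
# A minimal sketch of that word-merging step (building on the illustrative
# ``Segment`` container sketched above) could look like:
#
# .. code::
#
#    def merge_words(segments, separator="|"):
#        words = []
#        i1, i2 = 0, 0
#        while i1 < len(segments):
#            if i2 >= len(segments) or segments[i2].label == separator:
#                if i1 != i2:
#                    segs = segments[i1:i2]
#                    word = "".join(seg.label for seg in segs)
#                    # Weight each character score by its duration in frames.
#                    score = sum(seg.score * (seg.end - seg.start) for seg in segs)
#                    score /= sum(seg.end - seg.start for seg in segs)
#                    words.append(Segment(word, segs[0].start, segs[-1].end, score))
#                i1 = i2 + 1
#                i2 = i1
#            else:
#                i2 += 1
#        return words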
......@@ -400,46 +439,43 @@ for word in word_segments:
################################################################################
# Visualization
################################################################################
def plot_alignments(trellis, segments, word_segments, waveform):
# ~~~~~~~~~~~~~
def plot_alignments(trellis, segments, word_segments, waveform, sample_rate=bundle.sample_rate):
trellis_with_path = trellis.clone()
for i, seg in enumerate(segments):
if seg.label != "|":
trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan")
trellis_with_path[seg.start : seg.end, i] = float("nan")
fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5))
fig, [ax1, ax2] = plt.subplots(2, 1)
ax1.imshow(trellis_with_path[1:, 1:].T, origin="lower")
ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")
ax1.set_facecolor("lightgray")
ax1.set_xticks([])
ax1.set_yticks([])
for word in word_segments:
ax1.axvline(word.start - 0.5)
ax1.axvline(word.end - 0.5)
ax1.axvspan(word.start - 0.5, word.end - 0.5, edgecolor="white", facecolor="none")
for i, seg in enumerate(segments):
if seg.label != "|":
ax1.annotate(seg.label, (seg.start, i + 0.3))
ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 4), fontsize=8)
ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")
# The original waveform
ratio = waveform.size(0) / (trellis.size(0) - 1)
ax2.plot(waveform)
ratio = waveform.size(0) / sample_rate / trellis.size(0)
ax2.specgram(waveform, Fs=sample_rate)
for word in word_segments:
x0 = ratio * word.start
x1 = ratio * word.end
ax2.axvspan(x0, x1, alpha=0.1, color="red")
ax2.annotate(f"{word.score:.2f}", (x0, 0.8))
ax2.axvspan(x0, x1, facecolor="none", edgecolor="white", hatch="/")
ax2.annotate(f"{word.score:.2f}", (x0, sample_rate * 0.51), annotation_clip=False)
for seg in segments:
if seg.label != "|":
ax2.annotate(seg.label, (seg.start * ratio, 0.9))
xticks = ax2.get_xticks()
plt.xticks(xticks, xticks / bundle.sample_rate)
ax2.annotate(seg.label, (seg.start * ratio, sample_rate * 0.55), annotation_clip=False)
ax2.set_xlabel("time [second]")
ax2.set_yticks([])
ax2.set_ylim(-1.0, 1.0)
ax2.set_xlim(0, waveform.size(-1))
fig.tight_layout()
plot_alignments(
......@@ -448,16 +484,16 @@ plot_alignments(
word_segments,
waveform[0],
)
plt.show()
################################################################################
# Audio Samples
# -------------
#
# A trick to embed the resulting audio in the generated file.
# `IPython.display.Audio` has to be the last call in a cell,
# and there should be only one call per cell.
def display_segment(i):
ratio = waveform.size(1) / (trellis.size(0) - 1)
ratio = waveform.size(1) / trellis.size(0)
word = word_segments[i]
x0 = int(ratio * word.start)
x1 = int(ratio * word.end)
......
......@@ -45,6 +45,8 @@ import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
import matplotlib.pyplot as plt
######################################################################
# In addition to ``torchaudio``, ``mir_eval`` is required to perform
# signal-to-distortion ratio (SDR) calculations. To install ``mir_eval``
......@@ -52,30 +54,9 @@ print(torchaudio.__version__)
#
from IPython.display import Audio
from mir_eval import separation
from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
from torchaudio.utils import download_asset
import matplotlib.pyplot as plt
try:
from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
from mir_eval import separation
except ModuleNotFoundError:
try:
import google.colab
print(
"""
To enable running this notebook in Google Colab, install nightly
torch and torchaudio builds by adding the following code block to the top
of the notebook before running it:
!pip3 uninstall -y torch torchvision torchaudio
!pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
!pip3 install mir_eval
"""
)
except ModuleNotFoundError:
pass
raise
######################################################################
# 3. Construct the pipeline
......@@ -130,11 +111,11 @@ from torchaudio.transforms import Fade
def separate_sources(
model,
mix,
segment=10.,
overlap=0.1,
device=None,
model,
mix,
segment=10.0,
overlap=0.1,
device=None,
):
"""
Apply model to a given mixture. Use fades and add segments together in order to process the mixture segment by segment.
......@@ -157,7 +138,7 @@ def separate_sources(
start = 0
end = chunk_len
overlap_frames = overlap * sample_rate
fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape='linear')
fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape="linear")
final = torch.zeros(batch, len(model.sources), channels, length, device=device)
......@@ -181,11 +162,10 @@ def separate_sources(
def plot_spectrogram(stft, title="Spectrogram"):
magnitude = stft.abs()
spectrogram = 20 * torch.log10(magnitude + 1e-8).numpy()
figure, axis = plt.subplots(1, 1)
img = axis.imshow(spectrogram, cmap="viridis", vmin=-60, vmax=0, origin="lower", aspect="auto")
figure.suptitle(title)
plt.colorbar(img, ax=axis)
plt.show()
_, axis = plt.subplots(1, 1)
axis.imshow(spectrogram, cmap="viridis", vmin=-60, vmax=0, origin="lower", aspect="auto")
axis.set_title(title)
plt.tight_layout()
######################################################################
......@@ -208,7 +188,7 @@ def plot_spectrogram(stft, title="Spectrogram"):
# We download the audio file from our storage. Feel free to download another file and use audio from a specific path.
SAMPLE_SONG = download_asset("tutorial-assets/hdemucs_mix.wav")
waveform, sample_rate = torchaudio.load(SAMPLE_SONG) # replace SAMPLE_SONG with desired path for different song
waveform.to(device)
waveform = waveform.to(device)
mixture = waveform
# parameters
......@@ -265,12 +245,13 @@ stft = torchaudio.transforms.Spectrogram(
# scores.
#
def output_results(original_source: torch.Tensor, predicted_source: torch.Tensor, source: str):
print("SDR score is:",
separation.bss_eval_sources(
original_source.detach().numpy(),
predicted_source.detach().numpy())[0].mean())
plot_spectrogram(stft(predicted_source)[0], f'Spectrogram {source}')
print(
"SDR score is:",
separation.bss_eval_sources(original_source.detach().numpy(), predicted_source.detach().numpy())[0].mean(),
)
plot_spectrogram(stft(predicted_source)[0], f"Spectrogram - {source}")
return Audio(predicted_source, rate=sample_rate)
......@@ -285,23 +266,19 @@ bass_original = download_asset("tutorial-assets/hdemucs_bass_segment.wav")
vocals_original = download_asset("tutorial-assets/hdemucs_vocals_segment.wav")
other_original = download_asset("tutorial-assets/hdemucs_other_segment.wav")
drums_spec = audios["drums"][:, frame_start: frame_end]
drums_spec = audios["drums"][:, frame_start:frame_end].cpu()
drums, sample_rate = torchaudio.load(drums_original)
drums.to(device)
bass_spec = audios["bass"][:, frame_start: frame_end]
bass_spec = audios["bass"][:, frame_start:frame_end].cpu()
bass, sample_rate = torchaudio.load(bass_original)
bass.to(device)
vocals_spec = audios["vocals"][:, frame_start: frame_end]
vocals_spec = audios["vocals"][:, frame_start:frame_end].cpu()
vocals, sample_rate = torchaudio.load(vocals_original)
vocals.to(device)
other_spec = audios["other"][:, frame_start: frame_end]
other_spec = audios["other"][:, frame_start:frame_end].cpu()
other, sample_rate = torchaudio.load(other_original)
other.to(device)
mix_spec = mixture[:, frame_start: frame_end]
mix_spec = mixture[:, frame_start:frame_end].cpu()
######################################################################
......@@ -316,7 +293,7 @@ mix_spec = mixture[:, frame_start: frame_end]
#
# Mixture Clip
plot_spectrogram(stft(mix_spec)[0], "Spectrogram Mixture")
plot_spectrogram(stft(mix_spec)[0], "Spectrogram - Mixture")
Audio(mix_spec, rate=sample_rate)
######################################################################
......
......@@ -37,6 +37,10 @@ print(torch.__version__)
print(torchaudio.__version__)
import matplotlib.pyplot as plt
import mir_eval
from IPython.display import Audio
######################################################################
# 2. Preparation
# --------------
......@@ -59,10 +63,6 @@ print(torchaudio.__version__)
from pesq import pesq
from pystoi import stoi
import mir_eval
import matplotlib.pyplot as plt
from IPython.display import Audio
from torchaudio.utils import download_asset
######################################################################
......@@ -98,23 +98,21 @@ SAMPLE_NOISE = download_asset("tutorial-assets/mvdr/noise.wav")
#
def plot_spectrogram(stft, title="Spectrogram", xlim=None):
def plot_spectrogram(stft, title="Spectrogram"):
magnitude = stft.abs()
spectrogram = 20 * torch.log10(magnitude + 1e-8).numpy()
figure, axis = plt.subplots(1, 1)
img = axis.imshow(spectrogram, cmap="viridis", vmin=-100, vmax=0, origin="lower", aspect="auto")
figure.suptitle(title)
axis.set_title(title)
plt.colorbar(img, ax=axis)
plt.show()
def plot_mask(mask, title="Mask", xlim=None):
def plot_mask(mask, title="Mask"):
mask = mask.numpy()
figure, axis = plt.subplots(1, 1)
img = axis.imshow(mask, cmap="viridis", origin="lower", aspect="auto")
figure.suptitle(title)
axis.set_title(title)
plt.colorbar(img, ax=axis)
plt.show()
def si_snr(estimate, reference, epsilon=1e-8):
......
"""
Accelerated video decoding with NVDEC
=====================================
.. _nvdec_tutorial:
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC)
with TorchAudio, and how it improves the performance of video decoding.
"""
######################################################################
#
# .. note::
#
# This tutorial requires FFmpeg libraries compiled with HW
# acceleration enabled.
#
# Please refer to
# :ref:`Enabling GPU video decoder/encoder <enabling_hw_decoder>`
# for how to build FFmpeg with HW acceleration.
#
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
import os
import time
import matplotlib.pyplot as plt
from torchaudio.io import StreamReader
######################################################################
#
# Check the prerequisites
# -----------------------
#
# First, we check that TorchAudio correctly detects FFmpeg libraries
# that support HW decoder/encoder.
#
from torchaudio.utils import ffmpeg_utils
######################################################################
#
print("FFmpeg Library versions:")
for k, ver in ffmpeg_utils.get_versions().items():
print(f" {k}:\t{'.'.join(str(v) for v in ver)}")
######################################################################
#
print("Available NVDEC Decoders:")
for k in ffmpeg_utils.get_video_decoders().keys():
if "cuvid" in k:
print(f" - {k}")
######################################################################
#
print("Available GPU:")
print(torch.cuda.get_device_properties(0))
######################################################################
#
# We will use the following video, which has the following properties:
#
# - Codec: H.264
# - Resolution: 960x540
# - FPS: 29.97
# - Pixel format: YUV420P
#
# .. raw:: html
#
# <video style="max-width: 100%" controls>
# <source src="https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" type="video/mp4">
# </video>
######################################################################
#
src = torchaudio.utils.download_asset(
"tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
)
######################################################################
# Decoding videos with NVDEC
# --------------------------
#
# To use HW video decoder, you need to specify the HW decoder when
# defining the output video stream by passing ``decoder`` option to
# :py:meth:`~torchaudio.io.StreamReader.add_video_stream` method.
#
s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid")
s.fill_buffer()
(video,) = s.pop_chunks()
######################################################################
#
# The video frames are decoded and returned as a tensor in NCHW format.
print(video.shape, video.dtype)
######################################################################
#
# By default, the decoded frames are sent back to CPU memory, and
# CPU tensors are created.
print(video.device)
######################################################################
#
# By specifying the ``hw_accel`` option, you can convert the decoded
# frames to CUDA tensors.
# The ``hw_accel`` option takes a string value, which is passed to
# :py:class:`torch.device`.
#
# .. note::
#
# Currently, the ``hw_accel`` option and
# :py:meth:`~torchaudio.io.StreamReader.add_basic_video_stream`
# are not compatible. ``add_basic_video_stream`` adds a post-decoding
# process, which is designed for frames in CPU memory.
# Please use :py:meth:`~torchaudio.io.StreamReader.add_video_stream`.
#
s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid", hw_accel="cuda:0")
s.fill_buffer()
(video,) = s.pop_chunks()
print(video.shape, video.dtype, video.device)
######################################################################
# .. note::
#
# When there are multiple GPUs available, ``StreamReader`` by
# default uses the first GPU. You can change this by providing
# the ``"gpu"`` option.
#
# .. code::
#
# # Video data is sent to CUDA device 0, decoded and
# # converted on the same device.
# s.add_video_stream(
# ...,
# decoder="h264_cuvid",
# decoder_option={"gpu": "0"},
# hw_accel="cuda:0",
# )
#
# .. note::
#
# The ``"gpu"`` option and the ``hw_accel`` option can be specified
# independently. If they do not match, decoded frames are
# transferred to the device specified by ``hw_accel``
# automatically.
#
# .. code::
#
# # Video data is sent to CUDA device 0, and decoded there.
# # Then it is transferred to CUDA device 1, and converted to
# # CUDA tensor.
# s.add_video_stream(
# ...,
# decoder="h264_cuvid",
# decoder_option={"gpu": "0"},
# hw_accel="cuda:1",
# )
######################################################################
# Visualization
# -------------
#
# Let's look at the frames decoded by HW decoder and compare them
# against equivalent results from software decoders.
#
# The following function seeks into the given timestamp and decodes one
# frame with the specified decoder.
def test_decode(decoder: str, seek: float):
s = StreamReader(src)
s.seek(seek)
s.add_video_stream(1, decoder=decoder)
s.fill_buffer()
(video,) = s.pop_chunks()
return video[0]
######################################################################
#
timestamps = [12, 19, 45, 131, 180]
cpu_frames = [test_decode(decoder="h264", seek=ts) for ts in timestamps]
cuda_frames = [test_decode(decoder="h264_cuvid", seek=ts) for ts in timestamps]
######################################################################
#
# .. note::
#
# Currently, HW decoder does not support colorspace conversion.
# Decoded frames are in YUV format.
# The following function performs YUV to RGB conversion
# (and axis shuffling for plotting).
def yuv_to_rgb(frames):
frames = frames.cpu().to(torch.float)
y = frames[..., 0, :, :]
u = frames[..., 1, :, :]
v = frames[..., 2, :, :]
y /= 255
u = u / 255 - 0.5
v = v / 255 - 0.5
r = y + 1.14 * v
g = y + -0.396 * u - 0.581 * v
b = y + 2.029 * u
rgb = torch.stack([r, g, b], -1)
rgb = (rgb * 255).clamp(0, 255).to(torch.uint8)
return rgb.numpy()
######################################################################
#
# Now we visualize the results.
#
def plot():
n_rows = len(timestamps)
fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0])
for i in range(n_rows):
axes[i][0].imshow(yuv_to_rgb(cpu_frames[i]))
axes[i][1].imshow(yuv_to_rgb(cuda_frames[i]))
axes[0][0].set_title("Software decoder")
axes[0][1].set_title("HW decoder")
plt.setp(axes, xticks=[], yticks=[])
plt.tight_layout()
plot()
######################################################################
#
# They are indistinguishable to the eyes of the author.
# Feel free to let us know if you spot something. :)
#
######################################################################
# HW resizing and cropping
# ------------------------
#
# You can use ``decoder_option`` argument to provide decoder-specific
# options.
#
# The following options are often relevant in preprocessing.
#
# - ``resize``: Resize the frame into ``(width)x(height)``.
# - ``crop``: Crop the frame ``(top)x(bottom)x(left)x(right)``.
#   Note that the specified values are the number of rows/columns removed.
#   The final image size is ``(width - left - right)x(height - top - bottom)``.
# If ``crop`` and ``resize`` options are used together,
# ``crop`` is performed first.
#
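#
# For instance, a worked example of the crop arithmetic, using this
# tutorial's 960x540 test video and the crop value used below (plain
# arithmetic, added here for illustration):
#
# .. code::
#
#    # crop=135x135x240x240 applied to a 960x540 frame
#    width, height = 960, 540
#    top, bottom, left, right = 135, 135, 240, 240
#    print(f"{width - left - right}x{height - top - bottom}")  # -> 480x270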
# For other available options, please run
# ``ffmpeg -h decoder=h264_cuvid``.
#
def test_options(option):
s = StreamReader(src)
s.seek(87)
s.add_video_stream(1, decoder="h264_cuvid", hw_accel="cuda:0", decoder_option=option)
s.fill_buffer()
(video,) = s.pop_chunks()
print(f"Option: {option}:\t{video.shape}")
return video[0]
######################################################################
#
original = test_options(option=None)
resized = test_options(option={"resize": "480x270"})
cropped = test_options(option={"crop": "135x135x240x240"})
cropped_and_resized = test_options(option={"crop": "135x135x240x240", "resize": "640x360"})
######################################################################
#
def plot():
fig, axes = plt.subplots(2, 2, figsize=[12.8, 9.6])
axes[0][0].imshow(yuv_to_rgb(original))
axes[0][1].imshow(yuv_to_rgb(resized))
axes[1][0].imshow(yuv_to_rgb(cropped))
axes[1][1].imshow(yuv_to_rgb(cropped_and_resized))
axes[0][0].set_title("Original")
axes[0][1].set_title("Resized")
axes[1][0].set_title("Cropped")
axes[1][1].set_title("Cropped and resized")
plt.tight_layout()
return fig
plot()
######################################################################
# Comparing resizing methods
# --------------------------
#
# Unlike software scaling, NVDEC does not provide an option to choose
# the scaling algorithm.
# In ML applications, it is often necessary to construct a
# preprocessing pipeline with similar numerical properties.
# So here we compare the result of hardware resizing against software
# resizing with different algorithms.
#
# We will use the following video, which contains the test pattern
# generated using the following command.
#
# .. code::
#
# ffmpeg -y -f lavfi -t 12.05 -i mptestsrc -movflags +faststart mptestsrc.mp4
#
# .. raw:: html
#
# <video style="max-width: 100%" controls>
# <source src="https://download.pytorch.org/torchaudio/tutorial-assets/mptestsrc.mp4" type="video/mp4">
# </video>
######################################################################
#
test_src = torchaudio.utils.download_asset("tutorial-assets/mptestsrc.mp4")
######################################################################
# The following function decodes video and
# applies the specified scaling algorithm.
#
def decode_resize_ffmpeg(mode, height, width, seek):
filter_desc = None if mode is None else f"scale={width}:{height}:sws_flags={mode}"
s = StreamReader(test_src)
s.add_video_stream(1, filter_desc=filter_desc)
s.seek(seek)
s.fill_buffer()
(chunk,) = s.pop_chunks()
return chunk
######################################################################
# The following function uses the HW decoder to decode video and resize it.
#
def decode_resize_cuvid(height, width, seek):
s = StreamReader(test_src)
s.add_video_stream(1, decoder="h264_cuvid", decoder_option={"resize": f"{width}x{height}"}, hw_accel="cuda:0")
s.seek(seek)
s.fill_buffer()
(chunk,) = s.pop_chunks()
return chunk.cpu()
######################################################################
# Now we execute them and visualize the resulting frames.
params = {"height": 224, "width": 224, "seek": 3}
frames = [
decode_resize_ffmpeg(None, **params),
decode_resize_ffmpeg("neighbor", **params),
decode_resize_ffmpeg("bilinear", **params),
decode_resize_ffmpeg("bicubic", **params),
decode_resize_cuvid(**params),
decode_resize_ffmpeg("spline", **params),
decode_resize_ffmpeg("lanczos:param0=1", **params),
decode_resize_ffmpeg("lanczos:param0=3", **params),
decode_resize_ffmpeg("lanczos:param0=5", **params),
]
######################################################################
#
def plot():
fig, axes = plt.subplots(3, 3, figsize=[12.8, 15.2])
for i, f in enumerate(frames):
h, w = f.shape[2:4]
f = f[..., : h // 4, : w // 4]
axes[i // 3][i % 3].imshow(yuv_to_rgb(f[0]))
axes[0][0].set_title("Original")
axes[0][1].set_title("nearest neighbor")
axes[0][2].set_title("bilinear")
axes[1][0].set_title("bicubic")
axes[1][1].set_title("NVDEC")
axes[1][2].set_title("spline")
axes[2][0].set_title("lanczos(1)")
axes[2][1].set_title("lanczos(3)")
axes[2][2].set_title("lanczos(5)")
plt.setp(axes, xticks=[], yticks=[])
plt.tight_layout()
plot()
######################################################################
# None of them is exactly the same. To the eyes of the authors, lanczos(1)
# appears to be the most similar to NVDEC.
# Bicubic looks close as well.
######################################################################
#
# Benchmark NVDEC with StreamReader
# ---------------------------------
#
# In this section, we compare the performance of software video
# decoding and HW video decoding.
#
######################################################################
# Decode as CUDA frames
# ---------------------
#
# First, we compare the time it takes for the software decoder and the
# hardware decoder to decode the same video.
# To make the result comparable, when using the software decoder, we move
# the resulting tensor to CUDA.
#
# The test procedures look like the following:
#
# - Use the hardware decoder and place the data on CUDA directly.
# - Use the software decoder, generate CPU Tensors, and move them to CUDA.
#
# .. note::
#
# Because the HW decoder currently only supports reading videos in
# YUV444P format, we decode frames into YUV444P format for the
# software decoder case as well.
#
######################################################################
# The following function implements the hardware decoder test case.
def test_decode_cuda(src, decoder, hw_accel="cuda", frames_per_chunk=5):
s = StreamReader(src)
s.add_video_stream(frames_per_chunk, decoder=decoder, hw_accel=hw_accel)
num_frames = 0
chunk = None
t0 = time.monotonic()
for (chunk,) in s.stream():
num_frames += chunk.shape[0]
elapsed = time.monotonic() - t0
print(f" - Shape: {chunk.shape}")
fps = num_frames / elapsed
print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
return fps
######################################################################
# The following function implements the software decoder test case.
def test_decode_cpu(src, threads, decoder=None, frames_per_chunk=5):
s = StreamReader(src)
s.add_video_stream(frames_per_chunk, decoder=decoder, decoder_option={"threads": f"{threads}"})
num_frames = 0
device = torch.device("cuda")
t0 = time.monotonic()
for i, (chunk,) in enumerate(s.stream()):
if i == 0:
print(f" - Shape: {chunk.shape}")
num_frames += chunk.shape[0]
chunk = chunk.to(device)
elapsed = time.monotonic() - t0
fps = num_frames / elapsed
print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
return fps
######################################################################
# For each resolution of video, we run multiple software decoder test
# cases with different numbers of threads.
def run_decode_tests(src, frames_per_chunk=5):
fps = []
print(f"Testing: {os.path.basename(src)}")
for threads in [1, 4, 8, 16]:
print(f"* Software decoding (num_threads={threads})")
fps.append(test_decode_cpu(src, threads))
print("* Hardware decoding")
fps.append(test_decode_cuda(src, decoder="h264_cuvid"))
return fps
######################################################################
# Now we run the tests with videos of different resolutions.
#
# QVGA
# ----
src_qvga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_qvga.h264.mp4")
fps_qvga = run_decode_tests(src_qvga)
######################################################################
# VGA
# ---
src_vga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_vga.h264.mp4")
fps_vga = run_decode_tests(src_vga)
######################################################################
# XGA
# ---
src_xga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_xga.h264.mp4")
fps_xga = run_decode_tests(src_xga)
######################################################################
# Result
# ------
#
# Now we plot the result.
def plot():
fig, ax = plt.subplots(figsize=[9.6, 6.4])
for items in zip(fps_qvga, fps_vga, fps_xga, "ov^sx"):
ax.plot(items[:-1], marker=items[-1])
ax.grid(axis="both")
ax.set_xticks([0, 1, 2], ["QVGA (320x240)", "VGA (640x480)", "XGA (1024x768)"])
ax.legend(
[
"Software Decoding (threads=1)",
"Software Decoding (threads=4)",
"Software Decoding (threads=8)",
"Software Decoding (threads=16)",
"Hardware Decoding (CUDA Tensor)",
]
)
ax.set_title("Speed of processing video frames")
ax.set_ylabel("Frames per second")
plt.tight_layout()
plot()
######################################################################
#
# We observe a couple of things:
#
# - Increasing the number of threads in software decoding makes the
# pipeline faster, but the performance saturates around 8 threads.
# - The performance gain from using hardware decoder depends on the
# resolution of video.
# - At lower resolutions like QVGA, hardware decoding is slower than
#   software decoding.
# - At higher resolutions like XGA, hardware decoding is faster
#   than software decoding.
#
#
# It is worth noting that the performance gain also depends on the
# type of GPU.
# We observed that when decoding VGA videos using V100 or A100 GPUs,
# hardware decoders are slower than software decoders, but on an A10
# GPU the hardware decoder is faster than the software decoder.
#
######################################################################
# Decode and resize
# -----------------
#
# Next, we add resize operation to the pipeline.
# We will compare the following pipelines.
#
# 1. Decode video using software decoder and read the frames as
# PyTorch Tensor. Resize the tensor using
# :py:func:`torch.nn.functional.interpolate`, then send
# the resulting tensor to CUDA device.
# 2. Decode video using software decoder, resize the frame with
# FFmpeg's filter graph, read the resized frames as PyTorch tensor,
# then send it to CUDA device.
# 3. Decode and resize video simultaneously with HW decoder, read the
# resulting frames as CUDA tensor.
#
# Pipeline 1 represents common video loading implementations.
#
# Pipeline 2 uses FFmpeg's filter graph, which makes it possible to
# manipulate raw frames before converting them to Tensors.
#
# Pipeline 3 has the minimum amount of data transfer from CPU to
# CUDA, which contributes significantly to performant data loading.
#
######################################################################
# The following function implements pipeline 1. It uses PyTorch's
# :py:func:`torch.nn.functional.interpolate`.
# We use ``bicubic`` mode, as we saw that the resulting frames are
# closest to NVDEC resizing.
#
def test_decode_then_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
s = StreamReader(src)
s.add_video_stream(frames_per_chunk, decoder_option={"threads": "8"})
num_frames = 0
device = torch.device("cuda")
chunk = None
t0 = time.monotonic()
for (chunk,) in s.stream():
num_frames += chunk.shape[0]
chunk = torch.nn.functional.interpolate(chunk, [height, width], mode=mode, antialias=True)
chunk = chunk.to(device)
elapsed = time.monotonic() - t0
fps = num_frames / elapsed
print(f" - Shape: {chunk.shape}")
print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
return fps
######################################################################
# The following function implements pipeline 2. Frames are resized
# as part of the decoding process, then sent to the CUDA device.
#
# We use ``bicubic`` mode to make the result comparable with the
# PyTorch-based implementation above.
#
def test_decode_and_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
s = StreamReader(src)
s.add_video_stream(
frames_per_chunk, filter_desc=f"scale={width}:{height}:sws_flags={mode}", decoder_option={"threads": "8"}
)
num_frames = 0
device = torch.device("cuda")
chunk = None
t0 = time.monotonic()
for (chunk,) in s.stream():
num_frames += chunk.shape[0]
chunk = chunk.to(device)
elapsed = time.monotonic() - t0
fps = num_frames / elapsed
print(f" - Shape: {chunk.shape}")
print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
return fps
######################################################################
# The following function implements pipeline 3. Resizing is
# performed by NVDEC and the resulting tensor is placed on CUDA memory.
def test_hw_decode_and_resize(src, decoder, decoder_option, hw_accel="cuda", frames_per_chunk=5):
s = StreamReader(src)
s.add_video_stream(5, decoder=decoder, decoder_option=decoder_option, hw_accel=hw_accel)
num_frames = 0
chunk = None
t0 = time.monotonic()
for (chunk,) in s.stream():
num_frames += chunk.shape[0]
elapsed = time.monotonic() - t0
fps = num_frames / elapsed
print(f" - Shape: {chunk.shape}")
print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
return fps
######################################################################
#
# The following function runs the benchmark functions on the given source.
#
def run_resize_tests(src):
print(f"Testing: {os.path.basename(src)}")
height, width = 224, 224
print("* Software decoding with PyTorch interpolate")
cpu_resize1 = test_decode_then_resize(src, height=height, width=width)
print("* Software decoding with FFmpeg scale")
cpu_resize2 = test_decode_and_resize(src, height=height, width=width)
print("* Hardware decoding with resize")
cuda_resize = test_hw_decode_and_resize(src, decoder="h264_cuvid", decoder_option={"resize": f"{width}x{height}"})
return [cpu_resize1, cpu_resize2, cuda_resize]
######################################################################
#
# Now we run the tests.
######################################################################
# QVGA
# ----
fps_qvga = run_resize_tests(src_qvga)
######################################################################
# VGA
# ---
fps_vga = run_resize_tests(src_vga)
######################################################################
# XGA
# ---
fps_xga = run_resize_tests(src_xga)
######################################################################
# Result
# ------
# Now we plot the result.
#
def plot():
fig, ax = plt.subplots(figsize=[9.6, 6.4])
for items in zip(fps_qvga, fps_vga, fps_xga, "ov^sx"):
ax.plot(items[:-1], marker=items[-1])
ax.grid(axis="both")
ax.set_xticks([0, 1, 2], ["QVGA (320x240)", "VGA (640x480)", "XGA (1024x768)"])
ax.legend(
[
"Software decoding\nwith resize\n(PyTorch interpolate)",
"Software decoding\nwith resize\n(FFmpeg scale)",
"NVDEC\nwith resizing",
]
)
ax.set_title("Speed of processing video frames")
ax.set_xlabel("Input video resolution")
ax.set_ylabel("Frames per second")
plt.tight_layout()
plot()
######################################################################
#
# The hardware decoder shows a similar trend to the previous experiment.
# In fact, the performance is almost the same. Hardware resizing has
# almost zero overhead for scaling down the frames.
#
# Software decoding also shows a similar trend. Performing resizing as
# part of decoding is faster. One possible explanation is that video
# frames are internally stored as YUV420P, which has half the number
# of pixels compared to RGB24 or YUV444P. This means that if resizing
# happens before the frame data is copied to a PyTorch tensor, the
# number of pixels manipulated and copied is smaller than when resizing
# is applied after the frames are converted to a Tensor.
#
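# As a back-of-the-envelope check of that claim (an illustration added here,
# not part of the original benchmark): 8-bit YUV420P stores a full-resolution
# luma plane plus two quarter-resolution chroma planes, i.e. 1.5 values per
# pixel, whereas RGB24 and YUV444P store 3 values per pixel.

width, height = 1024, 768  # XGA
print("YUV420P values per frame:      ", int(width * height * 1.5))  # 1179648
print("RGB24/YUV444P values per frame:", width * height * 3)  # 2359296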
######################################################################
#
# Tag: :obj:`torchaudio.io`
"""
Accelerated video encoding with NVENC
=====================================
.. _nvenc_tutorial:
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use NVIDIA’s hardware video encoder (NVENC)
with TorchAudio, and how it improves the performance of video encoding.
"""
######################################################################
# .. note::
#
# This tutorial requires FFmpeg libraries compiled with HW
# acceleration enabled.
#
# Please refer to
# :ref:`Enabling GPU video decoder/encoder <enabling_hw_decoder>`
# for how to build FFmpeg with HW acceleration.
#
# .. note::
#
# Most modern GPUs have both HW decoder and encoder, but some
# highend GPUs like A100 and H100 do not have HW encoder.
# Please refer to the following for the availability and
# format coverage.
# https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new
#
# Attempting to use HW encoder on these GPUs fails with an error
# message like ``Generic error in an external library``.
# You can enable debug logging with
# :py:func:`torchaudio.utils.ffmpeg_utils.set_log_level` to see more
# detailed error messages issued along the way.
#
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
import io
import time
import matplotlib.pyplot as plt
from IPython.display import Video
from torchaudio.io import StreamReader, StreamWriter
######################################################################
#
# Check the prerequisites
# -----------------------
#
# First, we check that TorchAudio correctly detects FFmpeg libraries
# that support HW decoder/encoder.
#
from torchaudio.utils import ffmpeg_utils
######################################################################
#
print("FFmpeg Library versions:")
for k, ver in ffmpeg_utils.get_versions().items():
print(f" {k}:\t{'.'.join(str(v) for v in ver)}")
######################################################################
#
print("Available NVENC Encoders:")
for k in ffmpeg_utils.get_video_encoders().keys():
if "nvenc" in k:
print(f" - {k}")
######################################################################
#
print("Available GPU:")
print(torch.cuda.get_device_properties(0))
######################################################################
# We use the following helper function to generate test frame data.
# For the details of synthetic video generation, please refer to
# :ref:`StreamReader Advanced Usage <lavfi>`.
def get_data(height, width, format="yuv444p", frame_rate=30000 / 1001, duration=4):
src = f"testsrc2=rate={frame_rate}:size={width}x{height}:duration={duration}"
s = StreamReader(src=src, format="lavfi")
s.add_basic_video_stream(-1, format=format)
s.process_all_packets()
(video,) = s.pop_chunks()
return video
######################################################################
# Encoding videos with NVENC
# --------------------------
#
# To use HW video encoder, you need to specify the HW encoder when
# defining the output video stream by providing ``encoder`` option to
# :py:meth:`~torchaudio.io.StreamWriter.add_video_stream`.
#
######################################################################
#
pict_config = {
"height": 360,
"width": 640,
"frame_rate": 30000 / 1001,
"format": "yuv444p",
}
frame_data = get_data(**pict_config)
######################################################################
#
w = StreamWriter(io.BytesIO(), format="mp4")
w.add_video_stream(**pict_config, encoder="h264_nvenc", encoder_format="yuv444p")
with w.open():
w.write_video_chunk(0, frame_data)
######################################################################
# Similar to the HW decoder, by default, the encoder expects the frame
# data to be on CPU memory. To send data from CUDA memory, you need to
# specify the ``hw_accel`` option.
#
buffer = io.BytesIO()
w = StreamWriter(buffer, format="mp4")
w.add_video_stream(**pict_config, encoder="h264_nvenc", encoder_format="yuv444p", hw_accel="cuda:0")
with w.open():
w.write_video_chunk(0, frame_data.to(torch.device("cuda:0")))
buffer.seek(0)
video_cuda = buffer.read()
######################################################################
#
Video(video_cuda, embed=True, mimetype="video/mp4")
######################################################################
# Benchmark NVENC with StreamWriter
# ---------------------------------
#
# Now we compare the performance of software encoder and hardware
# encoder.
#
# Similar to the benchmark in NVDEC, we process videos of different
# resolutions and measure the time it takes to encode them.
#
# We also measure the size of resulting video file.
######################################################################
# The following function encodes the given frames and measures the time
# it takes to encode them, as well as the size of the resulting video data.
#
def test_encode(data, encoder, width, height, hw_accel=None, **config):
assert data.is_cuda
buffer = io.BytesIO()
s = StreamWriter(buffer, format="mp4")
s.add_video_stream(encoder=encoder, width=width, height=height, hw_accel=hw_accel, **config)
with s.open():
t0 = time.monotonic()
if hw_accel is None:
data = data.to("cpu")
s.write_video_chunk(0, data)
elapsed = time.monotonic() - t0
size = buffer.tell()
fps = len(data) / elapsed
print(f" - Processed {len(data)} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
print(f" - Encoded data size: {size} bytes")
return elapsed, size
######################################################################
# We conduct the tests for the following configurations:
#
# - Software encoder with 1, 4, and 8 threads
# - Hardware encoder with and without the ``hw_accel`` option
#
def run_tests(height, width, duration=4):
# Generate the test data
print(f"Testing resolution: {width}x{height}")
pict_config = {
"height": height,
"width": width,
"frame_rate": 30000 / 1001,
"format": "yuv444p",
}
data = get_data(**pict_config, duration=duration)
data = data.to(torch.device("cuda:0"))
times = []
sizes = []
# Test software encoding
encoder_config = {
"encoder": "libx264",
"encoder_format": "yuv444p",
}
for i, num_threads in enumerate([1, 4, 8]):
print(f"* Software Encoder (num_threads={num_threads})")
time_, size = test_encode(
data,
encoder_option={"threads": str(num_threads)},
**pict_config,
**encoder_config,
)
times.append(time_)
if i == 0:
sizes.append(size)
# Test hardware encoding
encoder_config = {
"encoder": "h264_nvenc",
"encoder_format": "yuv444p",
"encoder_option": {"gpu": "0"},
}
for i, hw_accel in enumerate([None, "cuda"]):
print(f"* Hardware Encoder {'(CUDA frames)' if hw_accel else ''}")
time_, size = test_encode(
data,
**pict_config,
**encoder_config,
hw_accel=hw_accel,
)
times.append(time_)
if i == 0:
sizes.append(size)
return times, sizes
######################################################################
# And we change the resolution of videos to see how these measurements
# change.
#
# 360P
# ----
#
time_360, size_360 = run_tests(360, 640)
######################################################################
# 720P
# ----
#
time_720, size_720 = run_tests(720, 1280)
######################################################################
# 1080P
# -----
#
time_1080, size_1080 = run_tests(1080, 1920)
######################################################################
# Now we plot the result.
#
def plot():
fig, axes = plt.subplots(2, 1, sharex=True, figsize=[9.6, 7.2])
for items in zip(time_360, time_720, time_1080, "ov^X+"):
axes[0].plot(items[:-1], marker=items[-1])
axes[0].grid(axis="both")
axes[0].set_xticks([0, 1, 2], ["360p", "720p", "1080p"], visible=True)
axes[0].tick_params(labeltop=False)
axes[0].legend(
[
"Software Encoding (threads=1)",
"Software Encoding (threads=4)",
"Software Encoding (threads=8)",
"Hardware Encoding (CPU Tensor)",
"Hardware Encoding (CUDA Tensor)",
]
)
axes[0].set_title("Time to encode videos with different resolutions")
axes[0].set_ylabel("Time [s]")
for items in zip(size_360, size_720, size_1080, "v^"):
axes[1].plot(items[:-1], marker=items[-1])
axes[1].grid(axis="both")
axes[1].set_xticks([0, 1, 2], ["360p", "720p", "1080p"])
axes[1].set_ylabel("The encoded size [bytes]")
axes[1].set_title("The size of encoded videos")
axes[1].legend(
[
"Software Encoding",
"Hardware Encoding",
]
)
plt.tight_layout()
plot()
######################################################################
# Result
# ------
#
# We observe a couple of things:
#
# - The time to encode video grows as the resolution becomes larger.
# - In the case of software encoding, increasing the number of threads
#   helps reduce the encoding time.
# - The gain from extra threads diminishes around 8.
# - Hardware encoding is faster than software encoding in general.
# - Using ``hw_accel`` does not improve the speed of encoding itself
#   as much.
# - The size of the resulting videos grows as the resolution becomes
#   larger.
# - The hardware encoder produces smaller video files at larger resolutions.
#
# The last point is somewhat strange to the author (who is not an
# expert in video production).
# It is often said that hardware encoders produce larger videos
# compared to software encoders.
# Some say that software encoders allow fine-grained control over
# the encoding configuration, so the resulting video is more optimal.
# Meanwhile, hardware encoders are optimized for performance, and thus
# do not provide as much control over quality and binary size.
#
######################################################################
# Quality Spotcheck
# -----------------
#
# So, how is the quality of the videos produced with hardware encoders?
# A quick spot check of high-resolution videos reveals that they have
# more noticeable artifacts at higher resolutions,
# which might explain the smaller binary size (meaning, the encoder is
# not allocating enough bits to produce quality output).
#
# The following images are raw frames of videos encoded with hardware
# encoders.
#
######################################################################
# 360P
# ----
#
# .. raw:: html
#
# <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_360_097.png" alt="NVENC sample 360P">
######################################################################
# 720P
# ----
#
# .. raw:: html
#
# <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_720_097.png" alt="NVENC sample 720P">
######################################################################
# 1080P
# -----
#
# .. raw:: html
#
# <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_1080_097.png" alt="NVENC sample 1080P">
######################################################################
#
# We can see that there are noticeably more artifacts at higher
# resolutions.
#
# Perhaps one might be able to reduce these using the ``encoder_option``
# argument.
# We did not try, but if you try that and find a better quality
# setting, feel free to let us know. ;)
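#
# As an illustration only (the particular option values below are untested
# assumptions, not settings benchmarked in this tutorial), NVENC exposes
# rate-control options such as ``preset``, ``rc`` and ``cq`` (see
# ``ffmpeg -h encoder=h264_nvenc``), which can be forwarded through
# ``encoder_option``:
#
# .. code::
#
#    w = StreamWriter(io.BytesIO(), format="mp4")
#    w.add_video_stream(
#        **pict_config,
#        encoder="h264_nvenc",
#        encoder_format="yuv444p",
#        # Hypothetical quality-oriented settings.
#        encoder_option={"preset": "slow", "rc": "vbr", "cq": "19"},
#    )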
######################################################################
#
# Tag: :obj:`torchaudio.io`
......@@ -13,14 +13,11 @@ to perform online speech recognition.
#
# .. note::
#
# This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece.
# This tutorial requires FFmpeg libraries and SentencePiece.
#
# There are multiple ways to install FFmpeg libraries.
# If you are using Anaconda Python distribution,
# ``conda install 'ffmpeg<4.4'`` will install
# the required FFmpeg libraries.
# Please refer to :ref:`Optional Dependencies <optional_dependencies>`
# for the details.
#
# You can install SentencePiece by running ``pip install sentencepiece``.
######################################################################
# 1. Overview
......@@ -45,29 +42,9 @@ import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
import IPython
try:
from torchaudio.io import StreamReader
except ModuleNotFoundError:
try:
import google.colab
print(
"""
To enable running this notebook in Google Colab, install the requisite
third party libraries by running the following code block:
!add-apt-repository -y ppa:savoury1/ffmpeg4
!apt-get -qq install -y ffmpeg
"""
)
except ModuleNotFoundError:
pass
raise
import matplotlib.pyplot as plt
from torchaudio.io import StreamReader
######################################################################
# 3. Construct the pipeline
......@@ -195,22 +172,43 @@ state, hypothesis = None, None
stream_iterator = streamer.stream()
def _plot(feats, num_iter, unit=25):
unit_dur = segment_length / sample_rate * unit
num_plots = num_iter // unit + (1 if num_iter % unit else 0)
fig, axes = plt.subplots(num_plots, 1)
t0 = 0
for i, ax in enumerate(axes):
feats_ = feats[i * unit : (i + 1) * unit]
t1 = t0 + segment_length / sample_rate * len(feats_)
feats_ = torch.cat([f[2:-2] for f in feats_]) # remove boundary effect and overlap
ax.imshow(feats_.T, extent=[t0, t1, 0, 1], aspect="auto", origin="lower")
ax.tick_params(which="both", left=False, labelleft=False)
ax.set_xlim(t0, t0 + unit_dur)
t0 = t1
fig.suptitle("MelSpectrogram Feature")
plt.tight_layout()
@torch.inference_mode()
def run_inference(num_iter=200):
def run_inference(num_iter=100):
global state, hypothesis
chunks = []
feats = []
for i, (chunk,) in enumerate(stream_iterator, start=1):
segment = cacher(chunk[:, 0])
features, length = feature_extractor(segment)
hypos, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
hypothesis = hypos[0]
transcript = token_processor(hypothesis[0], lstrip=False)
print(transcript, end="", flush=True)
hypothesis = hypos
transcript = token_processor(hypos[0][0], lstrip=False)
print(transcript, end="\r", flush=True)
chunks.append(chunk)
feats.append(features)
if i == num_iter:
break
# Plot the features
_plot(feats, num_iter)
return IPython.display.Audio(torch.cat(chunks).T.numpy(), rate=bundle.sample_rate)
......@@ -249,6 +247,36 @@ run_inference()
run_inference()
######################################################################
#
run_inference()
######################################################################
#
run_inference()
######################################################################
#
run_inference()
######################################################################
#
run_inference()
######################################################################
#
run_inference()
######################################################################
#
run_inference()
######################################################################
#
# Tag: :obj:`torchaudio.io`
......@@ -160,8 +160,7 @@ for i, feats in enumerate(features):
ax[i].set_title(f"Feature from transformer layer {i+1}")
ax[i].set_xlabel("Feature dimension")
ax[i].set_ylabel("Frame (time-axis)")
plt.tight_layout()
plt.show()
fig.tight_layout()
######################################################################
......@@ -190,7 +189,7 @@ plt.imshow(emission[0].cpu().T, interpolation="nearest")
plt.title("Classification result")
plt.xlabel("Frame (time-axis)")
plt.ylabel("Class")
plt.show()
plt.tight_layout()
print("Class labels:", bundle.get_labels())
......
"""
Torchaudio-Squim: Non-intrusive Speech Assessment in TorchAudio
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
######################################################################
# Author: `Anurag Kumar <anuragkr90@meta.com>`__, `Zhaoheng
# Ni <zni@meta.com>`__
#
######################################################################
# 1. Overview
# ^^^^^^^^^^^
#
######################################################################
# This tutorial shows how to use Torchaudio-Squim to estimate objective and
# subjective metrics for assessing speech quality and intelligibility.
#
# TorchAudio-Squim enables speech assessment in Torchaudio. It provides
# an interface and pre-trained models to estimate various speech quality and
# intelligibility metrics. Currently, Torchaudio-Squim [1] supports
# reference-free estimation of three widely used objective metrics:
#
# - Wideband Perceptual Evaluation of Speech Quality (PESQ) [2]
#
# - Short-Time Objective Intelligibility (STOI) [3]
#
# - Scale-Invariant Signal-to-Distortion Ratio (SI-SDR) [4]
#
# It also supports estimation of subjective Mean Opinion Score (MOS) for a
# given audio waveform using Non-Matching References [1, 5].
#
# **References**
#
# [1] Kumar, Anurag, et al. “TorchAudio-Squim: Reference-less Speech
# Quality and Intelligibility measures in TorchAudio.” ICASSP 2023-2023
# IEEE International Conference on Acoustics, Speech and Signal Processing
# (ICASSP). IEEE, 2023.
#
# [2] I. Rec, “P.862.2: Wideband extension to recommendation P.862 for the
# assessment of wideband telephone networks and speech codecs,”
# International Telecommunication Union, CH–Geneva, 2005.
#
# [3] Taal, C. H., Hendriks, R. C., Heusdens, R., & Jensen, J. (2010,
# March). A short-time objective intelligibility measure for
# time-frequency weighted noisy speech. In 2010 IEEE international
# conference on acoustics, speech and signal processing (pp. 4214-4217).
# IEEE.
#
# [4] Le Roux, Jonathan, et al. “SDR–half-baked or well done?.” ICASSP
# 2019-2019 IEEE International Conference on Acoustics, Speech and Signal
# Processing (ICASSP). IEEE, 2019.
#
# [5] Manocha, Pranay, and Anurag Kumar. “Speech quality assessment
# through MOS using non-matching references.” Interspeech, 2022.
#
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
# 2. Preparation
# ^^^^^^^^^^^^^^
#
# First import the modules and define the helper functions.
#
# We will need torch and torchaudio to use Torchaudio-Squim, Matplotlib to
# plot data, and pystoi and pesq to compute the reference metrics.
#
try:
from pesq import pesq
from pystoi import stoi
from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
except ImportError:
try:
import google.colab # noqa: F401
print(
"""
To enable running this notebook in Google Colab, install nightly
torch and torchaudio builds by adding the following code block to the top
of the notebook before running it:
!pip3 uninstall -y torch torchvision torchaudio
!pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
!pip3 install pesq
!pip3 install pystoi
"""
)
except Exception:
pass
raise
import matplotlib.pyplot as plt
######################################################################
#
#
import torchaudio.functional as F
from IPython.display import Audio
from torchaudio.utils import download_asset
def si_snr(estimate, reference, epsilon=1e-8):
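    # Scale-invariant SNR (in dB): remove the mean from both signals, rescale
    # the reference by the optimal projection of the estimate onto it, then
    # compare the power of the scaled reference against the residual error.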
estimate = estimate - estimate.mean()
reference = reference - reference.mean()
reference_pow = reference.pow(2).mean(axis=1, keepdim=True)
mix_pow = (estimate * reference).mean(axis=1, keepdim=True)
scale = mix_pow / (reference_pow + epsilon)
reference = scale * reference
error = estimate - reference
reference_pow = reference.pow(2)
error_pow = error.pow(2)
reference_pow = reference_pow.mean(axis=1)
error_pow = error_pow.mean(axis=1)
si_snr = 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow)
return si_snr.item()
def plot(waveform, title, sample_rate=16000):
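    # Plot the waveform (top) and its spectrogram (bottom) in a single figure.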
wav_numpy = waveform.numpy()
sample_size = waveform.shape[1]
time_axis = torch.arange(0, sample_size) / sample_rate
figure, axes = plt.subplots(2, 1)
axes[0].plot(time_axis, wav_numpy[0], linewidth=1)
axes[0].grid(True)
axes[1].specgram(wav_numpy[0], Fs=sample_rate)
figure.suptitle(title)
######################################################################
# 3. Load Speech and Noise Sample
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav")
######################################################################
#
#
WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(SAMPLE_SPEECH)
WAVEFORM_NOISE, SAMPLE_RATE_NOISE = torchaudio.load(SAMPLE_NOISE)
WAVEFORM_NOISE = WAVEFORM_NOISE[0:1, :]
######################################################################
# Currently, the Torchaudio-Squim model only supports a 16000 Hz sampling
# rate. Resample the waveforms if necessary.
#
if SAMPLE_RATE_SPEECH != 16000:
WAVEFORM_SPEECH = F.resample(WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH, 16000)
if SAMPLE_RATE_NOISE != 16000:
WAVEFORM_NOISE = F.resample(WAVEFORM_NOISE, SAMPLE_RATE_NOISE, 16000)
######################################################################
# Trim waveforms so that they have the same number of frames.
#
if WAVEFORM_SPEECH.shape[1] < WAVEFORM_NOISE.shape[1]:
WAVEFORM_NOISE = WAVEFORM_NOISE[:, : WAVEFORM_SPEECH.shape[1]]
else:
WAVEFORM_SPEECH = WAVEFORM_SPEECH[:, : WAVEFORM_NOISE.shape[1]]
######################################################################
# Play speech sample
#
Audio(WAVEFORM_SPEECH.numpy()[0], rate=16000)
######################################################################
# Play noise sample
#
Audio(WAVEFORM_NOISE.numpy()[0], rate=16000)
######################################################################
# 4. Create distorted (noisy) speech samples
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
snr_dbs = torch.tensor([20, -5])
WAVEFORM_DISTORTED = F.add_noise(WAVEFORM_SPEECH, WAVEFORM_NOISE, snr_dbs)
######################################################################
# Play distorted speech with 20dB SNR
#
Audio(WAVEFORM_DISTORTED.numpy()[0], rate=16000)
######################################################################
# Play distorted speech with -5dB SNR
#
Audio(WAVEFORM_DISTORTED.numpy()[1], rate=16000)
######################################################################
# 5. Visualize the waveforms
# ^^^^^^^^^^^^^^^^^^^^^^^^^^
#
######################################################################
# Visualize speech sample
#
plot(WAVEFORM_SPEECH, "Clean Speech")
######################################################################
# Visualize noise sample
#
plot(WAVEFORM_NOISE, "Noise")
######################################################################
# Visualize distorted speech with 20dB SNR
#
plot(WAVEFORM_DISTORTED[0:1], f"Distorted Speech with {snr_dbs[0]}dB SNR")
######################################################################
# Visualize distorted speech with -5dB SNR
#
plot(WAVEFORM_DISTORTED[1:2], f"Distorted Speech with {snr_dbs[1]}dB SNR")
######################################################################
# 6. Predict Objective Metrics
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
######################################################################
# Get the pre-trained ``SquimObjective`` model.
#
objective_model = SQUIM_OBJECTIVE.get_model()
######################################################################
# Compare model outputs with ground truths for distorted speech with 20dB
# SNR
#
stoi_hyp, pesq_hyp, si_sdr_hyp = objective_model(WAVEFORM_DISTORTED[0:1, :])
print(f"Estimated metrics for distorted speech at {snr_dbs[0]}dB are\n")
print(f"STOI: {stoi_hyp[0]}")
print(f"PESQ: {pesq_hyp[0]}")
print(f"SI-SDR: {si_sdr_hyp[0]}\n")
pesq_ref = pesq(16000, WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[0].numpy(), mode="wb")
stoi_ref = stoi(WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[0].numpy(), 16000, extended=False)
si_sdr_ref = si_snr(WAVEFORM_DISTORTED[0:1], WAVEFORM_SPEECH)
print(f"Reference metrics for distorted speech at {snr_dbs[0]}dB are\n")
print(f"STOI: {stoi_ref}")
print(f"PESQ: {pesq_ref}")
print(f"SI-SDR: {si_sdr_ref}")
######################################################################
# Compare model outputs with ground truths for distorted speech with -5dB
# SNR
#
stoi_hyp, pesq_hyp, si_sdr_hyp = objective_model(WAVEFORM_DISTORTED[1:2, :])
print(f"Estimated metrics for distorted speech at {snr_dbs[1]}dB are\n")
print(f"STOI: {stoi_hyp[0]}")
print(f"PESQ: {pesq_hyp[0]}")
print(f"SI-SDR: {si_sdr_hyp[0]}\n")
pesq_ref = pesq(16000, WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[1].numpy(), mode="wb")
stoi_ref = stoi(WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[1].numpy(), 16000, extended=False)
si_sdr_ref = si_snr(WAVEFORM_DISTORTED[1:2], WAVEFORM_SPEECH)
print(f"Reference metrics for distorted speech at {snr_dbs[1]}dB are\n")
print(f"STOI: {stoi_ref}")
print(f"PESQ: {pesq_ref}")
print(f"SI-SDR: {si_sdr_ref}")
######################################################################
# 7. Predict Mean Opinion Scores (Subjective) Metric
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
######################################################################
# Get the pre-trained ``SquimSubjective`` model.
#
subjective_model = SQUIM_SUBJECTIVE.get_model()
######################################################################
# Load a non-matching reference (NMR)
#
NMR_SPEECH = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
WAVEFORM_NMR, SAMPLE_RATE_NMR = torchaudio.load(NMR_SPEECH)
if SAMPLE_RATE_NMR != 16000:
WAVEFORM_NMR = F.resample(WAVEFORM_NMR, SAMPLE_RATE_NMR, 16000)
######################################################################
# Compute MOS metric for distorted speech with 20dB SNR
#
mos = subjective_model(WAVEFORM_DISTORTED[0:1, :], WAVEFORM_NMR)
print(f"Estimated MOS for distorted speech at {snr_dbs[0]}dB is MOS: {mos[0]}")
######################################################################
# Compute MOS metric for distorted speech with -5dB SNR
#
mos = subjective_model(WAVEFORM_DISTORTED[1:2, :], WAVEFORM_NMR)
print(f"Estimated MOS for distorted speech at {snr_dbs[1]}dB is MOS: {mos[0]}")
######################################################################
# 8. Comparison with ground truths and baselines
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Visualizing the metrics estimated by the ``SquimObjective`` and
# ``SquimSubjective`` models can help users better understand how
# applicable the models are in real scenarios. The graph below shows
# scatter plots for three different systems: MOSA-Net [1], AMSA [2], and
# the ``SquimObjective`` model, where the y-axis represents the estimated
# STOI, PESQ, and SI-SDR scores, and the x-axis represents the
# corresponding ground truth.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/objective_plot.png
# :width: 500px
# :align: center
#
# [1] Zezario, Ryandhimas E., Szu-Wei Fu, Fei Chen, Chiou-Shann Fuh,
# Hsin-Min Wang, and Yu Tsao. “Deep learning-based non-intrusive
# multi-objective speech assessment model with cross-domain features.”
# IEEE/ACM Transactions on Audio, Speech, and Language Processing 31
# (2022): 54-70.
#
# [2] Dong, Xuan, and Donald S. Williamson. “An attention enhanced
# multi-task model for objective speech assessment in real-world
# environments.” In ICASSP 2020-2020 IEEE International Conference on
# Acoustics, Speech and Signal Processing (ICASSP), pp. 911-915. IEEE,
# 2020.
#
######################################################################
# The graph below shows a scatter plot for the ``SquimSubjective`` model,
# where the y-axis represents the estimated MOS score, and the x-axis
# represents the corresponding ground truth.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/subjective_plot.png
# :width: 500px
# :align: center
#
......@@ -20,35 +20,15 @@ import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
try:
from torchaudio.io import StreamReader
except ModuleNotFoundError:
try:
import google.colab
print(
"""
To enable running this notebook in Google Colab, install the requisite
third party libraries by running the following code:
!add-apt-repository -y ppa:savoury1/ffmpeg4
!apt-get -qq install -y ffmpeg
"""
)
except ModuleNotFoundError:
pass
raise
import IPython
import matplotlib.pyplot as plt
from torchaudio.io import StreamReader
base_url = "https://download.pytorch.org/torchaudio/tutorial-assets"
AUDIO_URL = f"{base_url}/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
VIDEO_URL = f"{base_url}/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4.mp4"
######################################################################
# Audio / Video device input
# --------------------------
......@@ -122,6 +102,9 @@ VIDEO_URL = f"{base_url}/stream-api/NASAs_Most_Scientifically_Complex_Space_Obse
#
######################################################################
#
# .. _lavfi:
#
# Synthetic source streams
# ------------------------
#
......@@ -372,13 +355,14 @@ chunks = next(streamer.stream())
def _display(i):
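    # Print the filter graph applied to output stream ``i`` and visualize the
    # decoded chunk as a waveform (top) and a spectrogram (bottom).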
print("filter_desc:", streamer.get_out_stream_info(i).filter_description)
_, axs = plt.subplots(2, 1)
fig, axs = plt.subplots(2, 1)
waveform = chunks[i][:, 0]
axs[0].plot(waveform)
axs[0].grid(True)
axs[0].set_ylim([-1, 1])
plt.setp(axs[0].get_xticklabels(), visible=False)
axs[1].specgram(waveform, Fs=sample_rate)
fig.tight_layout()
return IPython.display.Audio(chunks[i].T, rate=sample_rate)
......@@ -457,7 +441,6 @@ def _display(i):
axs[j].imshow(chunk[10 * j + 1].permute(1, 2, 0))
axs[j].set_axis_off()
plt.tight_layout()
plt.show(block=False)
######################################################################
......
......@@ -14,12 +14,9 @@ libavfilter provides.
#
# .. note::
#
# This tutorial requires FFmpeg libraries (>=4.1, <4.4).
#
# There are multiple ways to install FFmpeg libraries.
# If you are using Anaconda Python distribution,
# ``conda install -c anaconda 'ffmpeg<4.4'`` will install
# the required libraries.
# This tutorial requires FFmpeg libraries.
# Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
# the detail.
#
######################################################################
......@@ -65,29 +62,8 @@ import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
try:
from torchaudio.io import StreamReader
except ModuleNotFoundError:
try:
import google.colab
print(
"""
To enable running this notebook in Google Colab, install the requisite
third party libraries by running the following code:
!add-apt-repository -y ppa:savoury1/ffmpeg4
!apt-get -qq install -y ffmpeg
"""
)
except ModuleNotFoundError:
pass
raise
import matplotlib.pyplot as plt
from torchaudio.io import StreamReader
base_url = "https://download.pytorch.org/torchaudio/tutorial-assets"
AUDIO_URL = f"{base_url}/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
......@@ -613,7 +589,6 @@ for i, vid in enumerate(vids2):
if i == 0 and j == 0:
ax.set_ylabel("Stream 2")
plt.tight_layout()
plt.show(block=False)
######################################################################
#
......
......@@ -23,17 +23,9 @@ play audio and video.
#
# .. note::
#
# This tutorial requires torchaudio nightly build and FFmpeg libraries (>=4.1, <4.4).
#
# To install torchaudio nightly build, please refer to
# https://pytorch.org/get-started/locally/ .
#
#
# There are multiple ways to install FFmpeg libraries.
# If you are using Anaconda Python distribution,
# ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries,
# however, this distribution does not have SDL plugin, so it cannot play
# video.
# This tutorial requires FFmpeg libraries.
# Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
# the detail.
#
######################################################################
......@@ -74,7 +66,9 @@ from torchaudio.io import StreamWriter
from torchaudio.utils import download_asset
AUDIO_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
VIDEO_PATH = download_asset("tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4")
VIDEO_PATH = download_asset(
"tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
)
######################################################################
#
......@@ -140,7 +134,7 @@ s.add_audio_stream(sample_rate, num_channels, format="s16")
# Write audio to the device
with s.open():
for i in range(0, num_frames, 256):
s.write_audio_chunk(0, waveform[i:i+256])
s.write_audio_chunk(0, waveform[i : i + 256])
######################################################################
#
......@@ -186,8 +180,12 @@ width, height = 640, 360
# a background thread and give chunks
running = True
def video_streamer(path, frames_per_chunk):
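    # Decode the video on a background thread and hand the chunks to the main
    # thread through a queue, so that decoding does not block playback.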
import queue, threading
import queue
import threading
from torchaudio.io import StreamReader
q = queue.Queue()
......@@ -196,9 +194,9 @@ def video_streamer(path, frames_per_chunk):
def _streamer():
streamer = StreamReader(path)
streamer.add_basic_video_stream(
frames_per_chunk, format="rgb24",
frame_rate=frame_rate, width=width, height=height)
for (chunk_, ) in streamer.stream():
frames_per_chunk, format="rgb24", frame_rate=frame_rate, width=width, height=height
)
for (chunk_,) in streamer.stream():
q.put(chunk_)
if not running:
break
......@@ -246,7 +244,7 @@ with s.open():
# <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-sdl-demo.mp4">
# </video>
#
# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/sdl.py>`_]
# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/sdl.py>`__]
#
######################################################################
......@@ -292,7 +290,7 @@ with s.open():
# <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-rtmp-demo.mp4">
# </video>
#
# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/rtmp.py>`_]
# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/rtmp.py>`__]
#
......@@ -324,7 +322,7 @@ with s.open():
# <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-udp-demo.mp4">
# </video>
#
# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/udp.py>`_]
# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/udp.py>`__]
#
######################################################################
......
......@@ -13,14 +13,9 @@ encode and save audio/video data into various formats/destinations.
#
# .. note::
#
# This tutorial requires torchaudio nightly build and FFmpeg libraries (>=4.1, <4.4).
#
# To install torchaudio nightly build, please refer to
# https://pytorch.org/get-started/locally/ .
#
# There are multiple ways to install FFmpeg libraries.
# If you are using Anaconda Python distribution,
# ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries.
# This tutorial requires FFmpeg libraries.
# Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
# the detail.
#
######################################################################
......@@ -51,27 +46,7 @@ import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
try:
from torchaudio.io import StreamWriter
except ImportError:
try:
import google.colab
print(
"""
To enable running this notebook in Google Colab, install nightly
torch and torchaudio builds by adding the following code block to the top
of the notebook before running it:
!pip3 uninstall -y torch torchvision torchaudio
!pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
"""
)
except ModuleNotFoundError:
pass
raise
from torchaudio.io import StreamWriter
print("FFmpeg library versions")
for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items():
......@@ -84,9 +59,10 @@ import io
import os
import tempfile
from torchaudio.utils import download_asset
from IPython.display import Audio, Video
from torchaudio.utils import download_asset
SAMPLE_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_PATH, channels_first=False)
NUM_FRAMES, NUM_CHANNELS = WAVEFORM.shape
......@@ -503,47 +479,7 @@ print(f"{bytes2[:10]}...{bytes2[-10:]}\n")
assert bytes1 == bytes2
######################################################################
# Note on slicing and AAC
# ~~~~~~~~~~~~~~~~~~~~~~~
#
# .. warning::
#
# FFmpeg's native AAC encoder (which is used by default when
# saving video with MP4 format) has a bug that affects the audibility.
#
#    Please refer to the examples below.
#
def test_slice(audio_encoder, slice_size, ext="mp4"):
path = get_path(f"slice_{slice_size}.{ext}")
s = StreamWriter(dst=path)
s.add_audio_stream(SAMPLE_RATE, NUM_CHANNELS, encoder=audio_encoder)
with s.open():
for start in range(0, NUM_FRAMES, slice_size):
end = start + slice_size
s.write_audio_chunk(0, WAVEFORM[start:end, ...])
return path
######################################################################
#
# This causes some artifacts.
# Note:
# Chrome does not support playing AAC audio directly, while Safari does.
# Using the MP4 container and specifying AAC allows Chrome to play it.
Video(test_slice(audio_encoder="aac", slice_size=8000, ext="mp4"), embed=True)
######################################################################
#
# It is more noticeable when using smaller slice.
Video(test_slice(audio_encoder="aac", slice_size=512, ext="mp4"), embed=True)
######################################################################
#
# Lame MP3 encoder works fine for the same slice size.
Audio(test_slice(audio_encoder="libmp3lame", slice_size=512, ext="mp3"))
import matplotlib.pyplot as plt
######################################################################
#
......@@ -559,7 +495,6 @@ Audio(test_slice(audio_encoder="libmp3lame", slice_size=512, ext="mp3"))
# then use StreamWriter to convert them to video with the original audio.
import torchaudio.transforms as T
import matplotlib.pyplot as plt
######################################################################
#
......@@ -590,7 +525,7 @@ specs = trans(WAVEFORM.T)[0].T
#
spec_db = T.AmplitudeToDB(stype="magnitude", top_db=80)(specs.T)
_ = plt.imshow(spec_db, aspect="auto", origin='lower')
_ = plt.imshow(spec_db, aspect="auto", origin="lower")
######################################################################
#
......@@ -611,21 +546,27 @@ ncols, nrows = fig.canvas.get_width_height()
def _plot(data):
ax.clear()
x = list(range(len(data)))
R, G, B = 238/255, 76/255, 44/255
R, G, B = 238 / 255, 76 / 255, 44 / 255
for coeff, alpha in [(0.8, 0.7), (1, 1)]:
d = data ** coeff
d = data**coeff
ax.fill_between(x, d, -d, color=[R, G, B, alpha])
xlim = n_fft // 2 + 1
ax.set_xlim([-1, n_fft // 2 + 1])
ax.set_ylim([-1, 1])
ax.text(
xlim, 0.95,
xlim,
0.95,
f"Created with TorchAudio\n{torchaudio.__version__}",
color="white", ha="right", va="top", backgroundcolor="black")
color="white",
ha="right",
va="top",
backgroundcolor="black",
)
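    # Rasterize the updated figure and convert its RGB byte buffer into a
    # (C, H, W) uint8 tensor, the frame layout expected by StreamWriter.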
fig.canvas.draw()
frame = torch.frombuffer(fig.canvas.tostring_rgb(), dtype=torch.uint8)
return frame.reshape(nrows, ncols, 3).permute(2, 0, 1)
# sphinx_gallery_defer_figures
######################################################################
......@@ -646,10 +587,10 @@ with s.open():
# Process by second
for t in range(0, NUM_FRAMES, SAMPLE_RATE):
# Write audio chunk
s.write_audio_chunk(0, WAVEFORM[t:t + SAMPLE_RATE, :])
s.write_audio_chunk(0, WAVEFORM[t : t + SAMPLE_RATE, :])
# write 1 second of video chunk
frames = [_plot(spec) for spec in specs[i:i+frame_rate]]
frames = [_plot(spec) for spec in specs[i : i + frame_rate]]
if frames:
s.write_video_chunk(1, torch.stack(frames))
i += frame_rate
......
......@@ -7,10 +7,6 @@ Text-to-Speech with Tacotron2
"""
import IPython
import matplotlib
import matplotlib.pyplot as plt
######################################################################
# Overview
# --------
......@@ -65,8 +61,6 @@ import matplotlib.pyplot as plt
import torch
import torchaudio
matplotlib.rcParams["figure.figsize"] = [16.0, 4.8]
torch.random.manual_seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"
......@@ -75,6 +69,13 @@ print(torchaudio.__version__)
print(device)
######################################################################
#
import IPython
import matplotlib.pyplot as plt
######################################################################
# Text Processing
# ---------------
......@@ -218,7 +219,7 @@ with torch.inference_mode():
spec, _, _ = tacotron2.infer(processed, lengths)
_ = plt.imshow(spec[0].cpu().detach())
_ = plt.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
######################################################################
......@@ -226,13 +227,17 @@ _ = plt.imshow(spec[0].cpu().detach())
# therefore, the process of generating the spectrogram incurs randomness.
#
fig, ax = plt.subplots(3, 1, figsize=(16, 4.3 * 3))
for i in range(3):
with torch.inference_mode():
spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
print(spec[0].shape)
ax[i].imshow(spec[0].cpu().detach())
plt.show()
def plot():
fig, ax = plt.subplots(3, 1)
for i in range(3):
with torch.inference_mode():
spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
print(spec[0].shape)
ax[i].imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
plot()
######################################################################
......@@ -270,11 +275,22 @@ with torch.inference_mode():
spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
waveforms, lengths = vocoder(spec, spec_lengths)
fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach())
######################################################################
#
def plot(waveforms, spec, sample_rate):
waveforms = waveforms.cpu().detach()
IPython.display.Audio(waveforms[0:1].cpu(), rate=vocoder.sample_rate)
fig, [ax1, ax2] = plt.subplots(2, 1)
ax1.plot(waveforms[0])
ax1.set_xlim(0, waveforms.size(-1))
ax1.grid(True)
ax2.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
return IPython.display.Audio(waveforms[0:1], rate=sample_rate)
plot(waveforms, spec, vocoder.sample_rate)
######################################################################
......@@ -300,11 +316,10 @@ with torch.inference_mode():
spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
waveforms, lengths = vocoder(spec, spec_lengths)
fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach())
######################################################################
#
IPython.display.Audio(waveforms[0:1].cpu(), rate=vocoder.sample_rate)
plot(waveforms, spec, vocoder.sample_rate)
######################################################################
......@@ -339,8 +354,7 @@ waveglow.eval()
with torch.no_grad():
waveforms = waveglow.infer(spec)
fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach())
######################################################################
#
IPython.display.Audio(waveforms[0:1].cpu(), rate=22050)
plot(waveforms, spec, 22050)
......@@ -14,11 +14,8 @@ requirements:
host:
- python
- setuptools
- pkg-config # [not win]
- cmake
- ninja
- numpy>=1.11 # [py <= 39]
- numpy>=1.21.2 # [py >= 310]
- pytorch-mutex 1.0 {{ build_variant }} # [not osx ]
{{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT', 'pytorch') }}
{{ environ.get('CONDA_EXTRA_BUILD_CONSTRAINT', '') }}
......@@ -26,8 +23,7 @@ requirements:
run:
- python
- numpy>=1.11 # [py <= 39]
- numpy>=1.21.2 # [py >= 310]
- numpy
- pytorch-mutex 1.0 {{ build_variant }} # [not osx ]
{{ environ.get('CONDA_PYTORCH_CONSTRAINT', 'pytorch') }}
{{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }}
......@@ -49,7 +45,6 @@ build:
- TORCH_CUDA_ARCH_LIST
- USE_FFMPEG
- USE_OPENMP
- FFMPEG_ROOT
- MACOSX_DEPLOYMENT_TARGET
test:
......
blas_impl:
- mkl # [x86_64]
c_compiler:
- vs2019 # [win]
cxx_compiler:
- vs2019 # [win]
python:
- 3.7
- 3.8
# This differs from target_platform in that it determines what subdir the compiler
# will target, not what subdir the compiler package will be itself.
# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
......
......@@ -23,25 +23,25 @@ set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR%
set CUDNN_FOLDER="cuda"
set CUDNN_LIB_FOLDER="lib\x64"
if %CUDA_VER% EQU 116 goto cuda116
if %CUDA_VER% EQU 117 goto cuda117
if %CUDA_VER% EQU 118 goto cuda118
if %CUDA_VER% EQU 121 goto cuda121
echo CUDA %CUDA_VERSION_STR% is not supported
exit /b 1
:cuda116
:cuda118
set CUDA_INSTALL_EXE=cuda_11.6.0_511.23_windows.exe
set CUDA_INSTALL_EXE=cuda_11.8.0_522.06_windows.exe
if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
set "ARGS=thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 nvml_dev_11.6"
set "ARGS=cuda_profiler_api_11.8 thrust_11.8 nvcc_11.8 cuobjdump_11.8 nvprune_11.8 nvprof_11.8 cupti_11.8 cublas_11.8 cublas_dev_11.8 cudart_11.8 cufft_11.8 cufft_dev_11.8 curand_11.8 curand_dev_11.8 cusolver_11.8 cusolver_dev_11.8 cusparse_11.8 cusparse_dev_11.8 npp_11.8 npp_dev_11.8 nvrtc_11.8 nvrtc_dev_11.8 nvml_dev_11.8"
)
set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip
set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive
set CUDNN_FOLDER=cudnn-windows-x86_64-8.7.0.84_cuda11-archive
set CUDNN_LIB_FOLDER="lib"
set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
if errorlevel 1 exit /b 1
......@@ -55,23 +55,23 @@ if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
goto cuda_common
:cuda117
:cuda121
set CUDA_INSTALL_EXE=cuda_11.7.0_516.01_windows.exe
set CUDA_INSTALL_EXE=cuda_12.1.1_531.14_windows.exe
if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
set "ARGS=thrust_11.7 nvcc_11.7 cuobjdump_11.7 nvprune_11.7 nvprof_11.7 cupti_11.7 cublas_11.7 cublas_dev_11.7 cudart_11.7 cufft_11.7 cufft_dev_11.7 curand_11.7 curand_dev_11.7 cusolver_11.7 cusolver_dev_11.7 cusparse_11.7 cusparse_dev_11.7 npp_11.7 npp_dev_11.7 nvrtc_11.7 nvrtc_dev_11.7 nvml_dev_11.7"
set "ARGS=cuda_profiler_api_12.1 thrust_12.1 nvcc_12.1 cuobjdump_12.1 nvprune_12.1 nvprof_12.1 cupti_12.1 cublas_12.1 cublas_dev_12.1 cudart_12.1 cufft_12.1 cufft_dev_12.1 curand_12.1 curand_dev_12.1 cusolver_12.1 cusolver_dev_12.1 cusparse_12.1 cusparse_dev_12.1 npp_12.1 npp_dev_12.1 nvrtc_12.1 nvrtc_dev_12.1 nvml_dev_12.1"
)
set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip
set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive
set CUDNN_FOLDER=cudnn-windows-x86_64-8.8.1.3_cuda12-archive
set CUDNN_LIB_FOLDER="lib"
set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
if errorlevel 1 exit /b 1
set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%
rem Make sure windows path contains zlib dll
curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
......
......@@ -5,6 +5,9 @@ first_party_detection = false
[tool.black]
line-length = 120
target-version = ["py37"]
target-version = ["py38"]
[tool.ufmt]
excludes = [
"examples/tutorials/",
]
......@@ -4,7 +4,6 @@ import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
import torch
......@@ -22,13 +21,14 @@ def _run_cmd(cmd, shell=False):
return None
def _get_version(sha):
with open(ROOT_DIR / "version.txt", "r") as f:
version = f.read().strip()
if os.getenv("BUILD_VERSION"):
version = os.getenv("BUILD_VERSION")
elif sha is not None:
version += "+" + sha[:7]
version += "+das" + "." + "opt1"
return version
......@@ -36,12 +36,14 @@ def _make_version_file(version, sha):
sha = "Unknown" if sha is None else sha
abi = _run_cmd(["echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI | awk '{print $3}'"], shell=True)
dtk = _run_cmd(["cat", os.path.join(ROCM_HOME, '.info/rocm_version')])
dtk = ''.join(dtk.split('.')[:2])
dtk = ''.join(dtk.split('.')[:2]) + "2"
torch_version = torch.__version__
dcu_version = f"{version}.abi{abi}.dtk{dtk}.torch{torch_version}"
dcu_version = f"{version}.dtk{dtk}"
version_path = ROOT_DIR / "torchaudio" / "version.py"
version_write = version[:-9]
with open(version_path, "w") as f:
f.write(f"__version__ = '{version}'\n")
f.write(f"__version__ = '{version_write}'\n")
f.write(f"git_version = '{sha}'\n")
f.write(f"abi = 'abi{abi}'\n")
f.write(f"dtk = '{dtk}'\n")
......@@ -50,7 +52,6 @@ def _make_version_file(version, sha):
return dcu_version
def _get_pytorch_version():
if "PYTORCH_VERSION" in os.environ:
return f"torch=={os.environ['PYTORCH_VERSION']}"
......@@ -95,18 +96,6 @@ def _get_packages(branch_name, tag):
return find_packages(exclude=exclude)
def _init_submodule():
print(" --- Initializing submodules")
try:
subprocess.check_call(["git", "submodule", "init"])
subprocess.check_call(["git", "submodule", "update"])
except Exception:
print(" --- Submodule initalization failed")
print("Please run:\n\tgit submodule update --init --recursive")
sys.exit(1)
print(" --- Initialized submodule")
def _parse_url(path):
with open(path, "r") as file_:
for line in file_:
......@@ -116,18 +105,6 @@ def _parse_url(path):
yield url
def _parse_sources():
third_party_dir = ROOT_DIR / "third_party"
libs = ["zlib", "bzip2", "lzma", "sox"]
archive_dir = third_party_dir / "archives"
archive_dir.mkdir(exist_ok=True)
for lib in libs:
cmake_file = third_party_dir / lib / "CMakeLists.txt"
for url in _parse_url(cmake_file):
path = archive_dir / os.path.basename(url)
yield path, url
def _fetch_archives(src):
for dest, url in src:
if not dest.exists():
......@@ -135,12 +112,6 @@ def _fetch_archives(src):
torch.hub.download_url_to_file(url, dest, progress=False)
def _fetch_third_party_libraries():
_init_submodule()
if os.name != "nt":
_fetch_archives(_parse_sources())
def _main():
sha = _run_cmd(["git", "rev-parse", "HEAD"])
branch = _run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])
......@@ -154,14 +125,21 @@ def _main():
print("-- Building version", version)
dcu_version = _make_version_file(version, sha)
_fetch_third_party_libraries()
with open("README.md") as f:
long_description = f.read()
setup(
name="torchaudio",
version=dcu_version,
description="An audio package for PyTorch",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/pytorch/audio",
author="Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang",
author=(
"Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, "
"Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang"
),
author_email="soumith@pytorch.org",
maintainer="Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang",
maintainer_email="moto@meta.com",
......@@ -174,9 +152,10 @@ def _main():
"Operating System :: Microsoft :: Windows",
"Operating System :: POSIX",
"Programming Language :: C++",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: Implementation :: CPython",
"Topic :: Multimedia :: Sound/Audio",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
......
......@@ -102,7 +102,7 @@ def pytest_addoption(parser):
@pytest.fixture(autouse=True)
def temp_hub_dir(tmp_path, pytestconfig):
if not pytestconfig.getoption("use_tmp_hub_dir"):
if not pytestconfig.getoption("use_tmp_hub_dir", default=False):
yield
else:
org_dir = torch.hub.get_dir()
......
import math
import torch
import torchaudio
from torchaudio.prototype.functional import oscillator_bank
from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH
def test_hifi_gan_pretrained_weights():
"""Test that a waveform reconstructed from mel spectrogram by HiFiGAN bundle is close enough to the original.
The main transformations performed in this test can be represented as
- audio -> reference log mel spectrogram
- audio -> mel spectrogram -> audio -> estimated log mel spectrogram
    In the end, we compare the estimated log mel spectrogram to the reference one. See comments in code for details.
"""
bundle = HIFIGAN_VOCODER_V3_LJSPEECH
# Get HiFiGAN-compatible transformation from waveform to mel spectrogram
mel_transform = bundle.get_mel_transform()
# Get HiFiGAN vocoder
vocoder = bundle.get_vocoder()
# Create a synthetic waveform
ref_waveform = get_sin_sweep(sample_rate=bundle.sample_rate, length=100000)
ref_waveform = ref_waveform[:, : -(ref_waveform.shape[1] % mel_transform.hop_size)]
# Generate mel spectrogram from waveform
mel_spectrogram = mel_transform(ref_waveform)
with torch.no_grad():
# Generate waveform from mel spectrogram
estimated_waveform = vocoder(mel_spectrogram).squeeze(0)
# Measure the reconstruction error.
# Even though the reconstructed audio is perceptually very close to the original, it doesn't score well on
# metrics like Si-SNR. It might be that HiFiGAN introduces non-uniform shifts to the reconstructed waveforms.
    # So to evaluate the reconstruction error, we compute mel spectrograms of the reference and reconstructed waveforms,
# and compare relative mean squared error of their logarithms.
final_spec = torchaudio.transforms.MelSpectrogram(sample_rate=bundle.sample_rate, normalized=True)
# Log mel spectrogram of the estimated waveform
    estimated_spectrogram = final_spec(estimated_waveform)
    estimated_spectrogram = torch.log(torch.clamp(estimated_spectrogram, min=1e-5))
# Log mel spectrogram of the reference waveform
ref_spectrogram = final_spec(ref_waveform)
ref_spectrogram = torch.log(torch.clamp(ref_spectrogram, min=1e-5))
# Check that relative MSE is below 4%
    mse = ((estimated_spectrogram - ref_spectrogram) ** 2).mean()
mean_ref = ((ref_spectrogram) ** 2).mean()
print(mse / mean_ref)
assert mse / mean_ref < 0.04
def get_sin_sweep(sample_rate, length):
"""Create a waveform which changes frequency from 0 to the Nyquist frequency (half of the sample rate)"""
nyquist_freq = sample_rate / 2
freq = torch.logspace(0, math.log(0.99 * nyquist_freq, 10), length).unsqueeze(-1)
amp = torch.ones((length, 1))
waveform = oscillator_bank(freq, amp, sample_rate=sample_rate)
return waveform.unsqueeze(0)