OpenDAS / Torchaudio / Commits

Commit ffeba11a, authored Sep 02, 2024 by mayp777

    UPDATE

Parent: 29deb085
Changes: 337 files. Showing 20 changed files with 1986 additions and 435 deletions (+1986, -435).
examples/tutorials/forced_alignment_tutorial.py              +135  -99
examples/tutorials/hybrid_demucs_tutorial.py                 +27   -50
examples/tutorials/mvdr_tutorial.py                          +8    -10
examples/tutorials/nvdec_tutorial.py                         +791  -0
examples/tutorials/nvenc_tutorial.py                         +388  -0
examples/tutorials/online_asr_tutorial.py                    +60   -32
examples/tutorials/speech_recognition_pipeline_tutorial.py   +2    -3
examples/tutorials/squim_tutorial.py                         +390  -0
examples/tutorials/streamreader_advanced_tutorial.py         +7    -24
examples/tutorials/streamreader_basic_tutorial.py            +4    -29
examples/tutorials/streamwriter_advanced.py                  +18   -20
examples/tutorials/streamwriter_basic_tutorial.py            +20   -79
examples/tutorials/tacotron2_pipeline_tutorial.py            +40   -26
packaging/torchaudio/meta.yaml                               +1    -6
packaging/vs2019/conda_build_config.yaml                     +1    -3
packaging/windows/internal/cuda_install.bat                  +13   -13
pyproject.toml                                               +4    -1
setup.py                                                     +18   -39
test/integration_tests/conftest.py                           +1    -1
test/integration_tests/prototype/hifi_gan_pipeline_test.py   +58   -0
examples/tutorials/forced_alignment_tutorial.py
...
@@ -9,6 +9,25 @@ This tutorial shows how to align transcript to speech with
 `CTC-Segmentation of Large Corpora for German End-to-end Speech
 Recognition <https://arxiv.org/abs/2007.09127>`__.
+
+.. note::
+
+   This tutorial was originally written to illustrate a use case
+   for the Wav2Vec2 pretrained model.
+
+   TorchAudio now has a set of APIs designed for forced alignment.
+   The `CTC forced alignment API tutorial
+   <./ctc_forced_alignment_api_tutorial.html>`__ illustrates the
+   usage of :py:func:`torchaudio.functional.forced_align`, which is
+   the core API.
+
+   If you are looking to align your corpus, we recommend using
+   :py:class:`torchaudio.pipelines.Wav2Vec2FABundle`, which combines
+   :py:func:`~torchaudio.functional.forced_align` and other support
+   functions with a pre-trained model specifically trained for
+   forced alignment. Please refer to the
+   `Forced alignment for multilingual data
+   <forced_alignment_for_multilingual_data_tutorial.html>`__ tutorial,
+   which illustrates its usage.
 """

 import torch
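As context for the note above, the core API it recommends can be exercised in a few lines. The following is a minimal sketch and not part of this commit; the emission is a random placeholder for real acoustic-model output, and the exact signature of forced_align should be checked against the installed torchaudio version.

    import torch
    import torchaudio.functional as F

    # Placeholder emission: (batch=1, num_frames, num_labels) log-probabilities.
    # In practice this comes from an acoustic model such as Wav2Vec2.
    emission = torch.randn(1, 169, 29).log_softmax(dim=-1)
    # Placeholder token ids of the transcript; blank is assumed to be index 0.
    targets = torch.tensor([[7, 3, 12, 5]], dtype=torch.int32)

    # Returns the frame-level token path and per-frame scores.
    aligned_tokens, scores = F.forced_align(emission, targets, blank=0)
    print(aligned_tokens.shape)  # one token id per frame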
...
@@ -45,16 +64,11 @@ print(device)
 # First we import the necessary packages, and fetch data that we work on.
 #

-# %matplotlib inline
 from dataclasses import dataclass

 import IPython
-import matplotlib
 import matplotlib.pyplot as plt

-matplotlib.rcParams["figure.figsize"] = [16.0, 4.8]
-
 torch.random.manual_seed(0)

 SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
...
@@ -64,7 +78,7 @@ SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-
 # Generate frame-wise label probability
 # -------------------------------------
 #
-# The first step is to generate the label class porbability of each aduio
+# The first step is to generate the label class porbability of each audio
 # frame. We can use a Wav2Vec2 model that is trained for ASR. Here we use
 # :py:func:`torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H`.
 #
...
@@ -88,17 +102,24 @@ with torch.inference_mode():
 emission = emissions[0].cpu().detach()

+print(labels)
+
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~

-print(labels)
-plt.imshow(emission.T)
-plt.colorbar()
-plt.title("Frame-wise class probability")
-plt.xlabel("Time")
-plt.ylabel("Labels")
-plt.show()
+
+def plot():
+    fig, ax = plt.subplots()
+    img = ax.imshow(emission.T)
+    ax.set_title("Frame-wise class probability")
+    ax.set_xlabel("Time")
+    ax.set_ylabel("Labels")
+    fig.colorbar(img, ax=ax, shrink=0.6, location="bottom")
+    fig.tight_layout()
+
+
+plot()

 ######################################################################
 # Generate alignment probability (trellis)
...
@@ -138,7 +159,9 @@ plt.show()
 # [`distill.pub <https://distill.pub/2017/ctc/>`__])
 #

-transcript = "I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT"
+# We enclose the transcript with space tokens, which represent SOS and EOS.
+transcript = "|I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|"
 dictionary = {c: i for i, c in enumerate(labels)}

 tokens = [dictionary[c] for c in transcript]
...
@@ -149,21 +172,17 @@ def get_trellis(emission, tokens, blank_id=0):
     num_frame = emission.size(0)
     num_tokens = len(tokens)

-    # Trellis has extra diemsions for both time axis and tokens.
-    # The extra dim for tokens represents <SoS> (start-of-sentence)
-    # The extra dim for time axis is for simplification of the code.
-    trellis = torch.empty((num_frame + 1, num_tokens + 1))
-    trellis[0, 0] = 0
-    trellis[1:, 0] = torch.cumsum(emission[:, 0], 0)
-    trellis[0, -num_tokens:] = -float("inf")
-    trellis[-num_tokens:, 0] = float("inf")
+    trellis = torch.zeros((num_frame, num_tokens))
+    trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
+    trellis[0, 1:] = -float("inf")
+    trellis[-num_tokens + 1:, 0] = float("inf")

-    for t in range(num_frame):
+    for t in range(num_frame - 1):
         trellis[t + 1, 1:] = torch.maximum(
             # Score for staying at the same token
             trellis[t, 1:] + emission[t, blank_id],
             # Score for changing to the next token
-            trellis[t, :-1] + emission[t, tokens],
+            trellis[t, :-1] + emission[t, tokens[1:]],
         )
     return trellis
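The update inside the loop (in both versions) is an elementwise Viterbi step: each cell keeps the better of "stay on the current token while emitting blank" and "advance from the previous token". A toy numeric check with made-up scores, independent of the tutorial data:

    import torch

    # Scores after frame t for tokens 0..2, and emissions at frame t.
    prev = torch.tensor([0.0, -1.0, -5.0])
    blank_score = torch.tensor(-0.1)            # emission[t, blank_id]
    token_scores = torch.tensor([-0.5, -0.2])   # emission[t, tokens[1:]]

    stay = prev[1:] + blank_score      # keep the same token, emit blank
    change = prev[:-1] + token_scores  # move from the previous token to this one
    print(torch.maximum(stay, change))  # tensor([-0.5000, -1.2000])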
...
@@ -172,11 +191,19 @@ trellis = get_trellis(emission, tokens)
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~

-plt.imshow(trellis[1:, 1:].T, origin="lower")
-plt.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5))
-plt.colorbar()
-plt.show()
+
+def plot():
+    fig, ax = plt.subplots()
+    img = ax.imshow(trellis.T, origin="lower")
+    ax.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5))
+    ax.annotate("+ Inf", (trellis.size(0) - trellis.size(1) / 5, trellis.size(1) / 3))
+    fig.colorbar(img, ax=ax, shrink=0.6, location="bottom")
+    fig.tight_layout()
+
+
+plot()

 ######################################################################
 # In the above visualization, we can see that there is a trace of high
@@ -214,38 +241,38 @@ class Point:
...
@@ -214,38 +241,38 @@ class Point:
def
backtrack
(
trellis
,
emission
,
tokens
,
blank_id
=
0
):
def
backtrack
(
trellis
,
emission
,
tokens
,
blank_id
=
0
):
# Note:
t
,
j
=
trellis
.
size
(
0
)
-
1
,
trellis
.
size
(
1
)
-
1
# j and t are indices for trellis, which has extra dimensions
# for time and tokens at the beginning.
path
=
[
Point
(
j
,
t
,
emission
[
t
,
blank_id
].
exp
().
item
())]
# When referring to time frame index `T` in trellis,
while
j
>
0
:
# the corresponding index in emission is `T-1`.
# Should not happen but just in case
# Similarly, when referring to token index `J` in trellis,
assert
t
>
0
# the corresponding index in transcript is `J-1`.
j
=
trellis
.
size
(
1
)
-
1
t_start
=
torch
.
argmax
(
trellis
[:,
j
]).
item
()
path
=
[]
for
t
in
range
(
t_start
,
0
,
-
1
):
# 1. Figure out if the current position was stay or change
# 1. Figure out if the current position was stay or change
# Note (again):
# Frame-wise score of stay vs change
# `emission[J-1]` is the emission at time frame `J` of trellis dimension.
p_stay
=
emission
[
t
-
1
,
blank_id
]
# Score for token staying the same from time frame J-1 to T.
p_change
=
emission
[
t
-
1
,
tokens
[
j
]]
stayed
=
trellis
[
t
-
1
,
j
]
+
emission
[
t
-
1
,
blank_id
]
# Score for token changing from C-1 at T-1 to J at T.
# Context-aware score for stay vs change
changed
=
trellis
[
t
-
1
,
j
-
1
]
+
emission
[
t
-
1
,
tokens
[
j
-
1
]]
stayed
=
trellis
[
t
-
1
,
j
]
+
p_stay
changed
=
trellis
[
t
-
1
,
j
-
1
]
+
p_change
# 2. Store the path with frame-wise probability.
prob
=
emission
[
t
-
1
,
tokens
[
j
-
1
]
if
changed
>
stayed
else
0
].
exp
().
item
()
# Update position
# Return token index and time index in non-trellis coordinate.
t
-=
1
path
.
append
(
Point
(
j
-
1
,
t
-
1
,
prob
))
# 3. Update the token
if
changed
>
stayed
:
if
changed
>
stayed
:
j
-=
1
j
-=
1
if
j
==
0
:
break
# Store the path with frame-wise probability.
else
:
prob
=
(
p_change
if
changed
>
stayed
else
p_stay
).
exp
().
item
()
raise
ValueError
(
"Failed to align"
)
path
.
append
(
Point
(
j
,
t
,
prob
))
# Now j == 0, which means, it reached the SoS.
# Fill up the rest for the sake of visualization
while
t
>
0
:
prob
=
emission
[
t
-
1
,
blank_id
].
exp
().
item
()
path
.
append
(
Point
(
j
,
t
-
1
,
prob
))
t
-=
1
return
path
[::
-
1
]
return
path
[::
-
1
]
...
@@ -256,21 +283,28 @@ for p in path:
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~


 def plot_trellis_with_path(trellis, path):
     # To plot trellis with path, we take advantage of 'nan' value
     trellis_with_path = trellis.clone()
     for _, p in enumerate(path):
         trellis_with_path[p.time_index, p.token_index] = float("nan")
-    plt.imshow(trellis_with_path[1:, 1:].T, origin="lower")
+    plt.imshow(trellis_with_path.T, origin="lower")
+    plt.title("The path found by backtracking")
+    plt.tight_layout()


 plot_trellis_with_path(trellis, path)
-plt.title("The path found by backtracking")
-plt.show()

 ######################################################################
-# Looking good. Now this path contains repetations for the same labels, so
+# Looking good.
+
+######################################################################
+# Segment the path
+# ----------------
+#
+# Now this path contains repetations for the same labels, so
 # let’s merge them to make it close to the original transcript.
 #
 # When merging the multiple path points, we simply take the average
...
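The merge code itself is collapsed in this view. For reference, the upstream tutorial's merge step looks roughly like the sketch below: runs of identical token indices are collapsed into one segment whose score is the average frame probability. Treat this as a reconstruction from context, not the exact committed code.

    from dataclasses import dataclass


    @dataclass
    class Segment:
        label: str
        start: int  # first frame of the segment
        end: int    # one past the last frame
        score: float


    def merge_repeats(path, transcript):
        i1, i2 = 0, 0
        segments = []
        while i1 < len(path):
            # Extend i2 over the run of points sharing the same token index.
            while i2 < len(path) and path[i1].token_index == path[i2].token_index:
                i2 += 1
            score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
            segments.append(
                Segment(transcript[path[i1].token_index], path[i1].time_index, path[i2 - 1].time_index + 1, score)
            )
            i1 = i2
        return segments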
@@ -320,23 +354,24 @@ for seg in segments:
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~


 def plot_trellis_with_segments(trellis, segments, transcript):
     # To plot trellis with path, we take advantage of 'nan' value
     trellis_with_path = trellis.clone()
     for i, seg in enumerate(segments):
         if seg.label != "|":
-            trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan")
+            trellis_with_path[seg.start : seg.end, i] = float("nan")

-    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5))
+    fig, [ax1, ax2] = plt.subplots(2, 1, sharex=True)
     ax1.set_title("Path, label and probability for each label")
-    ax1.imshow(trellis_with_path.T, origin="lower")
-    ax1.set_xticks([])
+    ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")

     for i, seg in enumerate(segments):
         if seg.label != "|":
-            ax1.annotate(seg.label, (seg.start + 0.7, i + 0.3), weight="bold")
-            ax1.annotate(f"{seg.score:.2f}", (seg.start - 0.3, i + 4.3))
+            ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
+            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")

     ax2.set_title("Label probability with and without repetation")
     xs, hs, ws = [], [], []
...
@@ -345,7 +380,7 @@ def plot_trellis_with_segments(trellis, segments, transcript):
         xs.append((seg.end + seg.start) / 2 + 0.4)
         hs.append(seg.score)
         ws.append(seg.end - seg.start)
-        ax2.annotate(seg.label, (seg.start + 0.8, -0.07), weight="bold")
+        ax2.annotate(seg.label, (seg.start + 0.8, -0.07))
     ax2.bar(xs, hs, width=ws, color="gray", alpha=0.5, edgecolor="black")

     xs, hs = [], []
...
@@ -357,17 +392,21 @@ def plot_trellis_with_segments(trellis, segments, transcript):
     ax2.bar(xs, hs, width=0.5, alpha=0.5)
     ax2.axhline(0, color="black")
-    ax2.set_xlim(ax1.get_xlim())
+    ax2.grid(True, axis="y")
     ax2.set_ylim(-0.1, 1.1)
+    fig.tight_layout()


 plot_trellis_with_segments(trellis, segments, transcript)
-plt.tight_layout()
-plt.show()

 ######################################################################
-# Looks good. Now let’s merge the words. The Wav2Vec2 model uses ``'|'``
+# Looks good.
+
+######################################################################
+# Merge the segments into words
+# -----------------------------
+#
+# Now let’s merge the words. The Wav2Vec2 model uses ``'|'``
 # as the word boundary, so we merge the segments before each occurance of
 # ``'|'``.
 #
...
@@ -400,46 +439,43 @@ for word in word_segments:
 ################################################################################
 # Visualization
-################################################################################
+# ~~~~~~~~~~~~~


-def plot_alignments(trellis, segments, word_segments, waveform):
+def plot_alignments(trellis, segments, word_segments, waveform, sample_rate=bundle.sample_rate):
     trellis_with_path = trellis.clone()
     for i, seg in enumerate(segments):
         if seg.label != "|":
-            trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan")
+            trellis_with_path[seg.start : seg.end, i] = float("nan")

-    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5))
+    fig, [ax1, ax2] = plt.subplots(2, 1)

-    ax1.imshow(trellis_with_path[1:, 1:].T, origin="lower")
+    ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")
+    ax1.set_facecolor("lightgray")
     ax1.set_xticks([])
     ax1.set_yticks([])

     for word in word_segments:
-        ax1.axvline(word.start - 0.5)
-        ax1.axvline(word.end - 0.5)
+        ax1.axvspan(word.start - 0.5, word.end - 0.5, edgecolor="white", facecolor="none")

     for i, seg in enumerate(segments):
         if seg.label != "|":
-            ax1.annotate(seg.label, (seg.start, i + 0.3))
-            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 4), fontsize=8)
+            ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
+            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")

     # The original waveform
-    ratio = waveform.size(0) / (trellis.size(0) - 1)
-    ax2.plot(waveform)
+    ratio = waveform.size(0) / sample_rate / trellis.size(0)
+    ax2.specgram(waveform, Fs=sample_rate)
     for word in word_segments:
         x0 = ratio * word.start
         x1 = ratio * word.end
-        ax2.axvspan(x0, x1, alpha=0.1, color="red")
-        ax2.annotate(f"{word.score:.2f}", (x0, 0.8))
+        ax2.axvspan(x0, x1, facecolor="none", edgecolor="white", hatch="/")
+        ax2.annotate(f"{word.score:.2f}", (x0, sample_rate * 0.51), annotation_clip=False)

     for seg in segments:
         if seg.label != "|":
-            ax2.annotate(seg.label, (seg.start * ratio, 0.9))
+            ax2.annotate(seg.label, (seg.start * ratio, sample_rate * 0.55), annotation_clip=False)

-    xticks = ax2.get_xticks()
-    plt.xticks(xticks, xticks / bundle.sample_rate)
     ax2.set_xlabel("time [second]")
     ax2.set_yticks([])
-    ax2.set_ylim(-1.0, 1.0)
-    fig.tight_layout()
+    ax2.set_xlim(0, waveform.size(-1))


 plot_alignments(
...
@@ -448,16 +484,16 @@ plot_alignments(
     word_segments,
     waveform[0],
 )
-plt.show()

 ################################################################################
+# Audio Samples
+# -------------
 #
-# A trick to embed the resulting audio to the generated file.
-# `IPython.display.Audio` has to be the last call in a cell,
-# and there should be only one call par cell.


 def display_segment(i):
-    ratio = waveform.size(1) / (trellis.size(0) - 1)
+    ratio = waveform.size(1) / trellis.size(0)
     word = word_segments[i]
     x0 = int(ratio * word.start)
     x1 = int(ratio * word.end)
...
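The rest of display_segment is collapsed here. Judging from the surrounding code, it slices the waveform at the word boundary and returns an audio widget, roughly as sketched below; the print format is illustrative rather than the committed text.

    def display_segment(i):
        ratio = waveform.size(1) / trellis.size(0)
        word = word_segments[i]
        x0 = int(ratio * word.start)
        x1 = int(ratio * word.end)
        print(f"{word.label}: {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec")
        segment = waveform[:, x0:x1]
        # IPython.display.Audio must be the last expression in a notebook
        # cell for the player to render.
        return IPython.display.Audio(segment.numpy(), rate=bundle.sample_rate)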
examples/tutorials/hybrid_demucs_tutorial.py

...
@@ -45,6 +45,8 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)

+import matplotlib.pyplot as plt
+
 ######################################################################
 # In addition to ``torchaudio``, ``mir_eval`` is required to perform
 # signal-to-distortion ratio (SDR) calculations. To install ``mir_eval``
...
@@ -52,30 +54,9 @@ print(torchaudio.__version__)
 #

 from IPython.display import Audio
+from mir_eval import separation
+from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
 from torchaudio.utils import download_asset
-import matplotlib.pyplot as plt
-
-try:
-    from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
-    from mir_eval import separation
-except ModuleNotFoundError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install nightly
-            torch and torchaudio builds by adding the following code block to the top
-            of the notebook before running it:
-
-            !pip3 uninstall -y torch torchvision torchaudio
-            !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-            !pip3 install mir_eval
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise

 ######################################################################
 # 3. Construct the pipeline
...
@@ -130,11 +111,11 @@ from torchaudio.transforms import Fade

 def separate_sources(
     model,
     mix,
-    segment=10.,
+    segment=10.0,
     overlap=0.1,
     device=None,
 ):
     """
     Apply model to a given mixture. Use fade, and add segments together in order to add model segment by segment.
...
@@ -157,7 +138,7 @@ def separate_sources(
     start = 0
     end = chunk_len
     overlap_frames = overlap * sample_rate
-    fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape='linear')
+    fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape="linear")

     final = torch.zeros(batch, len(model.sources), channels, length, device=device)
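The loop that applies the fade is collapsed below; the cross-fade idea can be shown in isolation. A minimal sketch assuming stereo audio and the 10% overlap configured above (the tutorial's actual loop also adjusts the fade lengths between iterations):

    import torch
    from torchaudio.transforms import Fade

    sample_rate = 44100
    overlap_frames = int(0.1 * sample_rate)
    fade = Fade(fade_in_len=0, fade_out_len=overlap_frames, fade_shape="linear")

    chunk = torch.randn(1, 2, 2 * sample_rate)  # placeholder (batch, channels, time)
    faded = fade(chunk)  # the last overlap_frames samples ramp linearly to zero
    # Summing this faded tail with the faded-in head of the next chunk
    # cross-fades the segments and avoids clicks at the boundaries.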
...
@@ -181,11 +162,10 @@ def separate_sources(

 def plot_spectrogram(stft, title="Spectrogram"):
     magnitude = stft.abs()
     spectrogram = 20 * torch.log10(magnitude + 1e-8).numpy()
-    figure, axis = plt.subplots(1, 1)
-    img = axis.imshow(spectrogram, cmap="viridis", vmin=-60, vmax=0, origin="lower", aspect="auto")
-    figure.suptitle(title)
-    plt.colorbar(img, ax=axis)
-    plt.show()
+    _, axis = plt.subplots(1, 1)
+    axis.imshow(spectrogram, cmap="viridis", vmin=-60, vmax=0, origin="lower", aspect="auto")
+    axis.set_title(title)
+    plt.tight_layout()

 ######################################################################
...
@@ -208,7 +188,7 @@ def plot_spectrogram(stft, title="Spectrogram"):
 # We download the audio file from our storage. Feel free to download another file and use audio from a specific path

 SAMPLE_SONG = download_asset("tutorial-assets/hdemucs_mix.wav")
 waveform, sample_rate = torchaudio.load(SAMPLE_SONG)  # replace SAMPLE_SONG with desired path for different song
-waveform.to(device)
+waveform = waveform.to(device)
 mixture = waveform

 # parameters
...
@@ -265,12 +245,13 @@ stft = torchaudio.transforms.Spectrogram(
 # scores.
 #


 def output_results(original_source: torch.Tensor, predicted_source: torch.Tensor, source: str):
-    print("SDR score is:",
-          separation.bss_eval_sources(
-              original_source.detach().numpy(),
-              predicted_source.detach().numpy())[0].mean())
-    plot_spectrogram(stft(predicted_source)[0], f'Spectrogram {source}')
+    print(
+        "SDR score is:",
+        separation.bss_eval_sources(original_source.detach().numpy(), predicted_source.detach().numpy())[0].mean(),
+    )
+    plot_spectrogram(stft(predicted_source)[0], f"Spectrogram - {source}")
     return Audio(predicted_source, rate=sample_rate)
...
@@ -285,23 +266,19 @@ bass_original = download_asset("tutorial-assets/hdemucs_bass_segment.wav")
 vocals_original = download_asset("tutorial-assets/hdemucs_vocals_segment.wav")
 other_original = download_asset("tutorial-assets/hdemucs_other_segment.wav")

-drums_spec = audios["drums"][:, frame_start:frame_end]
+drums_spec = audios["drums"][:, frame_start:frame_end].cpu()
 drums, sample_rate = torchaudio.load(drums_original)
-drums.to(device)

-bass_spec = audios["bass"][:, frame_start:frame_end]
+bass_spec = audios["bass"][:, frame_start:frame_end].cpu()
 bass, sample_rate = torchaudio.load(bass_original)
-bass.to(device)

-vocals_spec = audios["vocals"][:, frame_start:frame_end]
+vocals_spec = audios["vocals"][:, frame_start:frame_end].cpu()
 vocals, sample_rate = torchaudio.load(vocals_original)
-vocals.to(device)

-other_spec = audios["other"][:, frame_start:frame_end]
+other_spec = audios["other"][:, frame_start:frame_end].cpu()
 other, sample_rate = torchaudio.load(other_original)
-other.to(device)

-mix_spec = mixture[:, frame_start:frame_end]
+mix_spec = mixture[:, frame_start:frame_end].cpu()

 ######################################################################
...
@@ -316,7 +293,7 @@ mix_spec = mixture[:, frame_start:frame_end]
 #
 # Mixture Clip

-plot_spectrogram(stft(mix_spec)[0], "Spectrogram Mixture")
+plot_spectrogram(stft(mix_spec)[0], "Spectrogram - Mixture")
 Audio(mix_spec, rate=sample_rate)

 ######################################################################
...
examples/tutorials/mvdr_tutorial.py
...
@@ -37,6 +37,10 @@ print(torch.__version__)
 print(torchaudio.__version__)

+import matplotlib.pyplot as plt
+import mir_eval
+from IPython.display import Audio
+
 ######################################################################
 # 2. Preparation
 # --------------
...
@@ -59,10 +63,6 @@ print(torchaudio.__version__)
 from pesq import pesq
 from pystoi import stoi
-import mir_eval
-import matplotlib.pyplot as plt
-from IPython.display import Audio
 from torchaudio.utils import download_asset

 ######################################################################
...
@@ -98,23 +98,21 @@ SAMPLE_NOISE = download_asset("tutorial-assets/mvdr/noise.wav")
 #


-def plot_spectrogram(stft, title="Spectrogram", xlim=None):
+def plot_spectrogram(stft, title="Spectrogram"):
     magnitude = stft.abs()
     spectrogram = 20 * torch.log10(magnitude + 1e-8).numpy()
     figure, axis = plt.subplots(1, 1)
     img = axis.imshow(spectrogram, cmap="viridis", vmin=-100, vmax=0, origin="lower", aspect="auto")
-    figure.suptitle(title)
+    axis.set_title(title)
     plt.colorbar(img, ax=axis)
-    plt.show()


-def plot_mask(mask, title="Mask", xlim=None):
+def plot_mask(mask, title="Mask"):
     mask = mask.numpy()
     figure, axis = plt.subplots(1, 1)
     img = axis.imshow(mask, cmap="viridis", origin="lower", aspect="auto")
-    figure.suptitle(title)
+    axis.set_title(title)
     plt.colorbar(img, ax=axis)
-    plt.show()


 def si_snr(estimate, reference, epsilon=1e-8):
...
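The body of si_snr is collapsed in this view. For reference, scale-invariant SNR is conventionally computed as below; this is a sketch of the standard formula rather than the tutorial's exact code (some implementations, possibly including this one, also mean-center both signals first).

    import torch


    def si_snr_sketch(estimate, reference, epsilon=1e-8):
        # Project the estimate onto the reference to find the optimal scale,
        # then compare the energy of the scaled reference with the residual.
        reference_energy = torch.sum(reference**2, dim=-1, keepdim=True) + epsilon
        scale = torch.sum(estimate * reference, dim=-1, keepdim=True) / reference_energy
        target = scale * reference
        residual = estimate - target
        ratio = (torch.sum(target**2, dim=-1) + epsilon) / (torch.sum(residual**2, dim=-1) + epsilon)
        return 10 * torch.log10(ratio)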
examples/tutorials/nvdec_tutorial.py (new file, mode 100644)
"""
Accelerated video decoding with NVDEC
=====================================
.. _nvdec_tutorial:
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC)
with TorchAudio, and how it improves the performance of video decoding.
"""
######################################################################
#
# .. note::
#
# This tutorial requires FFmpeg libraries compiled with HW
# acceleration enabled.
#
# Please refer to
# :ref:`Enabling GPU video decoder/encoder <enabling_hw_decoder>`
# for how to build FFmpeg with HW acceleration.
#
import
torch
import
torchaudio
print
(
torch
.
__version__
)
print
(
torchaudio
.
__version__
)
######################################################################
#
import
os
import
time
import
matplotlib.pyplot
as
plt
from
torchaudio.io
import
StreamReader
######################################################################
#
# Check the prerequisites
# -----------------------
#
# First, we check that TorchAudio correctly detects FFmpeg libraries
# that support HW decoder/encoder.
#
from
torchaudio.utils
import
ffmpeg_utils
######################################################################
#
print
(
"FFmpeg Library versions:"
)
for
k
,
ver
in
ffmpeg_utils
.
get_versions
().
items
():
print
(
f
"
{
k
}
:
\t
{
'.'
.
join
(
str
(
v
)
for
v
in
ver
)
}
"
)
######################################################################
#
print
(
"Available NVDEC Decoders:"
)
for
k
in
ffmpeg_utils
.
get_video_decoders
().
keys
():
if
"cuvid"
in
k
:
print
(
f
" -
{
k
}
"
)
######################################################################
#
print
(
"Avaialbe GPU:"
)
print
(
torch
.
cuda
.
get_device_properties
(
0
))
######################################################################
#
# We will use the following video, which has these properties:
#
# - Codec: H.264
# - Resolution: 960x540
# - FPS: 29.97
# - Pixel format: YUV420P
#
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" type="video/mp4">
#    </video>

######################################################################
#
src = torchaudio.utils.download_asset(
    "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
)

######################################################################
# Decoding videos with NVDEC
# --------------------------
#
# To use the HW video decoder, you need to specify the HW decoder when
# defining the output video stream, by passing the ``decoder`` option to
# the :py:meth:`~torchaudio.io.StreamReader.add_video_stream` method.
#

s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid")
s.fill_buffer()
(video,) = s.pop_chunks()

######################################################################
#
# The video frames are decoded and returned as a tensor in NCHW format.

print(video.shape, video.dtype)

######################################################################
#
# By default, the decoded frames are sent back to CPU memory, and
# CPU tensors are created.

print(video.device)

######################################################################
#
# By specifying the ``hw_accel`` option, you can convert the decoded frames
# to CUDA tensors. The ``hw_accel`` option takes a string value, which is
# passed to :py:class:`torch.device`.
#
# .. note::
#
#    Currently, the ``hw_accel`` option and
#    :py:meth:`~torchaudio.io.StreamReader.add_basic_video_stream`
#    are not compatible. ``add_basic_video_stream`` adds a post-decoding
#    process, which is designed for frames in CPU memory.
#    Please use :py:meth:`~torchaudio.io.StreamReader.add_video_stream`.
#

s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid", hw_accel="cuda:0")
s.fill_buffer()
(video,) = s.pop_chunks()

print(video.shape, video.dtype, video.device)

######################################################################
# .. note::
#
#    When there are multiple GPUs available, ``StreamReader`` by
#    default uses the first GPU. You can change this by providing the
#    ``"gpu"`` option.
#
#    .. code::
#
#       # Video data is sent to CUDA device 0, decoded and
#       # converted on the same device.
#       s.add_video_stream(
#           ...,
#           decoder="h264_cuvid",
#           decoder_option={"gpu": "0"},
#           hw_accel="cuda:0",
#       )
#
# .. note::
#
#    The ``"gpu"`` option and the ``hw_accel`` option can be specified
#    independently. If they do not match, decoded frames are
#    transferred to the device specified by ``hw_accel``
#    automatically.
#
#    .. code::
#
#       # Video data is sent to CUDA device 0, and decoded there.
#       # Then it is transferred to CUDA device 1, and converted to
#       # CUDA tensor.
#       s.add_video_stream(
#           ...,
#           decoder="h264_cuvid",
#           decoder_option={"gpu": "0"},
#           hw_accel="cuda:1",
#       )
######################################################################
# Visualization
# -------------
#
# Let's look at the frames decoded by the HW decoder and compare them
# against equivalent results from software decoders.
#
# The following function seeks to the given timestamp and decodes one
# frame with the specified decoder.


def test_decode(decoder: str, seek: float):
    s = StreamReader(src)
    s.seek(seek)
    s.add_video_stream(1, decoder=decoder)
    s.fill_buffer()
    (video,) = s.pop_chunks()
    return video[0]


######################################################################
#
timestamps = [12, 19, 45, 131, 180]

cpu_frames = [test_decode(decoder="h264", seek=ts) for ts in timestamps]
cuda_frames = [test_decode(decoder="h264_cuvid", seek=ts) for ts in timestamps]

######################################################################
#
# .. note::
#
#    Currently, the HW decoder does not support colorspace conversion.
#    Decoded frames are in YUV format.
#    The following function performs YUV to RGB conversion
#    (and axis shuffling for plotting).


def yuv_to_rgb(frames):
    frames = frames.cpu().to(torch.float)
    y = frames[..., 0, :, :]
    u = frames[..., 1, :, :]
    v = frames[..., 2, :, :]

    y /= 255
    u = u / 255 - 0.5
    v = v / 255 - 0.5

    r = y + 1.14 * v
    g = y + -0.396 * u - 0.581 * v
    b = y + 2.029 * u

    rgb = torch.stack([r, g, b], -1)
    rgb = (rgb * 255).clamp(0, 255).to(torch.uint8)
    return rgb.numpy()
######################################################################
#
# Now we visualize the results.
#


def plot():
    n_rows = len(timestamps)
    fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0])
    for i in range(n_rows):
        axes[i][0].imshow(yuv_to_rgb(cpu_frames[i]))
        axes[i][1].imshow(yuv_to_rgb(cuda_frames[i]))
    axes[0][0].set_title("Software decoder")
    axes[0][1].set_title("HW decoder")
    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()


plot()

######################################################################
#
# They are indistinguishable to the eyes of the author.
# Feel free to let us know if you spot something. :)
#

######################################################################
# HW resizing and cropping
# ------------------------
#
# You can use the ``decoder_option`` argument to provide decoder-specific
# options.
#
# The following options are often relevant in preprocessing.
#
# - ``resize``: Resize the frame into ``(width)x(height)``.
# - ``crop``: Crop the frame ``(top)x(bottom)x(left)x(right)``.
#   Note that the specified values are the number of rows/columns removed.
#   The final image size is ``(width - left - right)x(height - top - bottom)``.
#   For example, cropping the 960x540 test video with ``135x135x240x240``
#   removes 135 rows from the top and bottom and 240 columns from each
#   side, leaving a 480x270 frame.
#   If the ``crop`` and ``resize`` options are used together,
#   ``crop`` is performed first.
#
# For other available options, please run
# ``ffmpeg -h decoder=h264_cuvid``.
#


def test_options(option):
    s = StreamReader(src)
    s.seek(87)
    s.add_video_stream(1, decoder="h264_cuvid", hw_accel="cuda:0", decoder_option=option)
    s.fill_buffer()
    (video,) = s.pop_chunks()
    print(f"Option: {option}:\t{video.shape}")
    return video[0]


######################################################################
#
original = test_options(option=None)
resized = test_options(option={"resize": "480x270"})
cropped = test_options(option={"crop": "135x135x240x240"})
cropped_and_resized = test_options(option={"crop": "135x135x240x240", "resize": "640x360"})
######################################################################
#
def plot():
    fig, axes = plt.subplots(2, 2, figsize=[12.8, 9.6])
    axes[0][0].imshow(yuv_to_rgb(original))
    axes[0][1].imshow(yuv_to_rgb(resized))
    axes[1][0].imshow(yuv_to_rgb(cropped))
    axes[1][1].imshow(yuv_to_rgb(cropped_and_resized))
    axes[0][0].set_title("Original")
    axes[0][1].set_title("Resized")
    axes[1][0].set_title("Cropped")
    axes[1][1].set_title("Cropped and resized")
    plt.tight_layout()
    return fig


plot()

######################################################################
# Comparing resizing methods
# --------------------------
#
# Unlike software scaling, NVDEC does not provide an option to choose
# the scaling algorithm.
# In ML applications, it is often necessary to construct a
# preprocessing pipeline with similar numerical properties.
# So here we compare the result of hardware resizing with software
# resizing of different algorithms.
#
# We will use the following video, which contains a test pattern
# generated with the following command.
#
# .. code::
#
#    ffmpeg -y -f lavfi -t 12.05 -i mptestsrc -movflags +faststart mptestsrc.mp4
#
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/mptestsrc.mp4" type="video/mp4">
#    </video>

######################################################################
#
test_src = torchaudio.utils.download_asset("tutorial-assets/mptestsrc.mp4")
######################################################################
# The following function decodes the video and
# applies the specified scaling algorithm.
#


def decode_resize_ffmpeg(mode, height, width, seek):
    filter_desc = None if mode is None else f"scale={width}:{height}:sws_flags={mode}"
    s = StreamReader(test_src)
    s.add_video_stream(1, filter_desc=filter_desc)
    s.seek(seek)
    s.fill_buffer()
    (chunk,) = s.pop_chunks()
    return chunk


######################################################################
# The following function uses the HW decoder to decode the video and resize it.
#


def decode_resize_cuvid(height, width, seek):
    s = StreamReader(test_src)
    s.add_video_stream(1, decoder="h264_cuvid", decoder_option={"resize": f"{width}x{height}"}, hw_accel="cuda:0")
    s.seek(seek)
    s.fill_buffer()
    (chunk,) = s.pop_chunks()
    return chunk.cpu()


######################################################################
# Now we execute them and visualize the resulting frames.

params = {"height": 224, "width": 224, "seek": 3}

frames = [
    decode_resize_ffmpeg(None, **params),
    decode_resize_ffmpeg("neighbor", **params),
    decode_resize_ffmpeg("bilinear", **params),
    decode_resize_ffmpeg("bicubic", **params),
    decode_resize_cuvid(**params),
    decode_resize_ffmpeg("spline", **params),
    decode_resize_ffmpeg("lanczos:param0=1", **params),
    decode_resize_ffmpeg("lanczos:param0=3", **params),
    decode_resize_ffmpeg("lanczos:param0=5", **params),
]
######################################################################
#
def plot():
    fig, axes = plt.subplots(3, 3, figsize=[12.8, 15.2])
    for i, f in enumerate(frames):
        h, w = f.shape[2:4]
        f = f[..., : h // 4, : w // 4]
        axes[i // 3][i % 3].imshow(yuv_to_rgb(f[0]))
    axes[0][0].set_title("Original")
    axes[0][1].set_title("nearest neighbor")
    axes[0][2].set_title("bilinear")
    axes[1][0].set_title("bicubic")
    axes[1][1].set_title("NVDEC")
    axes[1][2].set_title("spline")
    axes[2][0].set_title("lanczos(1)")
    axes[2][1].set_title("lanczos(3)")
    axes[2][2].set_title("lanczos(5)")
    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()


plot()

######################################################################
# None of them is exactly the same. To the eyes of the authors, lanczos(1)
# appears to be most similar to NVDEC.
# The bicubic looks close as well.

######################################################################
#
# Benchmark NVDEC with StreamReader
# ---------------------------------
#
# In this section, we compare the performance of software video
# decoding and HW video decoding.
#

######################################################################
# Decode as CUDA frames
# ---------------------
#
# First, we compare the time it takes for the software decoder and
# the hardware decoder to decode the same video.
# To make the result comparable, when using the software decoder, we move
# the resulting tensor to CUDA.
#
# The procedures to test look like the following:
#
# - Use the hardware decoder and place the data on CUDA directly.
# - Use the software decoder, generate CPU tensors and move them to CUDA.
#
# .. note:
#
#    Because the HW decoder currently only supports reading videos in
#    YUV444P format, we decode frames into YUV444P format for the
#    software decoder as well.
#
######################################################################
# The following function implements the hardware decoder test case.


def test_decode_cuda(src, decoder, hw_accel="cuda", frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder=decoder, hw_accel=hw_accel)

    num_frames = 0
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
    elapsed = time.monotonic() - t0

    print(f" - Shape: {chunk.shape}")
    fps = num_frames / elapsed
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# The following function implements the software decoder test case.


def test_decode_cpu(src, threads, decoder=None, frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder=decoder, decoder_option={"threads": f"{threads}"})

    num_frames = 0
    device = torch.device("cuda")
    t0 = time.monotonic()
    for i, (chunk,) in enumerate(s.stream()):
        if i == 0:
            print(f" - Shape: {chunk.shape}")
        num_frames += chunk.shape[0]
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0

    fps = num_frames / elapsed
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# For each video resolution, we run multiple software decoder test
# cases with different numbers of threads.


def run_decode_tests(src, frames_per_chunk=5):
    fps = []
    print(f"Testing: {os.path.basename(src)}")
    for threads in [1, 4, 8, 16]:
        print(f"* Software decoding (num_threads={threads})")
        fps.append(test_decode_cpu(src, threads))
    print("* Hardware decoding")
    fps.append(test_decode_cuda(src, decoder="h264_cuvid"))
    return fps


######################################################################
# Now we run the tests with videos of different resolutions.
#
# QVGA
# ----

src_qvga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_qvga.h264.mp4")
fps_qvga = run_decode_tests(src_qvga)

######################################################################
# VGA
# ---

src_vga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_vga.h264.mp4")
fps_vga = run_decode_tests(src_vga)

######################################################################
# XGA
# ---

src_xga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_xga.h264.mp4")
fps_xga = run_decode_tests(src_xga)
######################################################################
# Result
# ------
#
# Now we plot the result.


def plot():
    fig, ax = plt.subplots(figsize=[9.6, 6.4])
    for items in zip(fps_qvga, fps_vga, fps_xga, "ov^sx"):
        ax.plot(items[:-1], marker=items[-1])
    ax.grid(axis="both")
    ax.set_xticks([0, 1, 2], ["QVGA (320x240)", "VGA (640x480)", "XGA (1024x768)"])
    ax.legend(
        [
            "Software Decoding (threads=1)",
            "Software Decoding (threads=4)",
            "Software Decoding (threads=8)",
            "Software Decoding (threads=16)",
            "Hardware Decoding (CUDA Tensor)",
        ]
    )
    ax.set_title("Speed of processing video frames")
    ax.set_ylabel("Frames per second")
    plt.tight_layout()


plot()

######################################################################
#
# We observe a couple of things:
#
# - Increasing the number of threads in software decoding makes the
#   pipeline faster, but the performance saturates around 8 threads.
# - The performance gain from using the hardware decoder depends on the
#   resolution of the video.
#
#   - At lower resolutions like QVGA, hardware decoding is slower than
#     software decoding.
#   - At higher resolutions like XGA, hardware decoding is faster
#     than software decoding.
#
# It is worth noting that the performance gain also depends on the
# type of GPU.
# We observed that when decoding VGA videos using V100 or A100 GPUs,
# hardware decoders are slower than software decoders. But using an A10
# GPU, the hardware decoder is faster than the software decoder.
#

######################################################################
# Decode and resize
# -----------------
#
# Next, we add a resize operation to the pipeline.
# We will compare the following pipelines.
#
# 1. Decode video using the software decoder and read the frames as
#    PyTorch Tensors. Resize the tensors using
#    :py:func:`torch.nn.functional.interpolate`, then send
#    the resulting tensors to the CUDA device.
# 2. Decode video using the software decoder, resize the frames with
#    FFmpeg's filter graph, read the resized frames as PyTorch tensors,
#    then send them to the CUDA device.
# 3. Decode and resize the video simultaneously with the HW decoder, and
#    read the resulting frames as CUDA tensors.
#
# Pipeline 1 represents common video loading implementations.
#
# Pipeline 2 uses FFmpeg's filter graph, which allows manipulating
# raw frames before converting them to Tensors.
#
# Pipeline 3 has the minimum amount of data transfer from CPU to
# CUDA, which significantly contributes to performant data loading.
#
######################################################################
# The following function implements pipeline 1. It uses PyTorch's
# :py:func:`torch.nn.functional.interpolate`.
# We use ``bicubic`` mode, as we saw that the resulting frames are
# closest to NVDEC resizing.
#


def test_decode_then_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder_option={"threads": "8"})

    num_frames = 0
    device = torch.device("cuda")
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
        chunk = torch.nn.functional.interpolate(chunk, [height, width], mode=mode, antialias=True)
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0

    fps = num_frames / elapsed
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# The following function implements pipeline 2. Frames are resized
# as part of the decoding process, then sent to the CUDA device.
#
# We use ``bicubic`` mode, to make the result comparable with the
# PyTorch-based implementation above.
#


def test_decode_and_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(
        frames_per_chunk, filter_desc=f"scale={width}:{height}:sws_flags={mode}", decoder_option={"threads": "8"}
    )

    num_frames = 0
    device = torch.device("cuda")
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0

    fps = num_frames / elapsed
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# The following function implements pipeline 3. Resizing is
# performed by NVDEC and the resulting tensor is placed in CUDA memory.


def test_hw_decode_and_resize(src, decoder, decoder_option, hw_accel="cuda", frames_per_chunk=5):
    s = StreamReader(src)
    s.add_video_stream(5, decoder=decoder, decoder_option=decoder_option, hw_accel=hw_accel)

    num_frames = 0
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
    elapsed = time.monotonic() - t0

    fps = num_frames / elapsed
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps
######################################################################
#
# The following function run the benchmark functions on given sources.
#
def
run_resize_tests
(
src
):
print
(
f
"Testing:
{
os
.
path
.
basename
(
src
)
}
"
)
height
,
width
=
224
,
224
print
(
"* Software decoding with PyTorch interpolate"
)
cpu_resize1
=
test_decode_then_resize
(
src
,
height
=
height
,
width
=
width
)
print
(
"* Software decoding with FFmpeg scale"
)
cpu_resize2
=
test_decode_and_resize
(
src
,
height
=
height
,
width
=
width
)
print
(
"* Hardware decoding with resize"
)
cuda_resize
=
test_hw_decode_and_resize
(
src
,
decoder
=
"h264_cuvid"
,
decoder_option
=
{
"resize"
:
f
"
{
width
}
x
{
height
}
"
})
return
[
cpu_resize1
,
cpu_resize2
,
cuda_resize
]
######################################################################
#
# Now we run the tests.
######################################################################
# QVGA
# ----
fps_qvga
=
run_resize_tests
(
src_qvga
)
######################################################################
# VGA
# ---
fps_vga
=
run_resize_tests
(
src_vga
)
######################################################################
# XGA
# ---
fps_xga
=
run_resize_tests
(
src_xga
)
######################################################################
# Result
# ------
# Now we plot the result.
#
def
plot
():
fig
,
ax
=
plt
.
subplots
(
figsize
=
[
9.6
,
6.4
])
for
items
in
zip
(
fps_qvga
,
fps_vga
,
fps_xga
,
"ov^sx"
):
ax
.
plot
(
items
[:
-
1
],
marker
=
items
[
-
1
])
ax
.
grid
(
axis
=
"both"
)
ax
.
set_xticks
([
0
,
1
,
2
],
[
"QVGA (320x240)"
,
"VGA (640x480)"
,
"XGA (1024x768)"
])
ax
.
legend
(
[
"Software decoding
\n
with resize
\n
(PyTorch interpolate)"
,
"Software decoding
\n
with resize
\n
(FFmpeg scale)"
,
"NVDEC
\n
with resizing"
,
]
)
ax
.
set_title
(
"Speed of processing video frames"
)
ax
.
set_xlabel
(
"Input video resolution"
)
ax
.
set_ylabel
(
"Frames per second"
)
plt
.
tight_layout
()
plot
()
######################################################################
#
# The hardware decoder shows a similar trend to the previous experiment.
# In fact, the performance is almost the same: hardware resizing has
# almost zero overhead for scaling down the frames.
#
# Software decoding also shows a similar trend. Performing resizing as
# part of decoding is faster. One possible explanation is that video
# frames are internally stored as YUV420P, which has half as many
# pixels as RGB24 or YUV444P. This means that if resizing is done
# before the frame data are copied to a PyTorch tensor, the number of
# pixels manipulated and copied is smaller than when resizing is
# applied after the frames are converted to a tensor. The quick
# arithmetic check below illustrates the pixel counts involved.
#
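
######################################################################
# As a rough, illustrative sanity check of that pixel-count argument
# (numbers only, not part of the benchmark): a WxH frame in YUV420P
# carries 1.5 samples per pixel (a full-size luma plane plus two
# quarter-size chroma planes), while RGB24 and YUV444P carry 3 per
# pixel.

w, h = 1024, 768  # XGA, the largest resolution tested above
samples_yuv420p = int(w * h * 1.5)  # 1,179,648 samples
samples_rgb24 = w * h * 3  # 2,359,296 samples
print(samples_rgb24 / samples_yuv420p)  # 2.0 -- twice the data to touch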
######################################################################
#
# Tag: :obj:`torchaudio.io`
examples/tutorials/nvenc_tutorial.py
0 → 100644
View file @
ffeba11a
"""
Accelerated video encoding with NVENC
=====================================
.. _nvenc_tutorial:
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use NVIDIA’s hardware video encoder (NVENC)
with TorchAudio, and how it improves the performance of video encoding.
"""
######################################################################
# .. note::
#
# This tutorial requires FFmpeg libraries compiled with HW
# acceleration enabled.
#
# Please refer to
# :ref:`Enabling GPU video decoder/encoder <enabling_hw_decoder>`
# for how to build FFmpeg with HW acceleration.
#
# .. note::
#
#    Most modern GPUs have both HW decoder and encoder, but some
#    high-end GPUs like A100 and H100 do not have HW encoder.
# Please refer to the following for the availability and
# format coverage.
# https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new
#
# Attempting to use HW encoder on these GPUs fails with an error
# message like ``Generic error in an external library``.
# You can enable debug log with
# :py:func:`torchaudio.utils.ffmpeg_utils.set_log_level` to see more
# detailed error messages issued along the way.
#
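#
#    For example, a minimal sketch (assuming the integer follows
#    FFmpeg's log-level convention, where larger values are more
#    verbose and 48 corresponds to ``AV_LOG_DEBUG``; pick a level that
#    suits your needs)::
#
#       from torchaudio.utils import ffmpeg_utils
#
#       ffmpeg_utils.set_log_level(48)
#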
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

import io
import time

import matplotlib.pyplot as plt
from IPython.display import Video
from torchaudio.io import StreamReader, StreamWriter
######################################################################
#
# Check the prerequisites
# -----------------------
#
# First, we check that TorchAudio correctly detects FFmpeg libraries
# that support HW decoder/encoder.
#
from torchaudio.utils import ffmpeg_utils
######################################################################
#
print("FFmpeg Library versions:")
for k, ver in ffmpeg_utils.get_versions().items():
    print(f"{k}:\t{'.'.join(str(v) for v in ver)}")
######################################################################
#
print("Available NVENC Encoders:")
for k in ffmpeg_utils.get_video_encoders().keys():
    if "nvenc" in k:
        print(f" - {k}")
######################################################################
#
print("Available GPU:")
print(torch.cuda.get_device_properties(0))
######################################################################
# We use the following helper function to generate test frame data.
# For the details of synthetic video generation, please refer to
# :ref:`StreamReader Advanced Usage <lavfi>`.
def get_data(height, width, format="yuv444p", frame_rate=30000 / 1001, duration=4):
    src = f"testsrc2=rate={frame_rate}:size={width}x{height}:duration={duration}"
    s = StreamReader(src=src, format="lavfi")
    s.add_basic_video_stream(-1, format=format)
    s.process_all_packets()
    (video,) = s.pop_chunks()
    return video
######################################################################
# Encoding videos with NVENC
# --------------------------
#
# To use the HW video encoder, you need to specify the HW encoder when
# defining the output video stream, by providing the ``encoder`` option
# to :py:meth:`~torchaudio.io.StreamWriter.add_video_stream`.
#
######################################################################
#
pict_config = {
    "height": 360,
    "width": 640,
    "frame_rate": 30000 / 1001,
    "format": "yuv444p",
}

frame_data = get_data(**pict_config)

######################################################################
#
w = StreamWriter(io.BytesIO(), format="mp4")
w.add_video_stream(**pict_config, encoder="h264_nvenc", encoder_format="yuv444p")
with w.open():
    w.write_video_chunk(0, frame_data)
######################################################################
# Similar to the HW decoder, by default, the encoder expects the frame
# data to be on CPU memory. To send data from CUDA memory, you need to
# specify the ``hw_accel`` option.
#
buffer = io.BytesIO()
w = StreamWriter(buffer, format="mp4")
w.add_video_stream(**pict_config, encoder="h264_nvenc", encoder_format="yuv444p", hw_accel="cuda:0")
with w.open():
    w.write_video_chunk(0, frame_data.to(torch.device("cuda:0")))
buffer.seek(0)
video_cuda = buffer.read()
######################################################################
#
Video(video_cuda, embed=True, mimetype="video/mp4")
######################################################################
# Benchmark NVENC with StreamWriter
# ---------------------------------
#
# Now we compare the performance of the software encoder and the
# hardware encoder.
#
# Similar to the NVDEC benchmark, we process videos of different
# resolutions and measure the time it takes to encode them.
#
# We also measure the size of the resulting video file.
######################################################################
# The following function encodes the given frames and measures the
# time it takes to encode them, along with the size of the resulting
# video data.
#


def test_encode(data, encoder, width, height, hw_accel=None, **config):
    assert data.is_cuda

    buffer = io.BytesIO()
    s = StreamWriter(buffer, format="mp4")
    s.add_video_stream(encoder=encoder, width=width, height=height, hw_accel=hw_accel, **config)
    with s.open():
        t0 = time.monotonic()
        if hw_accel is None:
            data = data.to("cpu")
        s.write_video_chunk(0, data)
        elapsed = time.monotonic() - t0
    size = buffer.tell()
    fps = len(data) / elapsed
    print(f" - Processed {len(data)} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    print(f" - Encoded data size: {size} bytes")
    return elapsed, size
######################################################################
# We conduct the tests for the following configurations:
#
# - Software encoder with 1, 4 and 8 threads
# - Hardware encoder, with and without the ``hw_accel`` option
#
def run_tests(height, width, duration=4):
    # Generate the test data
    print(f"Testing resolution: {width}x{height}")
    pict_config = {
        "height": height,
        "width": width,
        "frame_rate": 30000 / 1001,
        "format": "yuv444p",
    }

    data = get_data(**pict_config, duration=duration)
    data = data.to(torch.device("cuda:0"))

    times = []
    sizes = []

    # Test software encoding
    encoder_config = {
        "encoder": "libx264",
        "encoder_format": "yuv444p",
    }
    for i, num_threads in enumerate([1, 4, 8]):
        print(f"* Software Encoder (num_threads={num_threads})")
        time_, size = test_encode(
            data,
            encoder_option={"threads": str(num_threads)},
            **pict_config,
            **encoder_config,
        )
        times.append(time_)
        if i == 0:
            sizes.append(size)

    # Test hardware encoding
    encoder_config = {
        "encoder": "h264_nvenc",
        "encoder_format": "yuv444p",
        "encoder_option": {"gpu": "0"},
    }
    for i, hw_accel in enumerate([None, "cuda"]):
        print(f"* Hardware Encoder {'(CUDA frames)' if hw_accel else ''}")
        time_, size = test_encode(
            data,
            **pict_config,
            **encoder_config,
            hw_accel=hw_accel,
        )
        times.append(time_)
        if i == 0:
            sizes.append(size)
    return times, sizes
######################################################################
# And we change the resolution of videos to see how these measurements
# change.
#
# 360P
# ----
#
time_360, size_360 = run_tests(360, 640)

######################################################################
# 720P
# ----
#
time_720, size_720 = run_tests(720, 1280)

######################################################################
# 1080P
# -----
#
time_1080, size_1080 = run_tests(1080, 1920)
######################################################################
# Now we plot the result.
#


def plot():
    fig, axes = plt.subplots(2, 1, sharex=True, figsize=[9.6, 7.2])

    for items in zip(time_360, time_720, time_1080, "ov^X+"):
        axes[0].plot(items[:-1], marker=items[-1])
    axes[0].grid(axis="both")
    axes[0].set_xticks([0, 1, 2], ["360p", "720p", "1080p"], visible=True)
    axes[0].tick_params(labeltop=False)
    axes[0].legend(
        [
            "Software Encoding (threads=1)",
            "Software Encoding (threads=4)",
            "Software Encoding (threads=8)",
            "Hardware Encoding (CPU Tensor)",
            "Hardware Encoding (CUDA Tensor)",
        ]
    )
    axes[0].set_title("Time to encode videos with different resolutions")
    axes[0].set_ylabel("Time [s]")

    for items in zip(size_360, size_720, size_1080, "v^"):
        axes[1].plot(items[:-1], marker=items[-1])
    axes[1].grid(axis="both")
    axes[1].set_xticks([0, 1, 2], ["360p", "720p", "1080p"])
    axes[1].set_ylabel("The encoded size [bytes]")
    axes[1].set_title("The size of encoded videos")
    axes[1].legend(
        [
            "Software Encoding",
            "Hardware Encoding",
        ]
    )
    plt.tight_layout()


plot()
######################################################################
# Result
# ------
#
# We observe a couple of things:
#
# - The time to encode video grows as the resolution becomes larger.
# - In the case of software encoding, increasing the number of threads
#   helps reduce the encoding time.
# - The gain from extra threads diminishes around 8.
# - Hardware encoding is faster than software encoding in general.
# - Using ``hw_accel`` does not improve the speed of encoding itself
#   as much.
# - The size of the resulting videos grows as the resolution becomes
#   larger.
# - The hardware encoder produces smaller video files at larger
#   resolutions.
#
# The last point is somewhat strange to the author (who is not an
# expert in video production).
# It is often said that hardware encoders produce larger videos
# compared to software encoders.
# Some say that software encoders allow fine-grained control over the
# encoding configuration, so the resulting video is more optimal.
# Meanwhile, hardware encoders are optimized for performance, and thus
# do not provide as much control over quality and binary size.
#
######################################################################
# Quality Spotcheck
# -----------------
#
# So, what is the quality of the videos produced with the hardware
# encoder? A quick spot check reveals that they have more noticeable
# artifacts at higher resolutions, which might explain the smaller
# binary size (that is, the encoder is not allocating enough bits to
# produce quality output).
#
# The following images are raw frames of videos encoded with the
# hardware encoder.
#

######################################################################
# 360P
# ----
#
# .. raw:: html
#
#    <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_360_097.png" alt="NVENC sample 360P">

######################################################################
# 720P
# ----
#
# .. raw:: html
#
#    <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_720_097.png" alt="NVENC sample 720P">

######################################################################
# 1080P
# -----
#
# .. raw:: html
#
#    <img style="max-width: 100%" src="https://download.pytorch.org/torchaudio/tutorial-assets/nvenc_testsrc2_1080_097.png" alt="NVENC sample 1080P">

######################################################################
#
# The artifacts are clearly more noticeable at higher resolutions.
#
# One might be able to reduce them with the ``encoder_option``
# argument, as sketched below.
# We did not try, but if you try that and find a better quality
# setting, feel free to let us know. ;)
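#
# As a starting point, one might pass quality-oriented options to the
# encoder. This is a minimal, unvalidated sketch: ``preset``, ``rc``
# and ``cq`` are options of FFmpeg's ``h264_nvenc`` encoder (see
# ``ffmpeg -h encoder=h264_nvenc``), and the particular values here
# are assumptions for illustration, not settings we have verified::
#
#    w = StreamWriter(io.BytesIO(), format="mp4")
#    w.add_video_stream(
#        **pict_config,
#        encoder="h264_nvenc",
#        encoder_format="yuv444p",
#        encoder_option={"preset": "slow", "rc": "vbr", "cq": "19"},
#    )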
######################################################################
#
# Tag: :obj:`torchaudio.io`
examples/tutorials/online_asr_tutorial.py
View file @
ffeba11a
...
@@ -13,14 +13,11 @@ to perform online speech recognition.
 #
 # .. note::
 #
-#    This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece.
+#    This tutorial requires FFmpeg libraries and SentencePiece.
 #
-#    There are multiple ways to install FFmpeg libraries.
-#    If you are using Anaconda Python distribution,
-#    ``conda install 'ffmpeg<4.4'`` will install
-#    the required FFmpeg libraries.
+#    Please refer to :ref:`Optional Dependencies <optional_dependencies>`
+#    for the detail.
 #
-#    You can install SentencePiece by running ``pip install sentencepiece``.
 
 ######################################################################
 # 1. Overview
...
@@ -45,29 +42,9 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)
 
-try:
-    from torchaudio.io import StreamReader
-except ModuleNotFoundError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install the requisite
-            third party libraries by running the following code block:
-
-            !add-apt-repository -y ppa:savoury1/ffmpeg4
-            !apt-get -qq install -y ffmpeg
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise
-
+######################################################################
+#
 import IPython
+import matplotlib.pyplot as plt
 
+from torchaudio.io import StreamReader
 
 ######################################################################
 # 3. Construct the pipeline
...
@@ -195,22 +172,43 @@ state, hypothesis = None, None
 stream_iterator = streamer.stream()
 
 
+def _plot(feats, num_iter, unit=25):
+    unit_dur = segment_length / sample_rate * unit
+    num_plots = num_iter // unit + (1 if num_iter % unit else 0)
+    fig, axes = plt.subplots(num_plots, 1)
+    t0 = 0
+    for i, ax in enumerate(axes):
+        feats_ = feats[i * unit : (i + 1) * unit]
+        t1 = t0 + segment_length / sample_rate * len(feats_)
+        feats_ = torch.cat([f[2:-2] for f in feats_])  # remove boundary effect and overlap
+        ax.imshow(feats_.T, extent=[t0, t1, 0, 1], aspect="auto", origin="lower")
+        ax.tick_params(which="both", left=False, labelleft=False)
+        ax.set_xlim(t0, t0 + unit_dur)
+        t0 = t1
+    fig.suptitle("MelSpectrogram Feature")
+    plt.tight_layout()
+
+
 @torch.inference_mode()
-def run_inference(num_iter=200):
+def run_inference(num_iter=100):
     global state, hypothesis
     chunks = []
+    feats = []
     for i, (chunk,) in enumerate(stream_iterator, start=1):
         segment = cacher(chunk[:, 0])
         features, length = feature_extractor(segment)
         hypos, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
-        hypothesis = hypos[0]
-        transcript = token_processor(hypothesis[0], lstrip=False)
-        print(transcript, end="", flush=True)
+        hypothesis = hypos
+        transcript = token_processor(hypos[0][0], lstrip=False)
+        print(transcript, end="\r", flush=True)
 
         chunks.append(chunk)
+        feats.append(features)
         if i == num_iter:
             break
+
+    # Plot the features
+    _plot(feats, num_iter)
     return IPython.display.Audio(torch.cat(chunks).T.numpy(), rate=bundle.sample_rate)
...
@@ -249,6 +247,36 @@ run_inference()
 run_inference()
 
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
+######################################################################
+#
+run_inference()
+
 ######################################################################
 #
 # Tag: :obj:`torchaudio.io`
examples/tutorials/speech_recognition_pipeline_tutorial.py
View file @
ffeba11a
...
@@ -160,8 +160,7 @@ for i, feats in enumerate(features):
     ax[i].set_title(f"Feature from transformer layer {i + 1}")
     ax[i].set_xlabel("Feature dimension")
     ax[i].set_ylabel("Frame (time-axis)")
-plt.tight_layout()
-plt.show()
+fig.tight_layout()
 
 ######################################################################
...
@@ -190,7 +189,7 @@ plt.imshow(emission[0].cpu().T, interpolation="nearest")
 plt.title("Classification result")
 plt.xlabel("Frame (time-axis)")
 plt.ylabel("Class")
-plt.show()
+plt.tight_layout()
 
 print("Class labels:", bundle.get_labels())
...
examples/tutorials/squim_tutorial.py
0 → 100644
View file @
ffeba11a
"""
Torchaudio-Squim: Non-intrusive Speech Assessment in TorchAudio
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
######################################################################
# Author: `Anurag Kumar <anuragkr90@meta.com>`__, `Zhaoheng
# Ni <zni@meta.com>`__
#
######################################################################
# 1. Overview
# ^^^^^^^^^^^
#
######################################################################
# This tutorial shows how to use Torchaudio-Squim to estimate objective and
# subjective metrics for assessment of speech quality and intelligibility.
#
# TorchAudio-Squim enables speech assessment in Torchaudio. It provides
# interfaces and pre-trained models to estimate various speech quality and
# intelligibility metrics. Currently, Torchaudio-Squim [1] supports
# reference-free estimation of 3 widely used objective metrics:
#
# - Wideband Perceptual Estimation of Speech Quality (PESQ) [2]
#
# - Short-Time Objective Intelligibility (STOI) [3]
#
# - Scale-Invariant Signal-to-Distortion Ratio (SI-SDR) [4]
#
# It also supports estimation of subjective Mean Opinion Score (MOS) for a
# given audio waveform using Non-Matching References [1, 5].
#
# **References**
#
# [1] Kumar, Anurag, et al. “TorchAudio-Squim: Reference-less Speech
# Quality and Intelligibility measures in TorchAudio.” ICASSP 2023-2023
# IEEE International Conference on Acoustics, Speech and Signal Processing
# (ICASSP). IEEE, 2023.
#
# [2] I. Rec, “P.862.2: Wideband extension to recommendation P.862 for the
# assessment of wideband telephone networks and speech codecs,”
# International Telecommunication Union, CH–Geneva, 2005.
#
# [3] Taal, C. H., Hendriks, R. C., Heusdens, R., & Jensen, J. (2010,
# March). A short-time objective intelligibility measure for
# time-frequency weighted noisy speech. In 2010 IEEE international
# conference on acoustics, speech and signal processing (pp. 4214-4217).
# IEEE.
#
# [4] Le Roux, Jonathan, et al. “SDR–half-baked or well done?.” ICASSP
# 2019-2019 IEEE International Conference on Acoustics, Speech and Signal
# Processing (ICASSP). IEEE, 2019.
#
# [5] Manocha, Pranay, and Anurag Kumar. “Speech quality assessment
# through MOS using non-matching references.” Interspeech, 2022.
#
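#
# As a point of reference for the SI-SDR estimates used below, a
# standard definition (following [4]) is
#
# .. math::
#
#    \text{SI-SDR} = 10 \log_{10}
#    \frac{\| \alpha s \|^2}{\| \alpha s - \hat{s} \|^2},
#    \qquad
#    \alpha = \frac{\langle \hat{s}, s \rangle}{\| s \|^2}
#
# where :math:`s` is the reference signal and :math:`\hat{s}` is the
# estimate; the ``si_snr`` helper defined later implements this.
#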
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)
######################################################################
# 2. Preparation
# ^^^^^^^^^^^^^^
#
# First import the modules and define the helper functions.
#
# We will need torch and torchaudio to use Torchaudio-Squim, Matplotlib
# to plot data, and pystoi and pesq for computing reference metrics.
#
try:
    from pesq import pesq
    from pystoi import stoi
    from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
except ImportError:
    try:
        import google.colab  # noqa: F401

        print(
            """
            To enable running this notebook in Google Colab, install nightly
            torch and torchaudio builds by adding the following code block to the top
            of the notebook before running it:

            !pip3 uninstall -y torch torchvision torchaudio
            !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
            !pip3 install pesq
            !pip3 install pystoi
            """
        )
    except Exception:
        pass
    raise

import matplotlib.pyplot as plt
######################################################################
#
#
import torchaudio.functional as F
from IPython.display import Audio
from torchaudio.utils import download_asset


def si_snr(estimate, reference, epsilon=1e-8):
    estimate = estimate - estimate.mean()
    reference = reference - reference.mean()
    reference_pow = reference.pow(2).mean(axis=1, keepdim=True)
    mix_pow = (estimate * reference).mean(axis=1, keepdim=True)
    scale = mix_pow / (reference_pow + epsilon)

    reference = scale * reference
    error = estimate - reference

    reference_pow = reference.pow(2)
    error_pow = error.pow(2)

    reference_pow = reference_pow.mean(axis=1)
    error_pow = error_pow.mean(axis=1)

    si_snr = 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow)
    return si_snr.item()
def plot(waveform, title, sample_rate=16000):
    wav_numpy = waveform.numpy()

    sample_size = waveform.shape[1]
    time_axis = torch.arange(0, sample_size) / sample_rate

    figure, axes = plt.subplots(2, 1)
    axes[0].plot(time_axis, wav_numpy[0], linewidth=1)
    axes[0].grid(True)
    axes[1].specgram(wav_numpy[0], Fs=sample_rate)
    figure.suptitle(title)
######################################################################
# 3. Load Speech and Noise Sample
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav")

######################################################################
#
#
WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(SAMPLE_SPEECH)
WAVEFORM_NOISE, SAMPLE_RATE_NOISE = torchaudio.load(SAMPLE_NOISE)
WAVEFORM_NOISE = WAVEFORM_NOISE[0:1, :]
######################################################################
# Currently, the Torchaudio-Squim model only supports a 16000 Hz
# sampling rate. Resample the waveforms if necessary.
#
if SAMPLE_RATE_SPEECH != 16000:
    WAVEFORM_SPEECH = F.resample(WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH, 16000)

if SAMPLE_RATE_NOISE != 16000:
    WAVEFORM_NOISE = F.resample(WAVEFORM_NOISE, SAMPLE_RATE_NOISE, 16000)
######################################################################
# Trim waveforms so that they have the same number of frames.
#
if WAVEFORM_SPEECH.shape[1] < WAVEFORM_NOISE.shape[1]:
    WAVEFORM_NOISE = WAVEFORM_NOISE[:, : WAVEFORM_SPEECH.shape[1]]
else:
    WAVEFORM_SPEECH = WAVEFORM_SPEECH[:, : WAVEFORM_NOISE.shape[1]]
######################################################################
# Play speech sample
#
Audio(WAVEFORM_SPEECH.numpy()[0], rate=16000)

######################################################################
# Play noise sample
#
Audio(WAVEFORM_NOISE.numpy()[0], rate=16000)
######################################################################
# 4. Create distorted (noisy) speech samples
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
snr_dbs = torch.tensor([20, -5])
WAVEFORM_DISTORTED = F.add_noise(WAVEFORM_SPEECH, WAVEFORM_NOISE, snr_dbs)

######################################################################
# Play distorted speech with 20dB SNR
#
Audio(WAVEFORM_DISTORTED.numpy()[0], rate=16000)

######################################################################
# Play distorted speech with -5dB SNR
#
Audio(WAVEFORM_DISTORTED.numpy()[1], rate=16000)
######################################################################
# 5. Visualize the waveforms
# ^^^^^^^^^^^^^^^^^^^^^^^^^^
#

######################################################################
# Visualize speech sample
#
plot(WAVEFORM_SPEECH, "Clean Speech")

######################################################################
# Visualize noise sample
#
plot(WAVEFORM_NOISE, "Noise")

######################################################################
# Visualize distorted speech with 20dB SNR
#
plot(WAVEFORM_DISTORTED[0:1], f"Distorted Speech with {snr_dbs[0]}dB SNR")

######################################################################
# Visualize distorted speech with -5dB SNR
#
plot(WAVEFORM_DISTORTED[1:2], f"Distorted Speech with {snr_dbs[1]}dB SNR")
######################################################################
# 6. Predict Objective Metrics
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#

######################################################################
# Get the pre-trained ``SquimObjective`` model.
#
objective_model = SQUIM_OBJECTIVE.get_model()

######################################################################
# Compare model outputs with ground truths for distorted speech with
# 20dB SNR
#
stoi_hyp, pesq_hyp, si_sdr_hyp = objective_model(WAVEFORM_DISTORTED[0:1, :])
print(f"Estimated metrics for distorted speech at {snr_dbs[0]}dB are\n")
print(f"STOI: {stoi_hyp[0]}")
print(f"PESQ: {pesq_hyp[0]}")
print(f"SI-SDR: {si_sdr_hyp[0]}\n")

pesq_ref = pesq(16000, WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[0].numpy(), mode="wb")
stoi_ref = stoi(WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[0].numpy(), 16000, extended=False)
si_sdr_ref = si_snr(WAVEFORM_DISTORTED[0:1], WAVEFORM_SPEECH)
print(f"Reference metrics for distorted speech at {snr_dbs[0]}dB are\n")
print(f"STOI: {stoi_ref}")
print(f"PESQ: {pesq_ref}")
print(f"SI-SDR: {si_sdr_ref}")
######################################################################
# Compare model outputs with ground truths for distorted speech with
# -5dB SNR
#
stoi_hyp, pesq_hyp, si_sdr_hyp = objective_model(WAVEFORM_DISTORTED[1:2, :])
print(f"Estimated metrics for distorted speech at {snr_dbs[1]}dB are\n")
print(f"STOI: {stoi_hyp[0]}")
print(f"PESQ: {pesq_hyp[0]}")
print(f"SI-SDR: {si_sdr_hyp[0]}\n")

pesq_ref = pesq(16000, WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[1].numpy(), mode="wb")
stoi_ref = stoi(WAVEFORM_SPEECH[0].numpy(), WAVEFORM_DISTORTED[1].numpy(), 16000, extended=False)
si_sdr_ref = si_snr(WAVEFORM_DISTORTED[1:2], WAVEFORM_SPEECH)
print(f"Reference metrics for distorted speech at {snr_dbs[1]}dB are\n")
print(f"STOI: {stoi_ref}")
print(f"PESQ: {pesq_ref}")
print(f"SI-SDR: {si_sdr_ref}")
######################################################################
# 7. Predict Mean Opinion Scores (Subjective) Metric
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#

######################################################################
# Get the pre-trained ``SquimSubjective`` model.
#
subjective_model = SQUIM_SUBJECTIVE.get_model()

######################################################################
# Load a non-matching reference (NMR)
#
NMR_SPEECH = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")

WAVEFORM_NMR, SAMPLE_RATE_NMR = torchaudio.load(NMR_SPEECH)
if SAMPLE_RATE_NMR != 16000:
    WAVEFORM_NMR = F.resample(WAVEFORM_NMR, SAMPLE_RATE_NMR, 16000)

######################################################################
# Compute MOS metric for distorted speech with 20dB SNR
#
mos = subjective_model(WAVEFORM_DISTORTED[0:1, :], WAVEFORM_NMR)
print(f"Estimated MOS for distorted speech at {snr_dbs[0]}dB is MOS: {mos[0]}")

######################################################################
# Compute MOS metric for distorted speech with -5dB SNR
#
mos = subjective_model(WAVEFORM_DISTORTED[1:2, :], WAVEFORM_NMR)
print(f"Estimated MOS for distorted speech at {snr_dbs[1]}dB is MOS: {mos[0]}")
######################################################################
# 8. Comparison with ground truths and baselines
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Visualizing the metrics estimated by the ``SquimObjective`` and
# ``SquimSubjective`` models can help users better understand how the
# models can be applied in real scenarios. The graph below shows scatter
# plots of three different systems: MOSA-Net [1], AMSA [2], and the
# ``SquimObjective`` model, where y axis represents the estimated STOI,
# PESQ, and Si-SDR scores, and x axis represents the corresponding ground
# truth.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/objective_plot.png
# :width: 500px
# :align: center
#
# [1] Zezario, Ryandhimas E., Szu-Wei Fu, Fei Chen, Chiou-Shann Fuh,
# Hsin-Min Wang, and Yu Tsao. “Deep learning-based non-intrusive
# multi-objective speech assessment model with cross-domain features.”
# IEEE/ACM Transactions on Audio, Speech, and Language Processing 31
# (2022): 54-70.
#
# [2] Dong, Xuan, and Donald S. Williamson. “An attention enhanced
# multi-task model for objective speech assessment in real-world
# environments.” In ICASSP 2020-2020 IEEE International Conference on
# Acoustics, Speech and Signal Processing (ICASSP), pp. 911-915. IEEE,
# 2020.
#
######################################################################
# The graph below shows scatter plot of the ``SquimSubjective`` model,
# where y axis represents the estimated MOS metric score, and x axis
# represents the corresponding ground truth.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/subjective_plot.png
# :width: 500px
# :align: center
#
examples/tutorials/streamreader_advanced_tutorial.py
View file @
ffeba11a
...
@@ -20,35 +20,15 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)
 
-try:
-    from torchaudio.io import StreamReader
-except ModuleNotFoundError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install the requisite
-            third party libraries by running the following code:
-
-            !add-apt-repository -y ppa:savoury1/ffmpeg4
-            !apt-get -qq install -y ffmpeg
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise
-
+######################################################################
+#
 import IPython
 import matplotlib.pyplot as plt
 
+from torchaudio.io import StreamReader
+
 base_url = "https://download.pytorch.org/torchaudio/tutorial-assets"
 AUDIO_URL = f"{base_url}/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
 VIDEO_URL = f"{base_url}/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4.mp4"
 
 ######################################################################
 # Audio / Video device input
 # --------------------------
...
@@ -122,6 +102,9 @@ VIDEO_URL = f"{base_url}/stream-api/NASAs_Most_Scientifically_Complex_Space_Obse
 #
 
 ######################################################################
+#
+# .. _lavfi:
+#
 # Synthetic source streams
 # ------------------------
 #
...
@@ -372,13 +355,14 @@ chunks = next(streamer.stream())
 def _display(i):
     print("filter_desc:", streamer.get_out_stream_info(i).filter_description)
-    _, axs = plt.subplots(2, 1)
+    fig, axs = plt.subplots(2, 1)
     waveform = chunks[i][:, 0]
     axs[0].plot(waveform)
     axs[0].grid(True)
     axs[0].set_ylim([-1, 1])
     plt.setp(axs[0].get_xticklabels(), visible=False)
     axs[1].specgram(waveform, Fs=sample_rate)
+    fig.tight_layout()
     return IPython.display.Audio(chunks[i].T, rate=sample_rate)
...
@@ -457,7 +441,6 @@ def _display(i):
         axs[j].imshow(chunk[10 * j + 1].permute(1, 2, 0))
         axs[j].set_axis_off()
     plt.tight_layout()
-    plt.show(block=False)
 
 ######################################################################
...
examples/tutorials/streamreader_basic_tutorial.py
View file @
ffeba11a
...
@@ -14,12 +14,9 @@ libavfilter provides.
 #
 # .. note::
 #
-#    This tutorial requires FFmpeg libraries (>=4.1, <4.4).
+#    This tutorial requires FFmpeg libraries.
 #
-#    There are multiple ways to install FFmpeg libraries.
-#    If you are using Anaconda Python distribution,
-#    ``conda install -c anaconda 'ffmpeg<4.4'`` will install
-#    the required libraries.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
 #
 
 ######################################################################
...
@@ -65,29 +62,8 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)
 
-try:
-    from torchaudio.io import StreamReader
-except ModuleNotFoundError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install the requisite
-            third party libraries by running the following code:
-
-            !add-apt-repository -y ppa:savoury1/ffmpeg4
-            !apt-get -qq install -y ffmpeg
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise
-
+######################################################################
+#
 import matplotlib.pyplot as plt
 
+from torchaudio.io import StreamReader
+
 base_url = "https://download.pytorch.org/torchaudio/tutorial-assets"
 AUDIO_URL = f"{base_url}/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
...
@@ -613,7 +589,6 @@ for i, vid in enumerate(vids2):
         if i == 0 and j == 0:
             ax.set_ylabel("Stream 2")
 plt.tight_layout()
-plt.show(block=False)
 
 ######################################################################
 #
...
examples/tutorials/streamwriter_advanced.py
View file @
ffeba11a
...
@@ -23,17 +23,9 @@ play audio and video.
 #
 # .. note::
 #
-#    This tutorial requires torchaudio nightly build and FFmpeg libraries (>=4.1, <4.4).
+#    This tutorial requires FFmpeg libraries.
 #
-#    To install torchaudio nightly build, please refer to
-#    https://pytorch.org/get-started/locally/ .
-#
-#    There are multiple ways to install FFmpeg libraries.
-#    If you are using Anaconda Python distribution,
-#    ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries,
-#    however, this distribution does not have SDL plugin, so it cannot play
-#    video.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
 #
 
 ######################################################################
...
@@ -74,7 +66,9 @@ from torchaudio.io import StreamWriter
 from torchaudio.utils import download_asset
 
 AUDIO_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
-VIDEO_PATH = download_asset("tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4")
+VIDEO_PATH = download_asset(
+    "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
+)
 
 ######################################################################
 #
...
@@ -140,7 +134,7 @@ s.add_audio_stream(sample_rate, num_channels, format="s16")
 # Write audio to the device
 with s.open():
     for i in range(0, num_frames, 256):
-        s.write_audio_chunk(0, waveform[i:i + 256])
+        s.write_audio_chunk(0, waveform[i : i + 256])
 
 ######################################################################
 #
...
@@ -186,8 +180,12 @@ width, height = 640, 360
 # a background thread and give chunks
 
 running = True
 
 
 def video_streamer(path, frames_per_chunk):
-    import queue, threading
+    import queue
+    import threading
 
     from torchaudio.io import StreamReader
 
     q = queue.Queue()
...
@@ -196,9 +194,9 @@ def video_streamer(path, frames_per_chunk):
     def _streamer():
         streamer = StreamReader(path)
         streamer.add_basic_video_stream(
             frames_per_chunk, format="rgb24",
             frame_rate=frame_rate, width=width, height=height
         )
-        for (chunk_, ) in streamer.stream():
+        for (chunk_,) in streamer.stream():
             q.put(chunk_)
             if not running:
                 break
...
@@ -246,7 +244,7 @@ with s.open():
 #    <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-sdl-demo.mp4">
 #    </video>
 #
-# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/sdl.py>`_]
+# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/sdl.py>`__]
 #
 
 ######################################################################
...
@@ -292,7 +290,7 @@ with s.open():
 #    <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-rtmp-demo.mp4">
 #    </video>
 #
-# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/rtmp.py>`_]
+# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/rtmp.py>`__]
 #
...
@@ -324,7 +322,7 @@ with s.open():
 #    <source src="https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio-udp-demo.mp4">
 #    </video>
 #
-# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/udp.py>`_]
+# [`code <https://download.pytorch.org/torchaudio/tutorial-assets/udp.py>`__]
 #
 
 ######################################################################
...
examples/tutorials/streamwriter_basic_tutorial.py
View file @
ffeba11a
...
@@ -13,14 +13,9 @@ encode and save audio/video data into various formats/destinations.
 #
 # .. note::
 #
-#    This tutorial requires torchaudio nightly build and FFmpeg libraries (>=4.1, <4.4).
+#    This tutorial requires FFmpeg libraries.
 #
-#    To install torchaudio nightly build, please refer to
-#    https://pytorch.org/get-started/locally/ .
-#
-#    There are multiple ways to install FFmpeg libraries.
-#    If you are using Anaconda Python distribution,
-#    ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
 #
 
 ######################################################################
...
@@ -51,27 +46,7 @@ import torchaudio
 print(torch.__version__)
 print(torchaudio.__version__)
 
-try:
-    from torchaudio.io import StreamWriter
-except ImportError:
-    try:
-        import google.colab
-
-        print(
-            """
-            To enable running this notebook in Google Colab, install nightly
-            torch and torchaudio builds by adding the following code block to the top
-            of the notebook before running it:
-
-            !pip3 uninstall -y torch torchvision torchaudio
-            !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-            """
-        )
-    except ModuleNotFoundError:
-        pass
-    raise
+######################################################################
+#
+from torchaudio.io import StreamWriter
 
 print("FFmpeg library versions")
 for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items():
...
@@ -84,9 +59,10 @@ import io
 import os
 import tempfile
 
-from torchaudio.utils import download_asset
 from IPython.display import Audio, Video
+from torchaudio.utils import download_asset
 
 SAMPLE_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
 WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_PATH, channels_first=False)
 NUM_FRAMES, NUM_CHANNELS = WAVEFORM.shape
...
@@ -503,47 +479,7 @@ print(f"{bytes2[:10]}...{bytes2[-10:]}\n")
 assert bytes1 == bytes2
 
 ######################################################################
-# Note on slicing and AAC
-# ~~~~~~~~~~~~~~~~~~~~~~~
-#
-# .. warning::
-#
-#    FFmpeg's native AAC encoder (which is used by default when
-#    saving video with MP4 format) has a bug that affects the audibility.
-#
-#    Please refer to the examples below.
-#
-
-
-def test_slice(audio_encoder, slice_size, ext="mp4"):
-    path = get_path(f"slice_{slice_size}.{ext}")
-    s = StreamWriter(dst=path)
-    s.add_audio_stream(SAMPLE_RATE, NUM_CHANNELS, encoder=audio_encoder)
-    with s.open():
-        for start in range(0, NUM_FRAMES, slice_size):
-            end = start + slice_size
-            s.write_audio_chunk(0, WAVEFORM[start:end, ...])
-    return path
-
-
-######################################################################
-#
-# This causes some artifacts.
-# note:
-# Chrome does not support playing AAC audio directly while Safari does.
-# Using MP4 container and specifying AAC allows Chrome to play it.
-Video(test_slice(audio_encoder="aac", slice_size=8000, ext="mp4"), embed=True)
-
-######################################################################
-#
-# It is more noticeable when using smaller slice.
-Video(test_slice(audio_encoder="aac", slice_size=512, ext="mp4"), embed=True)
-
-######################################################################
-#
-# Lame MP3 encoder works fine for the same slice size.
-Audio(test_slice(audio_encoder="libmp3lame", slice_size=512, ext="mp3"))
+import matplotlib.pyplot as plt
 
 ######################################################################
 #
...
@@ -559,7 +495,6 @@ Audio(test_slice(audio_encoder="libmp3lame", slice_size=512, ext="mp3"))
 # then use StreamWriter to convert them to video with the original audio.
 
 import torchaudio.transforms as T
-import matplotlib.pyplot as plt
 
 ######################################################################
 #
...
@@ -590,7 +525,7 @@ specs = trans(WAVEFORM.T)[0].T
 #
 spec_db = T.AmplitudeToDB(stype="magnitude", top_db=80)(specs.T)
-_ = plt.imshow(spec_db, aspect="auto", origin='lower')
+_ = plt.imshow(spec_db, aspect="auto", origin="lower")
 
 ######################################################################
 #
...
@@ -611,21 +546,27 @@ ncols, nrows = fig.canvas.get_width_height()
 def _plot(data):
     ax.clear()
     x = list(range(len(data)))
     R, G, B = 238 / 255, 76 / 255, 44 / 255
     for coeff, alpha in [(0.8, 0.7), (1, 1)]:
         d = data**coeff
         ax.fill_between(x, d, -d, color=[R, G, B, alpha])
     xlim = n_fft // 2 + 1
     ax.set_xlim([-1, n_fft // 2 + 1])
     ax.set_ylim([-1, 1])
     ax.text(
         xlim,
         0.95,
         f"Created with TorchAudio\n{torchaudio.__version__}",
-        color="white", ha="right", va="top", backgroundcolor="black")
+        color="white",
+        ha="right",
+        va="top",
+        backgroundcolor="black",
+    )
     fig.canvas.draw()
     frame = torch.frombuffer(fig.canvas.tostring_rgb(), dtype=torch.uint8)
     return frame.reshape(nrows, ncols, 3).permute(2, 0, 1)
 
 
 # sphinx_gallery_defer_figures

######################################################################
...
@@ -646,10 +587,10 @@ with s.open():
     # Process by second
     for t in range(0, NUM_FRAMES, SAMPLE_RATE):
         # Write audio chunk
         s.write_audio_chunk(0, WAVEFORM[t : t + SAMPLE_RATE, :])
 
         # write 1 second of video chunk
-        frames = [_plot(spec) for spec in specs[i:i + frame_rate]]
+        frames = [_plot(spec) for spec in specs[i : i + frame_rate]]
         if frames:
             s.write_video_chunk(1, torch.stack(frames))
         i += frame_rate
...
examples/tutorials/tacotron2_pipeline_tutorial.py
View file @
ffeba11a
...
@@ -7,10 +7,6 @@ Text-to-Speech with Tacotron2
 """
 
-import IPython
-import matplotlib
-import matplotlib.pyplot as plt
-
 ######################################################################
 # Overview
 # --------
...
@@ -65,8 +61,6 @@ import matplotlib.pyplot as plt
 import torch
 import torchaudio
 
-matplotlib.rcParams["figure.figsize"] = [16.0, 4.8]
-
 torch.random.manual_seed(0)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
...
@@ -75,6 +69,13 @@ print(torchaudio.__version__)
 print(device)
 
+
+######################################################################
+#
+import IPython
+import matplotlib.pyplot as plt
+
 ######################################################################
 # Text Processing
 # ---------------
...
@@ -218,7 +219,7 @@ with torch.inference_mode():
     spec, _, _ = tacotron2.infer(processed, lengths)
 
-_ = plt.imshow(spec[0].cpu().detach())
+_ = plt.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
 
 ######################################################################
...
@@ -226,13 +227,17 @@ _ = plt.imshow(spec[0].cpu().detach())
 # therefore, the process of generating the spectrogram incurs randomness.
 #
 
-fig, ax = plt.subplots(3, 1, figsize=(16, 4.3 * 3))
-for i in range(3):
-    with torch.inference_mode():
-        spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
-    print(spec[0].shape)
-    ax[i].imshow(spec[0].cpu().detach())
-plt.show()
+
+def plot():
+    fig, ax = plt.subplots(3, 1)
+    for i in range(3):
+        with torch.inference_mode():
+            spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
+        print(spec[0].shape)
+        ax[i].imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
+
+
+plot()
 
 ######################################################################
...
@@ -270,11 +275,22 @@ with torch.inference_mode():
     spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
     waveforms, lengths = vocoder(spec, spec_lengths)
 
-fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
-ax1.imshow(spec[0].cpu().detach())
-ax2.plot(waveforms[0].cpu().detach())
-
-IPython.display.Audio(waveforms[0:1].cpu(), rate=vocoder.sample_rate)
+
+######################################################################
+#
+def plot(waveforms, spec, sample_rate):
+    waveforms = waveforms.cpu().detach()
+
+    fig, [ax1, ax2] = plt.subplots(2, 1)
+    ax1.plot(waveforms[0])
+    ax1.set_xlim(0, waveforms.size(-1))
+    ax1.grid(True)
+    ax2.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
+    return IPython.display.Audio(waveforms[0:1], rate=sample_rate)
+
+
+plot(waveforms, spec, vocoder.sample_rate)
 
 ######################################################################
...
@@ -300,11 +316,10 @@ with torch.inference_mode():
     spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
     waveforms, lengths = vocoder(spec, spec_lengths)
 
-fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
-ax1.imshow(spec[0].cpu().detach())
-ax2.plot(waveforms[0].cpu().detach())
-IPython.display.Audio(waveforms[0:1].cpu(), rate=vocoder.sample_rate)
+######################################################################
+#
+plot(waveforms, spec, vocoder.sample_rate)
 
 ######################################################################
...
@@ -339,8 +354,7 @@ waveglow.eval()
 with torch.no_grad():
     waveforms = waveglow.infer(spec)
 
-fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
-ax1.imshow(spec[0].cpu().detach())
-ax2.plot(waveforms[0].cpu().detach())
-IPython.display.Audio(waveforms[0:1].cpu(), rate=22050)
+######################################################################
+#
+plot(waveforms, spec, 22050)
packaging/torchaudio/meta.yaml
View file @
ffeba11a
...
@@ -14,11 +14,8 @@ requirements:
   host:
     - python
     - setuptools
-    - pkg-config  # [not win]
     - cmake
    - ninja
-    - numpy>=1.11    # [py <= 39]
-    - numpy>=1.21.2  # [py >= 310]
     - pytorch-mutex 1.0 {{ build_variant }}  # [not osx ]
     {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT', 'pytorch') }}
     {{ environ.get('CONDA_EXTRA_BUILD_CONSTRAINT', '') }}
...
@@ -26,8 +23,7 @@ requirements:
   run:
     - python
-    - numpy>=1.11    # [py <= 39]
-    - numpy>=1.21.2  # [py >= 310]
+    - numpy
     - pytorch-mutex 1.0 {{ build_variant }}  # [not osx ]
     {{ environ.get('CONDA_PYTORCH_CONSTRAINT', 'pytorch') }}
     {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }}
...
@@ -49,7 +45,6 @@ build:
     - TORCH_CUDA_ARCH_LIST
     - USE_FFMPEG
     - USE_OPENMP
-    - FFMPEG_ROOT
     - MACOSX_DEPLOYMENT_TARGET
 
 test:
...
packaging/vs2019/conda_build_config.yaml
View file @ ffeba11a

 blas_impl:
   - mkl        # [x86_64]
 c_compiler:
   - vs2019     # [win]
 cxx_compiler:
   - vs2019     # [win]
 python:
-  - 3.7
+  - 3.8
 # This differs from target_platform in that it determines what subdir the compiler
 # will target, not what subdir the compiler package will be itself.
 # For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
...
packaging/windows/internal/cuda_install.bat
View file @ ffeba11a

@@ -23,25 +23,25 @@ set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR%
 set CUDNN_FOLDER="cuda"
 set CUDNN_LIB_FOLDER="lib\x64"
 
-if %CUDA_VER% EQU 116 goto cuda116
+if %CUDA_VER% EQU 118 goto cuda118
-if %CUDA_VER% EQU 117 goto cuda117
+if %CUDA_VER% EQU 121 goto cuda121
 
 echo CUDA %CUDA_VERSION_STR% is not supported
 exit /b 1
 
-:cuda116
+:cuda118
-set CUDA_INSTALL_EXE=cuda_11.6.0_511.23_windows.exe
+set CUDA_INSTALL_EXE=cuda_11.8.0_522.06_windows.exe
 if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
     if errorlevel 1 exit /b 1
     set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 nvml_dev_11.6"
+    set "ARGS=cuda_profiler_api_11.8 thrust_11.8 nvcc_11.8 cuobjdump_11.8 nvprune_11.8 nvprof_11.8 cupti_11.8 cublas_11.8 cublas_dev_11.8 cudart_11.8 cufft_11.8 cufft_dev_11.8 curand_11.8 curand_dev_11.8 cusolver_11.8 cusolver_dev_11.8 cusparse_11.8 cusparse_dev_11.8 npp_11.8 npp_dev_11.8 nvrtc_11.8 nvrtc_dev_11.8 nvml_dev_11.8"
 )
 
-set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip
-set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive
+set CUDNN_FOLDER=cudnn-windows-x86_64-8.7.0.84_cuda11-archive
 set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
     curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
     if errorlevel 1 exit /b 1
...
@@ -55,23 +55,23 @@ if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
 goto cuda_common
 
-:cuda117
+:cuda121
-set CUDA_INSTALL_EXE=cuda_11.7.0_516.01_windows.exe
+set CUDA_INSTALL_EXE=cuda_12.1.1_531.14_windows.exe
 if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
     curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
     if errorlevel 1 exit /b 1
     set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=thrust_11.7 nvcc_11.7 cuobjdump_11.7 nvprune_11.7 nvprof_11.7 cupti_11.7 cublas_11.7 cublas_dev_11.7 cudart_11.7 cufft_11.7 cufft_dev_11.7 curand_11.7 curand_dev_11.7 cusolver_11.7 cusolver_dev_11.7 cusparse_11.7 cusparse_dev_11.7 npp_11.7 npp_dev_11.7 nvrtc_11.7 nvrtc_dev_11.7 nvml_dev_11.7"
+    set "ARGS=cuda_profiler_api_12.1 thrust_12.1 nvcc_12.1 cuobjdump_12.1 nvprune_12.1 nvprof_12.1 cupti_12.1 cublas_12.1 cublas_dev_12.1 cudart_12.1 cufft_12.1 cufft_dev_12.1 curand_12.1 curand_dev_12.1 cusolver_12.1 cusolver_dev_12.1 cusparse_12.1 cusparse_dev_12.1 npp_12.1 npp_dev_12.1 nvrtc_12.1 nvrtc_dev_12.1 nvml_dev_12.1"
 )
 
-set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip
-set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive
+set CUDNN_FOLDER=cudnn-windows-x86_64-8.8.1.3_cuda12-archive
 set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
 if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
     curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
     if errorlevel 1 exit /b 1
 set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
 
 rem Make sure windows path contains zlib dll
 curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
...
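
These hunks move the Windows CI toolchain from CUDA 11.6/11.7 with cuDNN 8.3.2 to CUDA 11.8/12.1 with cuDNN 8.7.0/8.8.1 (note that 11.8 also needs the new `cuda_profiler_api` component). As a hedged sanity check, a build produced against the upgraded toolkit can be inspected from Python; the printed values in the comments are what one would expect after this change, not captured output:

import torch

# CUDA toolkit the wheel was compiled against, e.g. "11.8" or "12.1"
print(torch.version.cuda)
# cuDNN version as an integer, e.g. 8700 for 8.7.0 or 8801 for 8.8.1
print(torch.backends.cudnn.version())
# Whether a GPU is usable at runtime; requires a driver at least as new as
# the ones bundled in the installers above (522.06 / 531.14)
print(torch.cuda.is_available())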
pyproject.toml
View file @ ffeba11a

@@ -5,6 +5,9 @@ first_party_detection = false
 [tool.black]
 line-length = 120
-target-version = ["py37"]
+target-version = ["py38"]
 
 [tool.ufmt]
+excludes = [
+    "examples/tutorials/",
+]
setup.py
View file @ ffeba11a

@@ -4,7 +4,6 @@ import os
 import re
 import shutil
 import subprocess
-import sys
 from pathlib import Path
 
 import torch
...
@@ -22,13 +21,14 @@ def _run_cmd(cmd, shell=False):
     return None
 
 
 def _get_version(sha):
     with open(ROOT_DIR / "version.txt", "r") as f:
         version = f.read().strip()
     if os.getenv("BUILD_VERSION"):
         version = os.getenv("BUILD_VERSION")
     elif sha is not None:
-        version += "+" + sha[:7]
+        version += "+das" + "." + "opt1"
     return version
...
@@ -36,12 +36,14 @@ def _make_version_file(version, sha):
     sha = "Unknown" if sha is None else sha
     abi = _run_cmd(["echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI | awk '{print $3}'"], shell=True)
     dtk = _run_cmd(["cat", os.path.join(ROCM_HOME, '.info/rocm_version')])
-    dtk = ''.join(dtk.split('.')[:2])
+    dtk = ''.join(dtk.split('.')[:2]) + "2"
     torch_version = torch.__version__
-    dcu_version = f"{version}.abi{abi}.dtk{dtk}.torch{torch_version}"
+    dcu_version = f"{version}.dtk{dtk}"
     version_path = ROOT_DIR / "torchaudio" / "version.py"
+    version_write = version[:-9]
     with open(version_path, "w") as f:
-        f.write(f"__version__ = '{version}'\n")
+        f.write(f"__version__ = '{version_write}'\n")
         f.write(f"git_version = '{sha}'\n")
         f.write(f"abi = 'abi{abi}'\n")
         f.write(f"dtk = '{dtk}'\n")
...
@@ -50,7 +52,6 @@ def _make_version_file(version, sha):
     return dcu_version
 
 
 def _get_pytorch_version():
     if "PYTORCH_VERSION" in os.environ:
         return f"torch=={os.environ['PYTORCH_VERSION']}"
...
@@ -95,18 +96,6 @@ def _get_packages(branch_name, tag):
     return find_packages(exclude=exclude)
 
 
-def _init_submodule():
-    print(" --- Initializing submodules")
-    try:
-        subprocess.check_call(["git", "submodule", "init"])
-        subprocess.check_call(["git", "submodule", "update"])
-    except Exception:
-        print(" --- Submodule initalization failed")
-        print("Please run:\n\tgit submodule update --init --recursive")
-        sys.exit(1)
-    print(" --- Initialized submodule")
-
-
 def _parse_url(path):
     with open(path, "r") as file_:
         for line in file_:
...
@@ -116,18 +105,6 @@ def _parse_url(path):
             yield url
 
 
-def _parse_sources():
-    third_party_dir = ROOT_DIR / "third_party"
-    libs = ["zlib", "bzip2", "lzma", "sox"]
-    archive_dir = third_party_dir / "archives"
-    archive_dir.mkdir(exist_ok=True)
-    for lib in libs:
-        cmake_file = third_party_dir / lib / "CMakeLists.txt"
-        for url in _parse_url(cmake_file):
-            path = archive_dir / os.path.basename(url)
-            yield path, url
-
-
 def _fetch_archives(src):
     for dest, url in src:
         if not dest.exists():
...
@@ -135,12 +112,6 @@ def _fetch_archives(src):
             torch.hub.download_url_to_file(url, dest, progress=False)
 
 
-def _fetch_third_party_libraries():
-    _init_submodule()
-    if os.name != "nt":
-        _fetch_archives(_parse_sources())
-
-
 def _main():
     sha = _run_cmd(["git", "rev-parse", "HEAD"])
     branch = _run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])
...
@@ -154,14 +125,21 @@ def _main():
     print("-- Building version", version)
     dcu_version = _make_version_file(version, sha)
-    _fetch_third_party_libraries()
+
+    with open("README.md") as f:
+        long_description = f.read()
 
     setup(
         name="torchaudio",
         version=dcu_version,
         description="An audio package for PyTorch",
+        long_description=long_description,
+        long_description_content_type="text/markdown",
         url="https://github.com/pytorch/audio",
-        author="Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang",
+        author=(
+            "Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, "
+            "Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang"
+        ),
         author_email="soumith@pytorch.org",
         maintainer="Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang",
         maintainer_email="moto@meta.com",
...
@@ -174,9 +152,10 @@ def _main():
             "Operating System :: Microsoft :: Windows",
             "Operating System :: POSIX",
             "Programming Language :: C++",
-            "Programming Language :: Python :: 3.7",
             "Programming Language :: Python :: 3.8",
             "Programming Language :: Python :: 3.9",
+            "Programming Language :: Python :: 3.10",
+            "Programming Language :: Python :: 3.11",
             "Programming Language :: Python :: Implementation :: CPython",
             "Topic :: Multimedia :: Sound/Audio",
             "Topic :: Scientific/Engineering :: Artificial Intelligence",
...
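
The net effect of the `_get_version` / `_make_version_file` changes on the version strings is easier to see in isolation. A minimal sketch mirroring the string logic of this fork; the `version.txt` contents and ROCm version are made-up examples:

# Hypothetical inputs
version_txt = "2.0.1"        # example contents of version.txt
rocm_version = "24.04.1"     # example contents of ROCM_HOME/.info/rocm_version

# _get_version: the local suffix is now the fixed "+das.opt1" instead of
# "+" plus the 7-character git SHA prefix.
version = version_txt + "+das" + "." + "opt1"      # -> "2.0.1+das.opt1"

# _make_version_file: dtk keeps the first two dotted fields plus a literal "2".
dtk = "".join(rocm_version.split(".")[:2]) + "2"   # -> "24042"

# The package version handed to setup() drops the abi/torch components.
dcu_version = f"{version}.dtk{dtk}"                # -> "2.0.1+das.opt1.dtk24042"

# torchaudio/version.py gets the base version back: the suffix added in
# _get_version is exactly 9 characters, which is what version[:-9] strips.
version_write = version[:-9]                       # -> "2.0.1"

Note that `version[:-9]` silently depends on the suffix staying exactly nine characters long; if `BUILD_VERSION` is set instead, the slice would truncate the real version.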
test/integration_tests/conftest.py
View file @ ffeba11a

@@ -102,7 +102,7 @@ def pytest_addoption(parser):
 @pytest.fixture(autouse=True)
 def temp_hub_dir(tmp_path, pytestconfig):
-    if not pytestconfig.getoption("use_tmp_hub_dir"):
+    if not pytestconfig.getoption("use_tmp_hub_dir", default=False):
         yield
     else:
         org_dir = torch.hub.get_dir()
...
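
The one-line change matters when this autouse fixture runs in a session where `pytest_addoption` never registered `--use-tmp-hub-dir`: pytest's `Config.getoption` raises `ValueError` for an unknown option unless a default is supplied. A sketch of the pattern, with the restore step an assumption about the body elided above:

import pytest
import torch


@pytest.fixture(autouse=True)
def temp_hub_dir(tmp_path, pytestconfig):
    # default=False degrades gracefully instead of raising ValueError
    # when the option was never registered.
    if not pytestconfig.getoption("use_tmp_hub_dir", default=False):
        yield
    else:
        # Redirect torch.hub downloads to a per-test directory, then restore.
        org_dir = torch.hub.get_dir()
        torch.hub.set_dir(str(tmp_path))
        yield
        torch.hub.set_dir(org_dir)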
test/integration_tests/prototype/hifi_gan_pipeline_test.py
0 → 100644
View file @ ffeba11a

import math

import torch
import torchaudio
from torchaudio.prototype.functional import oscillator_bank
from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH


def test_hifi_gan_pretrained_weights():
    """Test that a waveform reconstructed from a mel spectrogram by the HiFiGAN bundle is close enough to the original.

    The main transformations performed in this test can be represented as
        - audio -> reference log mel spectrogram
        - audio -> mel spectrogram -> audio -> estimated log mel spectrogram
    In the end, we compare the estimated log mel spectrogram to the reference one. See comments in code for details.
    """
    bundle = HIFIGAN_VOCODER_V3_LJSPEECH
    # Get HiFiGAN-compatible transformation from waveform to mel spectrogram
    mel_transform = bundle.get_mel_transform()
    # Get HiFiGAN vocoder
    vocoder = bundle.get_vocoder()
    # Create a synthetic waveform, trimmed to a whole number of hops
    ref_waveform = get_sin_sweep(sample_rate=bundle.sample_rate, length=100000)
    ref_waveform = ref_waveform[:, : -(ref_waveform.shape[1] % mel_transform.hop_size)]
    # Generate mel spectrogram from waveform
    mel_spectrogram = mel_transform(ref_waveform)

    with torch.no_grad():
        # Generate waveform from mel spectrogram
        estimated_waveform = vocoder(mel_spectrogram).squeeze(0)

    # Measure the reconstruction error.
    # Even though the reconstructed audio is perceptually very close to the original, it doesn't score well on
    # metrics like Si-SNR. It might be that HiFiGAN introduces non-uniform shifts to the reconstructed waveforms.
    # So to evaluate the reconstruction error, we compute mel spectrograms of the reference and reconstructed
    # waveforms, and compare the relative mean squared error of their logarithms.
    final_spec = torchaudio.transforms.MelSpectrogram(sample_rate=bundle.sample_rate, normalized=True)
    # Log mel spectrogram of the estimated waveform
    estimated_spectrogram = final_spec(estimated_waveform)
    estimated_spectrogram = torch.log(torch.clamp(estimated_spectrogram, min=1e-5))
    # Log mel spectrogram of the reference waveform
    ref_spectrogram = final_spec(ref_waveform)
    ref_spectrogram = torch.log(torch.clamp(ref_spectrogram, min=1e-5))
    # Check that the relative MSE is below 4%
    mse = ((estimated_spectrogram - ref_spectrogram) ** 2).mean()
    mean_ref = (ref_spectrogram ** 2).mean()
    print(mse / mean_ref)
    assert mse / mean_ref < 0.04


def get_sin_sweep(sample_rate, length):
    """Create a waveform which sweeps in frequency up to the Nyquist frequency (half of the sample rate)."""
    nyquist_freq = sample_rate / 2
    freq = torch.logspace(0, math.log(0.99 * nyquist_freq, 10), length).unsqueeze(-1)
    amp = torch.ones((length, 1))

    waveform = oscillator_bank(freq, amp, sample_rate=sample_rate)
    return waveform.unsqueeze(0)
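
Outside the assertion, the same bundle round trip can be auditioned on real audio. A minimal sketch, not part of the test; the input filename and the resampling step are assumptions:

import torch
import torchaudio
from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH

bundle = HIFIGAN_VOCODER_V3_LJSPEECH
mel_transform = bundle.get_mel_transform()
vocoder = bundle.get_vocoder()

waveform, sr = torchaudio.load("sample.wav")  # hypothetical input file
waveform = torchaudio.functional.resample(waveform, sr, bundle.sample_rate)

# Trim to a whole number of hops, guarding the zero-remainder case that the
# test's fixed-length slice never hits.
trim = waveform.shape[1] % mel_transform.hop_size
if trim:
    waveform = waveform[:, :-trim]

with torch.no_grad():
    reconstructed = vocoder(mel_transform(waveform)).squeeze(0)

torchaudio.save("reconstructed.wav", reconstructed.cpu(), bundle.sample_rate)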