OpenDAS / Torchaudio / Commits / 0902494e

Commit 0902494e authored Jul 16, 2019 by jamarshon, committed by cpuhrsch on Jul 16, 2019

torch.functional Docs (#140)

parent c569b40f
Showing 5 changed files with 190 additions and 106 deletions
.gitignore                  +1   -0
docs/source/conf.py         +1   -0
docs/source/functional.rst  +70  -0
docs/source/index.rst       +1   -0
torchaudio/functional.py    +117 -106
.gitignore

@@ -68,6 +68,7 @@ instance/
 # Sphinx documentation
 docs/_build/
+docs/src/
 # PyBuilder
 target/
docs/source/conf.py

@@ -208,6 +208,7 @@ texinfo_documents = [
 intersphinx_mapping = {
     'python': ('https://docs.python.org/', None),
     'numpy': ('https://docs.scipy.org/doc/numpy/', None),
+    'torch': ('https://pytorch.org/docs/stable/', None),
 }

 # -- A patch that prevents Sphinx from cross-referencing ivar tags -------
docs/source/functional.rst (new file, 0 → 100644)

.. role:: hidden
    :class: hidden-section

torchaudio.functional
======================

.. currentmodule:: torchaudio.functional

Functions to perform common audio operations.

:hidden:`scale`
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: scale

:hidden:`pad_trim`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: pad_trim

:hidden:`downmix_mono`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: downmix_mono

:hidden:`LC2CL`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: LC2CL

:hidden:`istft`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: istft

:hidden:`spectrogram`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: spectrogram

:hidden:`create_fb_matrix`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: create_fb_matrix

:hidden:`spectrogram_to_DB`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: spectrogram_to_DB

:hidden:`create_dct`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: create_dct

:hidden:`BLC2CBL`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: BLC2CBL

:hidden:`mu_law_encoding`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: mu_law_encoding

:hidden:`mu_law_expanding`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: mu_law_expanding
docs/source/index.rst

@@ -12,6 +12,7 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio
    compliance.kaldi
    kaldi_io
    transforms
+   functional
    legacy

 .. automodule:: torchaudio
torchaudio/functional.py

@@ -21,16 +21,17 @@ __all__ = [
 @torch.jit.script
 def scale(tensor, factor):
     # type: (Tensor, int) -> Tensor
-    """Scale audio tensor from a 16-bit integer (represented as a FloatTensor)
-    to a floating point number between -1.0 and 1.0. Note the 16-bit number is
-    called the "bit depth" or "precision", not to be confused with "bit rate".
+    r"""Scale audio tensor from a 16-bit integer (represented as a
+    :class:`torch.FloatTensor`) to a floating point number between -1.0 and 1.0.
+    Note the 16-bit number is called the "bit depth" or "precision", not to be
+    confused with "bit rate".

-    Inputs:
-        tensor (Tensor): Tensor of audio of size (Samples x Channels)
+    Args:
+        tensor (torch.Tensor): Tensor of audio of size (n, c) or (c, n)
         factor (int): Maximum value of input tensor

-    Outputs:
-        Tensor: Scaled by the scale factor
+    Returns:
+        torch.Tensor: Scaled by the scale factor
     """
     if not tensor.is_floating_point():
         tensor = tensor.to(torch.float32)
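A minimal usage sketch of scale as documented in this hunk (the sample data and the 2 ** 15 factor are illustrative choices, not part of the commit; torchaudio.functional is assumed importable as F):

    import torch
    import torchaudio.functional as F

    # Fake 16-bit PCM samples in an integer tensor of shape (n, c);
    # scale converts non-float input to float32 before dividing.
    pcm = torch.randint(-32768, 32767, (16000, 1))

    # Divide by the maximum 16-bit magnitude to land in [-1.0, 1.0].
    waveform = F.scale(pcm, 2 ** 15)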
@@ -41,17 +42,17 @@ def scale(tensor, factor):
 @torch.jit.script
 def pad_trim(tensor, ch_dim, max_len, len_dim, fill_value):
     # type: (Tensor, int, int, int, float) -> Tensor
-    """Pad/Trim a 2d-Tensor (Signal or Labels)
+    r"""Pad/trim a 2D tensor (signal or labels).

-    Inputs:
-        tensor (Tensor): Tensor of audio of size (n x c) or (c x n)
+    Args:
+        tensor (torch.Tensor): Tensor of audio of size (n, c) or (c, n)
         ch_dim (int): Dimension of channel (not size)
         max_len (int): Length to which the tensor will be padded
         len_dim (int): Dimension of length (not size)
         fill_value (float): Value to fill in

-    Outputs:
-        Tensor: Padded/trimmed tensor
+    Returns:
+        torch.Tensor: Padded/trimmed tensor
     """
     if max_len > tensor.size(len_dim):
         # array of [padding_left, padding_right, padding_top, padding_bottom]
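A quick sketch of pad_trim with illustrative values (channel dimension 0, length dimension 1; the fill value must be a float to match the scripted signature):

    import torch
    import torchaudio.functional as F

    waveform = torch.randn(1, 12345)               # (c, n)

    # Pad (or trim) the length dimension to exactly 16000 samples, filling with zeros.
    fixed = F.pad_trim(waveform, 0, 16000, 1, 0.)
    assert fixed.size(1) == 16000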
@@ -71,14 +72,14 @@ def pad_trim(tensor, ch_dim, max_len, len_dim, fill_value):
 @torch.jit.script
 def downmix_mono(tensor, ch_dim):
     # type: (Tensor, int) -> Tensor
-    """Downmix any stereo signals to mono.
+    r"""Downmix any stereo signals to mono.

-    Inputs:
-        tensor (Tensor): Tensor of audio of size (c x n) or (n x c)
+    Args:
+        tensor (torch.Tensor): Tensor of audio of size (c, n) or (n, c)
         ch_dim (int): Dimension of channel (not size)

-    Outputs:
-        Tensor: Mono signal
+    Returns:
+        torch.Tensor: Mono signal
     """
     if not tensor.is_floating_point():
         tensor = tensor.to(torch.float32)
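A short sketch of downmix_mono on a hypothetical stereo tensor (a channels-first layout is assumed here, so the channel dimension is 0):

    import torch
    import torchaudio.functional as F

    stereo = torch.randn(2, 16000)     # (c, n)

    # Collapse the two channels into a single mono channel along dim 0.
    mono = F.downmix_mono(stereo, 0)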
@@ -90,13 +91,13 @@ def downmix_mono(tensor, ch_dim):
 @torch.jit.script
 def LC2CL(tensor):
     # type: (Tensor) -> Tensor
-    """Permute a 2d tensor from samples (n x c) to (c x n)
+    r"""Permute a 2D tensor from samples (n, c) to (c, n).

-    Inputs:
-        tensor (Tensor): Tensor of audio signal with shape (LxC)
+    Args:
+        tensor (torch.Tensor): Tensor of audio signal with shape (n, c)

-    Outputs:
-        Tensor: Tensor of audio signal with shape (CxL)
+    Returns:
+        torch.Tensor: Tensor of audio signal with shape (c, n)
     """
     return tensor.transpose(0, 1).contiguous()
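LC2CL is just a contiguous transpose; a sketch with made-up shapes:

    import torch
    import torchaudio.functional as F

    samples_first = torch.randn(16000, 2)     # (n, c)
    channels_first = F.LC2CL(samples_first)   # (c, n), contiguous in memory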
@@ -119,41 +120,54 @@ def istft(stft_matrix,  # type: Tensor
           ):
     # type: (...) -> Tensor
     r""" Inverse short time Fourier Transform. This is expected to be the inverse of torch.stft.
-    It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the
+    It has the same parameters (+ additional optional parameter of ``length``) and it should return the
     least squares estimation of the original signal. The algorithm will check using the NOLA condition (
     nonzero overlap).

-    Important consideration in the parameters :attr:`window` and :attr:`center` so that the envelop
+    Important consideration in the parameters ``window`` and ``center`` so that the envelop
     created by the summation of all the windows is never zero at certain point in time. Specifically,
-    :math:`\sum_{t=-\ infty}^{\ infty} w^2[n-t\times hop\_length] \neq 0`.
+    :math:`\sum_{t=-\infty}^{\infty} w^2[n-t\times hop\_length] \cancel{=} 0`.

     Since stft discards elements at the end of the signal if they do not fit in a frame, the
-    istft may return a shorter signal than the original signal (can occur if :attr:`center` is False
+    istft may return a shorter signal than the original signal (can occur if `center` is False
     since the signal isn't padded).

-    If :attr:`center` is True, then there will be padding e.g. 'constant', 'reflect', etc. Left padding
+    If ``center`` is True, then there will be padding e.g. 'constant', 'reflect', etc. Left padding
     can be trimmed off exactly because they can be calculated but right padding cannot be calculated
     without additional information.

     Example: Suppose the last window is:
     [17, 18, 0, 0, 0] vs [18, 0, 0, 0, 0]

     The n_frames, hop_length, win_length are all the same which prevents the calculation of right padding.
-    These additional values could be zeros or a reflection of the signal so providing :attr:`length`
-    could be useful. If :attr:`length` is None then padding will be aggressively removed (some loss of signal).
+    These additional values could be zeros or a reflection of the signal so providing ``length``
+    could be useful. If ``length`` is ``None`` then padding will be aggressively removed
+    (some loss of signal).

     [1] D. W. Griffin and J. S. Lim, “Signal estimation from modified short-time Fourier transform,”
     IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

-    Inputs:
-        stft_matrix (Tensor): output of stft where each row of a batch is a frequency and each column is
-            a window. it has a shape of either (batch, fft_size, n_frames, 2) or (fft_size, n_frames, 2)
-        n_fft (int): size of Fourier transform
-        hop_length (Optional[int]): the distance between neighboring sliding window frames. (Default: win_length // 4)
-        win_length (Optional[int]): the size of window frame and STFT filter. (Default: n_fft)
-        window (Optional[Tensor]): the optional window function. (Default: torch.ones(win_length))
-        center (bool): whether :attr:`input` was padded on both sides so
+    Args:
+        stft_matrix (torch.Tensor): Output of stft where each row of a batch is a frequency and each
+            column is a window. it has a shape of either (batch, fft_size, n_frames, 2) or (
+            fft_size, n_frames, 2)
+        n_fft (int): Size of Fourier transform
+        hop_length (Optional[int]): The distance between neighboring sliding window frames.
+            (Default: ``win_length // 4``)
+        win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``)
+        window (Optional[torch.Tensor]): The optional window function.
+            (Default: ``torch.ones(win_length)``)
+        center (bool): Whether ``input`` was padded on both sides so
             that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`
-        pad_mode (str): controls the padding method used when :attr:`center` is ``True``
-        normalized (bool): whether the STFT was normalized
-        onesided (bool): whether the STFT is onesided
-        length (Optional[int]): the amount to trim the signal by (i.e. the
+        pad_mode (str): Controls the padding method used when ``center`` is ``True``
+        normalized (bool): Whether the STFT was normalized
+        onesided (bool): Whether the STFT is onesided
+        length (Optional[int]): The amount to trim the signal by (i.e. the
            original signal length). (Default: whole signal)

-    Outputs:
-        Tensor: least squares estimation of the original signal of size (batch, signal_length) or (signal_length)
+    Returns:
+        torch.Tensor: Least squares estimation of the original signal of size
+        (batch, signal_length) or (signal_length)
     """
     stft_matrix_dim = stft_matrix.dim()
     assert 3 <= stft_matrix_dim <= 4, ('Incorrect stft dimension: %d' % (stft_matrix_dim))
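A hedged round-trip sketch for istft, assuming the torch.stft behaviour current at this commit (a real-valued output with a trailing dimension of size 2, as the docstring above describes) and that the function accepts keyword arguments when called from Python:

    import torch
    import torchaudio.functional as F

    signal = torch.randn(16000)
    n_fft = 400
    window = torch.hann_window(n_fft)

    # Forward STFT: shape (fft_size, n_frames, 2) at the time of this commit.
    spec = torch.stft(signal, n_fft, window=window)

    # Least-squares reconstruction; length trims the result back to the input size.
    reconstructed = F.istft(spec, n_fft, window=window, length=signal.size(0))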
@@ -241,25 +255,24 @@ def istft(stft_matrix,  # type: Tensor
 @torch.jit.script
 def spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize):
     # type: (Tensor, int, Tensor, int, int, int, int, bool) -> Tensor
-    """Create a spectrogram from a raw audio signal
-
-    Inputs:
-        sig (Tensor): Tensor of audio of size (c, n)
-        pad (int): two sided padding of signal
-        window (Tensor): window_tensor
-        n_fft (int): size of fft
-        hop (int): length of hop between STFT windows
-        ws (int): window size
-        power (int > 0 ) : Exponent for the magnitude spectrogram,
-            e.g., 1 for energy, 2 for power, etc.
-        normalize (bool) : whether to normalize by magnitude after stft
-
-    Outputs:
-        Tensor: channels x hops x n_fft (c, l, f), where channels
-            is unchanged, hops is the number of hops, and n_fft is the
-            number of fourier bins, which should be the window size divided
-            by 2 plus 1.
+    r"""Create a spectrogram from a raw audio signal.
+
+    Args:
+        sig (torch.Tensor): Tensor of audio of size (c, n)
+        pad (int): Two sided padding of signal
+        window (torch.Tensor): Window_tensor
+        n_fft (int): Size of fft
+        hop (int): Length of hop between STFT windows
+        ws (int): Window size
+        power (int) : Exponent for the magnitude spectrogram,
+            (must be > 0) e.g., 1 for energy, 2 for power, etc.
+        normalize (bool) : Whether to normalize by magnitude after stft
+
+    Returns:
+        torch.Tensor: Channels x hops x n_fft (c, l, f), where channels
+            is unchanged, hops is the number of hops, and n_fft is the
+            number of fourier bins, which should be the window size divided
+            by 2 plus 1.
     """
     assert sig.dim() == 2
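A sketch of spectrogram with illustrative parameters (all arguments are passed positionally to match the scripted signature documented above):

    import torch
    import torchaudio.functional as F

    sig = torch.randn(1, 16000)        # (c, n)
    n_fft, hop, ws = 400, 200, 400
    window = torch.hann_window(ws)

    # power=2 -> power spectrogram; output shape (c, n_frames, n_fft // 2 + 1).
    spec = F.spectrogram(sig, 0, window, n_fft, hop, ws, 2, False)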
@@ -280,17 +293,16 @@ def spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize):
 @torch.jit.script
 def create_fb_matrix(n_stft, f_min, f_max, n_mels):
     # type: (int, float, float, int) -> Tensor
-    """ Create a frequency bin conversion matrix.
-
-    Inputs:
-        n_stft (int): number of filter banks from spectrogram
-        f_min (float): minimum frequency
-        f_max (float): maximum frequency
-        n_mels (int): number of mel bins
-
-    Outputs:
-        Tensor: triangular filter banks (fb matrix)
+    r""" Create a frequency bin conversion matrix.
+
+    Args:
+        n_stft (int): Number of filter banks from spectrogram
+        f_min (float): Minimum frequency
+        f_max (float): Maximum frequency
+        n_mels (int): Number of mel bins
+
+    Returns:
+        torch.Tensor: Triangular filter banks (fb matrix)
     """
     # get stft freq bins
     stft_freqs = torch.linspace(f_min, f_max, n_stft)
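A sketch of create_fb_matrix; the 201 and 40 values are illustrative (201 = 400 // 2 + 1 one-sided frequency bins), and the right-multiplication orientation is an assumption based on the convention used elsewhere in this module:

    import torch
    import torchaudio.functional as F

    # Map 201 linear frequency bins onto 40 mel bins.
    fb = F.create_fb_matrix(201, 0., 8000., 40)

    # Right-multiply a (c, l, n_stft) spectrogram to get (c, l, n_mels).
    spec = torch.randn(1, 81, 201).pow(2)
    mel_spec = torch.matmul(spec, fb)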
@@ -315,22 +327,22 @@ def create_fb_matrix(n_stft, f_min, f_max, n_mels):
 @torch.jit.script
 def spectrogram_to_DB(spec, multiplier, amin, db_multiplier, top_db=None):
     # type: (Tensor, float, float, float, Optional[float]) -> Tensor
-    """Turns a spectrogram from the power/amplitude scale to the decibel scale.
+    r"""Turns a spectrogram from the power/amplitude scale to the decibel scale.

     This output depends on the maximum value in the input spectrogram, and so
     may return different values for an audio clip split into snippets vs. a
     a full clip.

-    Inputs:
-        spec (Tensor): normal STFT
-        multiplier (float): use 10. for power and 20. for amplitude
-        amin (float): number to clamp spec
-        db_multiplier (float): log10(max(reference value and amin))
-        top_db (Optional[float]): minimum negative cut-off in decibels. A reasonable number
+    Args:
+        spec (torch.Tensor): Normal STFT
+        multiplier (float): Use 10. for power and 20. for amplitude
+        amin (float): Number to clamp spec
+        db_multiplier (float): Log10(max(reference value and amin))
+        top_db (Optional[float]): Minimum negative cut-off in decibels. A reasonable number
            is 80.

-    Outputs:
-        Tensor: spectrogram in DB
+    Returns:
+        torch.Tensor: Spectrogram in DB
     """
     spec_db = multiplier * torch.log10(torch.clamp(spec, min=amin))
     spec_db -= multiplier * db_multiplier
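A sketch of spectrogram_to_DB on a hypothetical power spectrogram (multiplier 10. for power, db_multiplier 0. corresponds to a reference value of 1.0, and top_db 80. caps the dynamic range):

    import torch
    import torchaudio.functional as F

    power_spec = torch.randn(1, 81, 201).pow(2)

    # Convert to decibels, clamping very small values at amin=1e-10.
    spec_db = F.spectrogram_to_DB(power_spec, 10., 1e-10, 0., 80.)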
@@ -345,17 +357,16 @@ def spectrogram_to_DB(spec, multiplier, amin, db_multiplier, top_db=None):
 @torch.jit.script
 def create_dct(n_mfcc, n_mels, norm):
     # type: (int, int, Optional[str]) -> Tensor
-    """
-    Creates a DCT transformation matrix with shape (num_mels, num_mfcc),
-    normalized depending on norm
-
-    Inputs:
-        n_mfcc (int) : number of mfc coefficients to retain
-        n_mels (int): number of MEL bins
-        norm (Optional[str]) : norm to use (either 'ortho' or None)
-
-    Outputs:
-        Tensor: The transformation matrix, to be right-multiplied to row-wise data.
+    r"""Creates a DCT transformation matrix with shape (num_mels, num_mfcc),
+    normalized depending on norm.
+
+    Args:
+        n_mfcc (int) : Number of mfc coefficients to retain
+        n_mels (int): Number of MEL bins
+        norm (Optional[str]) : Norm to use (either 'ortho' or None)
+
+    Returns:
+        torch.Tensor: The transformation matrix, to be right-multiplied to row-wise data.
     """
     outdim = n_mfcc
     dim = n_mels
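A sketch of create_dct with illustrative sizes (40 MFCC coefficients from 128 mel bins), right-multiplied against row-wise data as the docstring describes:

    import torch
    import torchaudio.functional as F

    dct_mat = F.create_dct(40, 128, 'ortho')   # shape (n_mels, n_mfcc)

    mel_spec = torch.randn(1, 81, 128)
    mfcc = torch.matmul(mel_spec, dct_mat)     # (c, l, n_mfcc)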
@@ -375,14 +386,14 @@ def create_dct(n_mfcc, n_mels, norm):
 @torch.jit.script
 def BLC2CBL(tensor):
     # type: (Tensor) -> Tensor
-    """Permute a 3d tensor from Bands x Sample length x Channels to Channels x
-    Bands x Samples length
+    r"""Permute a 3D tensor from Bands x Sample length x Channels to Channels x
+    Bands x Samples length.

-    Inputs:
-        tensor (Tensor): Tensor of spectrogram with shape (BxLxC)
+    Args:
+        tensor (torch.Tensor): Tensor of spectrogram with shape (b, l, c)

-    Outputs:
-        Tensor: Tensor of spectrogram with shape (CxBxL)
+    Returns:
+        torch.Tensor: Tensor of spectrogram with shape (c, b, l)
     """
     return tensor.permute(2, 0, 1).contiguous()
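BLC2CBL is a contiguous permute; a sketch with made-up band, length, and channel sizes:

    import torch
    import torchaudio.functional as F

    blc = torch.randn(6, 81, 1)     # (bands, length, channels)
    cbl = F.BLC2CBL(blc)            # (channels, bands, length), contiguous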
@@ -390,18 +401,18 @@ def BLC2CBL(tensor):
 @torch.jit.script
 def mu_law_encoding(x, qc):
     # type: (Tensor, int) -> Tensor
-    """Encode signal based on mu-law companding. For more info see the
+    r"""Encode signal based on mu-law companding. For more info see the
     `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_

     This algorithm assumes the signal has been scaled to between -1 and 1 and
-    returns a signal encoded with values from 0 to quantization_channels - 1
+    returns a signal encoded with values from 0 to quantization_channels - 1.

-    Inputs:
-        x (Tensor): Input tensor
+    Args:
+        x (torch.Tensor): Input tensor
         qc (int): Number of channels (i.e. quantization channels)

-    Outputs:
-        Tensor: Input after mu-law companding
+    Returns:
+        torch.Tensor: Input after mu-law companding
     """
     assert isinstance(x, torch.Tensor), 'mu_law_encoding expects a Tensor'
     mu = qc - 1.
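A sketch of mu_law_encoding with 256 quantization channels on a signal already scaled to [-1, 1]:

    import torch
    import torchaudio.functional as F

    waveform = torch.rand(1, 16000) * 2 - 1       # values in [-1, 1]
    encoded = F.mu_law_encoding(waveform, 256)    # integer values in [0, 255]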
@@ -417,18 +428,18 @@ def mu_law_encoding(x, qc):
 @torch.jit.script
 def mu_law_expanding(x_mu, qc):
     # type: (Tensor, int) -> Tensor
-    """Decode mu-law encoded signal. For more info see the
+    r"""Decode mu-law encoded signal. For more info see the
     `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_

     This expects an input with values between 0 and quantization_channels - 1
     and returns a signal scaled between -1 and 1.

-    Inputs:
-        x_mu (Tensor): Input tensor
+    Args:
+        x_mu (torch.Tensor): Input tensor
         qc (int): Number of channels (i.e. quantization channels)

-    Outputs:
-        Tensor: Input after decoding
+    Returns:
+        torch.Tensor: Input after decoding
     """
     assert isinstance(x_mu, torch.Tensor), 'mu_law_expanding expects a Tensor'
     mu = qc - 1.
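A round-trip sketch pairing mu_law_encoding with mu_law_expanding (companding is lossy, so the reconstruction is only approximate):

    import torch
    import torchaudio.functional as F

    waveform = torch.rand(1, 16000) * 2 - 1
    encoded = F.mu_law_encoding(waveform, 256)
    decoded = F.mu_law_expanding(encoded, 256)    # back to roughly [-1, 1]

    print((waveform - decoded).abs().max())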