Commit 72ae755a authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Add devices/properties badges (#2321)

Summary:
Add badges of supported properties and devices to functionals and transforms.

This commit adds `.. devices::` and `.. properties::` directives to sphinx.

APIs with these directives will have badges (based off of shields.io) which link to the
page with description of these features.

Continuation of https://github.com/pytorch/audio/issues/2316
Excluded dtypes for further improvement, and actually added badges to most of functional/transforms.

Pull Request resolved: https://github.com/pytorch/audio/pull/2321

Reviewed By: hwangjeff

Differential Revision: D35489063

Pulled By: mthrok

fbshipit-source-id: f68a70ebb22df29d5e9bd171273bd19007a81762
parent eb23a242
......@@ -70,6 +70,7 @@ instance/
docs/_build/
docs/src/
docs/source/tutorials
docs/source/gen_images
docs/source/gen_modules
# PyBuilder
......
......@@ -9,3 +9,9 @@ dt > em.sig-param:last-of-type::after {
content: "\a";
white-space: pre;
}
/* For shields: badges emitted by the custom `devices`/`properties`
   Sphinx directives (shields.io SVGs rendered as inline images). */
article.pytorch-article img.shield-badge {
  width: unset; /* undo the theme's full-width image rule for these small badges */
  margin-top: -18px; /* pull the badge up toward the API signature it annotates */
  margin-bottom: 9px;
}
......@@ -17,10 +17,10 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import os
import sys
sys.path.insert(0, os.path.abspath("."))
import re
import warnings
from datetime import datetime
......@@ -342,3 +342,13 @@ def inject_minigalleries(app, what, name, obj, options, lines):
def setup(app):
    """Sphinx extension hook: register docstring post-processing and the
    custom ``devices``/``properties`` badge directives."""
    app.connect("autodoc-process-docstring", inject_minigalleries)

    # Imported lazily so the directives module is only loaded when Sphinx runs.
    from custom_directives import SupportedDevices, SupportedProperties

    # Register custom directives
    from docutils.parsers import rst

    rst.directives.register_directive("devices", SupportedDevices)
    rst.directives.register_directive("properties", SupportedProperties)
import hashlib
from pathlib import Path
from typing import List
from urllib.parse import quote, urlencode
import requests
from docutils import nodes
from docutils.parsers.rst.directives.images import Image
# Directory containing this file; used to resolve the badge image cache.
_THIS_DIR = Path(__file__).parent

# Color palette from PyTorch Developer Day 2021 Presentation Template
# (hex RGB strings without a leading "#", as accepted by shields.io).
YELLOW = "F9DB78"
GREEN = "70AD47"
BLUE = "00B0F0"
PINK = "FF71DA"
ORANGE = "FF8300"
TEAL = "00E5D1"
GRAY = "7F7F7F"
def _get_cache_path(key, ext):
    """Return the cache-file path for ``key`` under ``gen_images/``.

    The file name is the SHA-256 hex digest of ``key`` (bytes) followed by
    ``ext``. The cache directory is created on first use.
    """
    cache_dir = _THIS_DIR / "gen_images"
    cache_dir.mkdir(parents=True, exist_ok=True)
    digest = hashlib.sha256(key).hexdigest()
    return cache_dir / f"{digest}{ext}"
def _download(url, path):
    """Fetch ``url`` and write the response body to ``path``.

    Raises an HTTP error (via ``raise_for_status``) on a non-success status.
    """
    resp = requests.get(url)
    resp.raise_for_status()
    with open(path, "wb") as fileobj:
        fileobj.write(resp.content)
def _fetch_image(url):
    """Download ``url`` into the local SVG cache (once) and return its path
    relative to this directory, as a string suitable for an image directive."""
    cached = _get_cache_path(url.encode("utf-8"), ext=".svg")
    if not cached.exists():
        _download(url, cached)
    return str(cached.relative_to(_THIS_DIR))
class BaseShield(Image):
    """Image directive that renders a shields.io static badge.

    Subclasses invoke :meth:`run` with the badge query parameters, the alt
    text, and the anchor of the section on the "supported features" page
    that the badge links to.
    """

    def run(self, params, alt, section) -> List[nodes.Node]:
        # Build the shields.io URL and serve a locally cached copy of the SVG.
        query = urlencode(params, quote_via=quote)
        self.arguments = [_fetch_image(f"https://img.shields.io/static/v1?{query}")]
        self.options["alt"] = alt
        self.options.setdefault("class", []).append("shield-badge")
        self.options["target"] = f"supported_features.html#{section}"
        return super().run()
def _parse_devices(arg: str):
devices = sorted(arg.strip().split())
valid_values = {"CPU", "CUDA"}
if any(val not in valid_values for val in devices):
raise ValueError(
f"One or more device values are not valid. The valid values are {valid_values}. Given value: '{arg}'"
)
return ", ".join(sorted(devices))
def _parse_properties(arg: str):
properties = sorted(arg.strip().split())
valid_values = {"Autograd", "TorchScript"}
if any(val not in valid_values for val in properties):
raise ValueError(
"One or more property values are not valid. "
f"The valid values are {valid_values}. "
f"Given value: '{arg}'"
)
return ", ".join(sorted(properties))
class SupportedDevices(BaseShield):
    """Directive rendering a badge that lists the devices an API supports."""

    required_arguments = 1
    final_argument_whitespace = True

    def run(self) -> List[nodes.Node]:
        device_list = _parse_devices(self.arguments[0])
        badge_params = {
            "label": "Devices",
            "message": device_list,
            "labelColor": GRAY,
            "color": BLUE,
            "style": "flat-square",
        }
        return super().run(
            badge_params,
            f"This feature supports the following devices: {device_list}",
            "devices",
        )
class SupportedProperties(BaseShield):
    """Directive rendering a badge that lists the properties an API supports."""

    required_arguments = 1
    final_argument_whitespace = True

    def run(self) -> List[nodes.Node]:
        property_list = _parse_properties(self.arguments[0])
        badge_params = {
            "label": "Properties",
            "message": property_list,
            "labelColor": GRAY,
            "color": GREEN,
            "style": "flat-square",
        }
        return super().run(
            badge_params,
            f"This API supports the following properties: {property_list}",
            "properties",
        )
......@@ -29,6 +29,7 @@ Features described in this documentation are classified by release status:
:caption: Torchaudio Documentation
Index <self>
supported_features
API References
--------------
......
@misc{RESAMPLE,
author = {Julius O. Smith},
title = {Digital Audio Resampling Home Page "Theory of Ideal Bandlimited Interpolation" section},
url = {https://ccrma.stanford.edu/~jos/resample/Theory_Ideal_Bandlimited_Interpolation.html},
month = {September},
year = {2020}
}
@article{voxpopuli,
author = {Changhan Wang and
Morgane Rivi{\`{e}}re and
......
Supported Features
==================
Each TorchAudio API supports a subset of PyTorch features, such as
devices and data types.
Supported features are indicated in API references like the following:
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
These badges indicate that the corresponding features are verified through automated testing.
.. note::
Missing feature icons mean that they are not tested, and this can mean
different things, depending on the API.
1. The API is compatible with the feature but not tested.
2. The API is not compatible with the feature.
In case of 2, the API might explicitly raise an error, but that is not guaranteed.
For example, APIs without an Autograd badge might throw an error during backpropagation,
or silently return a wrong gradient.
If you use an API that hasn't been labeled as supporting a feature, you might want to first verify that the
feature works fine.
Devices
-------
CPU
^^^
.. devices:: CPU
TorchAudio APIs that support CPU can perform their computation on CPU tensors.
CUDA
^^^^
.. devices:: CUDA
TorchAudio APIs that support CUDA can perform their computation on CUDA devices.
In case of functions, move the tensor arguments to CUDA device before passing them to a function.
For example:
.. code:: python
cuda = torch.device("cuda")
waveform = waveform.to(cuda)
spectrogram = torchaudio.functional.spectrogram(waveform)
Classes with CUDA support are implemented with :py:class:`torch.nn.Module`.
It is also necessary to move the instance to CUDA device, before passing CUDA tensors.
For example:
.. code:: python
cuda = torch.device("cuda")
resampler = torchaudio.transforms.Resample(8000, 16000)
resampler.to(cuda)
waveform.to(cuda)
resampled = resampler(waveform)
Properties
----------
Autograd
^^^^^^^^
.. properties:: Autograd
TorchAudio APIs with autograd support can correctly backpropagate gradients.
For the basics of autograd, please refer to this `tutorial <https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html>`_.
.. note::
APIs without this mark may or may not raise an error during backpropagation.
The absence of an error raised during backpropagation does not necessarily mean the gradient is correct.
TorchScript
^^^^^^^^^^^
.. properties:: TorchScript
TorchAudio APIs with TorchScript support can be serialized and executed in non-Python environments.
For details on TorchScript, please refer to the `documentation <https://pytorch.org/docs/stable/jit.html>`_.
......@@ -251,6 +251,33 @@ class Autograd(TestBaseMixin):
Q = torch.tensor(Q)
self.assert_grad(F.bandreject_biquad, (x, sr, central_freq, Q))
def test_deemph_biquad(self):
    # Gradient check for F.deemph_biquad on a short white-noise input.
    torch.random.manual_seed(2434)
    x = get_whitenoise(sample_rate=22050, duration=0.01, n_channels=1)
    # NOTE(review): noise is generated at 22050 Hz but the filter is run with
    # sample_rate=44100 (the only rates deemph_biquad documents are 44100/48000)
    # — confirm the mismatch is intentional for the gradient check.
    self.assert_grad(F.deemph_biquad, (x, 44100))
def test_flanger(self):
    # Gradient check for F.flanger on a short white-noise input.
    torch.random.manual_seed(2434)
    x = get_whitenoise(sample_rate=8000, duration=0.01, n_channels=1)
    # NOTE(review): noise is generated at 8000 Hz but flanger is invoked with
    # sample_rate=44100 — confirm the mismatch is intentional.
    self.assert_grad(F.flanger, (x, 44100))
def test_gain(self):
    # Gradient check for F.gain with a fixed gain of 1.1 dB.
    torch.random.manual_seed(2434)
    x = get_whitenoise(sample_rate=8000, duration=0.01, n_channels=1)
    self.assert_grad(F.gain, (x, 1.1))
def test_overdrive(self):
    # Gradient check for F.overdrive on a short white-noise input.
    torch.random.manual_seed(2434)
    x = get_whitenoise(sample_rate=8000, duration=0.01, n_channels=1)
    # Bug fix: the original asserted the gradient of F.gain (copy-paste from
    # test_gain above), so F.overdrive's autograd support was never exercised.
    self.assert_grad(F.overdrive, (x,))
@parameterized.expand([(True,), (False,)])
def test_phaser(self, sinusoidal):
    # Gradient check for F.phaser, parameterized over both modulation modes
    # (sinusoidal=True/False).
    torch.random.manual_seed(2434)
    sr = 8000
    x = get_whitenoise(sample_rate=sr, duration=0.01, n_channels=1)
    self.assert_grad(F.phaser, (x, sr, sinusoidal))
@parameterized.expand(
[
(True,),
......
......@@ -86,6 +86,10 @@ class Transforms(TestBaseMixin):
tensor = torch.rand((1, 10))
self._assert_consistency(T.MuLawDecoding(), tensor)
def test_ComputeDelta(self):
    # TorchScript consistency check for the ComputeDeltas transform.
    # NOTE(review): the method name says "ComputeDelta" but the transform is
    # "ComputeDeltas" — consider renaming the test for consistency.
    tensor = torch.rand((1, 10))
    self._assert_consistency(T.ComputeDeltas(), tensor)
def test_Fade(self):
waveform = common_utils.get_whitenoise()
fade_in_len = 3000
......
......@@ -67,6 +67,10 @@ def _generate_wave_table(
def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
r"""Design two-pole all-pass filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform(torch.Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -107,6 +111,10 @@ def band_biquad(
) -> Tensor:
r"""Design two-pole band filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -156,6 +164,10 @@ def bandpass_biquad(
) -> Tensor:
r"""Design two-pole band-pass filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -192,6 +204,10 @@ def bandpass_biquad(
def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
r"""Design two-pole band-reject filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -231,6 +247,10 @@ def bass_biquad(
) -> Tensor:
r"""Design a bass tone-control effect. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -271,7 +291,10 @@ def bass_biquad(
def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor:
r"""Perform a biquad filter of input tensor. Initial conditions set to 0.
https://en.wikipedia.org/wiki/Digital_biquad_filter
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
......@@ -284,6 +307,9 @@ def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: flo
Returns:
Tensor: Waveform with dimension of `(..., time)`
Reference:
- https://en.wikipedia.org/wiki/Digital_biquad_filter
"""
device = waveform.device
......@@ -306,6 +332,11 @@ def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: flo
def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
r"""Apply contrast effect. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Comparable with compression, this effect modifies an audio signal to make it sound louder
Args:
......@@ -335,6 +366,11 @@ def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor:
r"""Apply a DC shift to the audio. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: TorchScript
This can be useful to remove a DC offset
(caused perhaps by a hardware problem in the recording chain) from the audio
......@@ -357,6 +393,8 @@ def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None
if limiter_gain is not None:
limiter_threshold = 1.0 - (abs(shift) - limiter_gain)
# Note:
# the following index-based update breaks auto-grad support
if limiter_gain is not None and shift > 0:
mask = waveform > limiter_threshold
temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold)
......@@ -376,6 +414,10 @@ def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None
def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform. Allowed sample rates are ``44100`` or ``48000``
......@@ -551,7 +593,13 @@ def _apply_probability_distribution(waveform: Tensor, density_function: str = "T
def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor:
r"""Dither increases the perceived dynamic range of audio stored at a
r"""Apply dither
.. devices:: CPU CUDA
.. properties:: TorchScript
Dither increases the perceived dynamic range of audio stored at a
particular bit-depth by eliminating nonlinear truncation distortion
(i.e. adding minimally perceived noise to mask distortion caused by quantization).
......@@ -585,6 +633,10 @@ def equalizer_biquad(
) -> Tensor:
r"""Design biquad peaking equalizer filter and perform filtering. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -622,6 +674,10 @@ def filtfilt(
) -> Tensor:
r"""Apply an IIR filter forward and backward to a waveform.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html
Args:
......@@ -665,6 +721,10 @@ def flanger(
) -> Tensor:
r"""Apply a flanger effect to the audio. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., channel, time)` .
Max 4 channels allowed
......@@ -808,6 +868,10 @@ def flanger(
def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor:
r"""Apply amplification or attenuation to the whole waveform.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): Tensor of audio of dimension (..., time).
gain_db (float, optional): Gain adjustment in decibels (dB) (Default: ``1.0``).
......@@ -826,6 +890,10 @@ def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor:
def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -922,6 +990,10 @@ except RuntimeError as err:
def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor:
r"""Perform an IIR filter by evaluating difference equation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Note:
To avoid numerical problems, small filter order is preferred.
Using double precision could also minimize numerical precision errors.
......@@ -976,6 +1048,10 @@ def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool =
def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -1020,6 +1096,11 @@ except RuntimeError as err:
def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
r"""Apply a overdrive effect to the audio. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
This effect applies a non linear distortion to the audio signal.
Args:
......@@ -1081,6 +1162,10 @@ def phaser(
) -> Tensor:
r"""Apply a phasing effect to the audio. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -1161,6 +1246,10 @@ def phaser(
def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz).
......@@ -1227,6 +1316,10 @@ def treble_biquad(
) -> Tensor:
r"""Design a treble tone-control effect. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
......@@ -1362,6 +1455,11 @@ def vad(
lp_lifter_freq: float = 2000.0,
) -> Tensor:
r"""Voice Activity Detector. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: TorchScript
Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
The algorithm currently uses a simple cepstral power measurement to detect voice,
so may be fooled by other things, especially music.
......
......@@ -63,6 +63,10 @@ def spectrogram(
r"""Create a spectrogram or a batch of spectrograms from a raw audio signal.
The spectrogram can be either magnitude-only or complex.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (Tensor): Tensor of audio of dimension `(..., time)`
pad (int): Two sided padding of signal
......@@ -146,6 +150,10 @@ def inverse_spectrogram(
r"""Create an inverse spectrogram or a batch of inverse spectrograms from the provided
complex-valued spectrogram.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time).
length (int or None): The output length of the waveform.
......@@ -226,6 +234,10 @@ def griffinlim(
) -> Tensor:
r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Implementation ported from
*librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
......@@ -312,6 +324,10 @@ def amplitude_to_DB(
) -> Tensor:
r"""Turn a spectrogram from the power/amplitude scale to the decibel scale.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The output of each tensor in a batch depends on the maximum value of that tensor,
and so may return different values for an audio clip split into snippets vs. a full clip.
......@@ -349,6 +365,10 @@ def amplitude_to_DB(
def DB_to_amplitude(x: Tensor, ref: float, power: float) -> Tensor:
r"""Turn a tensor from the decibel scale to the power/amplitude scale.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args:
x (Tensor): Input tensor before being converted to power/amplitude scale.
ref (float): Reference which the output will be scaled by.
......@@ -464,6 +484,10 @@ def melscale_fbanks(
) -> Tensor:
r"""Create a frequency bin conversion matrix.
.. devices:: CPU
.. properties:: TorchScript
Note:
For the sake of the numerical compatibility with librosa, not all the coefficients
in the resulting filter bank has magnitude of 1.
......@@ -530,6 +554,10 @@ def linear_fbanks(
) -> Tensor:
r"""Creates a linear triangular filterbank.
.. devices:: CPU
.. properties:: TorchScript
Note:
For the sake of the numerical compatibility with librosa, not all the coefficients
in the resulting filter bank has magnitude of 1.
......@@ -567,6 +595,10 @@ def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor:
r"""Create a DCT transformation matrix with shape (``n_mels``, ``n_mfcc``),
normalized depending on norm.
.. devices:: CPU
.. properties:: TorchScript
Args:
n_mfcc (int): Number of mfc coefficients to retain
n_mels (int): Number of mel filterbanks
......@@ -590,7 +622,13 @@ def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor:
def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor:
r"""Encode signal based on mu-law companding. For more info see the
r"""Encode signal based on mu-law companding.
.. devices:: CPU CUDA
.. properties:: TorchScript
For more info see the
`Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
This algorithm expects the signal has been scaled to between -1 and 1 and
......@@ -617,7 +655,13 @@ def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor:
def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor:
r"""Decode mu-law encoded signal. For more info see the
r"""Decode mu-law encoded signal.
.. devices:: CPU CUDA
.. properties:: TorchScript
For more info see the
`Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
This expects an input with values between 0 and quantization_channels - 1
......@@ -640,8 +684,11 @@ def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor:
def phase_vocoder(complex_specgrams: Tensor, rate: float, phase_advance: Tensor) -> Tensor:
r"""Given a STFT tensor, speed up in time without modifying pitch by a
factor of ``rate``.
r"""Given a STFT tensor, speed up in time without modifying pitch by a factor of ``rate``.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
complex_specgrams (Tensor):
......@@ -724,11 +771,17 @@ def mask_along_axis_iid(
axis: int,
p: float = 1.0,
) -> Tensor:
r"""
Apply a mask along ``axis``. Mask will be applied from indices ``[v_0, v_0 + v)``, where
``v`` is sampled from ``uniform(0, max_v)`` and ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with
``max_v = mask_param`` when ``p = 1.0`` and ``max_v = min(mask_param, floor(specgrams.size(axis) * p))``
otherwise.
r"""Apply a mask along ``axis``.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Mask will be applied from indices ``[v_0, v_0 + v)``,
where ``v`` is sampled from ``uniform(0, max_v)`` and
``v_0`` from ``uniform(0, specgrams.size(axis) - v)``,
with ``max_v = mask_param`` when ``p = 1.0`` and
``max_v = min(mask_param, floor(specgrams.size(axis) * p))`` otherwise.
Args:
specgrams (Tensor): Real spectrograms `(batch, channel, freq, time)`
......@@ -777,11 +830,19 @@ def mask_along_axis(
axis: int,
p: float = 1.0,
) -> Tensor:
r"""
Apply a mask along ``axis``. Mask will be applied from indices ``[v_0, v_0 + v)``, where
``v`` is sampled from ``uniform(0, max_v)`` and ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with
``max_v = mask_param`` when ``p = 1.0`` and ``max_v = min(mask_param, floor(specgrams.size(axis) * p))``
otherwise. All examples will have the same mask interval.
r"""Apply a mask along ``axis``.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Mask will be applied from indices ``[v_0, v_0 + v)``,
where ``v`` is sampled from ``uniform(0, max_v)`` and
``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with
``max_v = mask_param`` when ``p = 1.0`` and
``max_v = min(mask_param, floor(specgrams.size(axis) * p))``
otherwise.
All examples will have the same mask interval.
Args:
specgram (Tensor): Real spectrogram `(channel, freq, time)`
......@@ -829,6 +890,10 @@ def mask_along_axis(
def compute_deltas(specgram: Tensor, win_length: int = 5, mode: str = "replicate") -> Tensor:
r"""Compute delta coefficients of a tensor, usually a spectrogram:
.. devices:: CPU CUDA
.. properties:: TorchScript
.. math::
d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2}
......@@ -989,6 +1054,10 @@ def detect_pitch_frequency(
) -> Tensor:
r"""Detect pitch frequency.
.. devices:: CPU CUDA
.. properties:: TorchScript
It is implemented using normalized cross-correlation function and median smoothing.
Args:
......@@ -1030,6 +1099,10 @@ def sliding_window_cmn(
r"""
Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args:
specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)`
cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600)
......@@ -1118,8 +1191,11 @@ def spectral_centroid(
hop_length: int,
win_length: int,
) -> Tensor:
r"""
Compute the spectral centroid for each channel along the time axis.
r"""Compute the spectral centroid for each channel along the time axis.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The spectral centroid is defined as the weighted average of the
frequency values, weighted by their magnitude.
......@@ -1164,6 +1240,8 @@ def apply_codec(
r"""
Apply codecs as a form of augmentation.
.. devices:: CPU
Args:
waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```.
sample_rate (int): Sample rate of the audio waveform.
......@@ -1218,6 +1296,10 @@ def compute_kaldi_pitch(
"""Extract pitch based on method described in *A pitch extraction algorithm tuned
for automatic speech recognition* [:footcite:`6854049`].
.. devices:: CPU
.. properties:: TorchScript
This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
Args:
......@@ -1430,9 +1512,11 @@ def resample(
resampling_method: str = "sinc_interpolation",
beta: Optional[float] = None,
) -> Tensor:
r"""Resamples the waveform at the new frequency using bandlimited interpolation.
r"""Resamples the waveform at the new frequency using bandlimited interpolation [:footcite:`RESAMPLE`].
https://ccrma.stanford.edu/~jos/resample/Theory_Ideal_Bandlimited_Interpolation.html
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Note:
``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in
......@@ -1481,6 +1565,8 @@ def edit_distance(seq1: Sequence, seq2: Sequence) -> int:
"""
Calculate the word level edit (Levenshtein) distance between two sequences.
.. devices:: CPU
The function computes an edit distance allowing deletion, insertion and
substitution. The result is an integer.
......@@ -1490,8 +1576,6 @@ def edit_distance(seq1: Sequence, seq2: Sequence) -> int:
output is the edit distance between sentences (word edit distance). Users
may want to normalize the output by the length of the reference sequence.
torchscipt is not supported for this function.
Args:
seq1 (Sequence): the first sequence to compare.
seq2 (Sequence): the second sequence to compare.
......@@ -1531,6 +1615,10 @@ def pitch_shift(
"""
Shift the pitch of a waveform by ``n_steps`` steps.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args:
waveform (Tensor): The input waveform of shape `(..., time)`.
sample_rate (int): Sample rate of `waveform`.
......@@ -1601,6 +1689,11 @@ def rnnt_loss(
):
"""Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
[:footcite:`graves2012sequence`].
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The RNN Transducer loss extends the CTC loss by defining a distribution over output
sequences of all lengths, and by jointly modelling both input-output and output-output
dependencies.
......@@ -1650,6 +1743,10 @@ def psd(
) -> Tensor:
"""Compute cross-channel power spectral density (PSD) matrix.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
specgram (Tensor): Multi-channel complex-valued spectrum.
Tensor of dimension `(..., channel, freq, time)`
......@@ -1730,6 +1827,10 @@ def mvdr_weights_souden(
r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights
by the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`].
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
.. math::
\textbf{w}_{\text{MVDR}}(f) =
\frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)}
......@@ -1784,6 +1885,10 @@ def mvdr_weights_rtf(
r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights
based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
.. math::
\textbf{w}_{\text{MVDR}}(f) =
\frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}}
......@@ -1836,6 +1941,10 @@ def mvdr_weights_rtf(
def rtf_evd(psd_s: Tensor) -> Tensor:
r"""Estimate the relative transfer function (RTF) or the steering vector by eigenvalue decomposition.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args:
psd_s (Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
Tensor of dimension `(..., freq, channel, channel)`
......@@ -1852,6 +1961,10 @@ def rtf_evd(psd_s: Tensor) -> Tensor:
def rtf_power(psd_s: Tensor, psd_n: Tensor, reference_channel: Union[int, Tensor], n_iter: int = 3) -> Tensor:
r"""Estimate the relative transfer function (RTF) or the steering vector by the power method.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
psd_s (Tensor): The complex-valued covariance matrix of target speech.
Tensor of dimension `(..., freq, channel, channel)`
......@@ -1895,6 +2008,10 @@ def rtf_power(psd_s: Tensor, psd_n: Tensor, reference_channel: Union[int, Tensor
def apply_beamforming(beamform_weights: Tensor, specgram: Tensor) -> Tensor:
r"""Apply the beamforming weight to the multi-channel noisy spectrum to obtain the single-channel enhanced spectrum.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
.. math::
\hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f)
where :math:`\textbf{w}_{\text{bf}}(f)` is the beamforming weight for the :math:`f`-th frequency bin,
......
......@@ -43,6 +43,8 @@ class Hypothesis(NamedTuple):
class LexiconDecoder:
"""torchaudio.prototype.ctc_decoder.LexiconDecoder()
.. devices:: CPU
Lexically constrained CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
Note:
......
......@@ -59,6 +59,10 @@ def apply_effects_tensor(
) -> Tuple[torch.Tensor, int]:
"""Apply sox effects to given Tensor
.. devices:: CPU
.. properties:: TorchScript
Note:
This function only works on CPU Tensors.
This function works in the way very similar to ``sox`` command, however there are slight
......@@ -161,6 +165,10 @@ def apply_effects_file(
) -> Tuple[torch.Tensor, int]:
"""Apply sox effects to the audio file and load the resulting data as Tensor
.. devices:: CPU
.. properties:: TorchScript
Note:
This function works in the way very similar to ``sox`` command, however there are slight
differences. For example, ``sox`` command adds certain effects automatically (such as
......
......@@ -18,6 +18,10 @@ __all__ = []
class Spectrogram(torch.nn.Module):
r"""Create a spectrogram from an audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
win_length (int or None, optional): Window size. (Default: ``n_fft``)
......@@ -112,6 +116,10 @@ class Spectrogram(torch.nn.Module):
class InverseSpectrogram(torch.nn.Module):
r"""Create an inverse spectrogram to recover an audio signal from a spectrogram.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
win_length (int or None, optional): Window size. (Default: ``n_fft``)
......@@ -193,6 +201,10 @@ class InverseSpectrogram(torch.nn.Module):
class GriffinLim(torch.nn.Module):
r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Implementation ported from
*librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
......@@ -277,6 +289,10 @@ class GriffinLim(torch.nn.Module):
class AmplitudeToDB(torch.nn.Module):
r"""Turn a tensor from the power/amplitude scale to the decibel scale.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
This output depends on the maximum value in the input tensor, and so
may return different values for an audio clip split into snippets vs. a
full clip.
......@@ -315,8 +331,11 @@ class AmplitudeToDB(torch.nn.Module):
class MelScale(torch.nn.Module):
r"""Turn a normal STFT into a mel frequency STFT, using a conversion
matrix. This uses triangular filter banks.
r"""Turn a normal STFT into a mel frequency STFT with triangular filter banks.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
......@@ -372,8 +391,9 @@ class MelScale(torch.nn.Module):
class InverseMelScale(torch.nn.Module):
r"""Solve for a normal STFT from a mel frequency STFT, using a conversion
matrix. This uses triangular filter banks.
r"""Estimate an STFT in the normal frequency domain from the mel frequency domain.
.. devices:: CPU CUDA
It minimizes the Euclidean norm between the input mel-spectrogram and the product between
the estimated spectrogram and the filter banks using SGD.
......@@ -483,6 +503,10 @@ class InverseMelScale(torch.nn.Module):
class MelSpectrogram(torch.nn.Module):
r"""Create MelSpectrogram for a raw audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
and :py:func:`torchaudio.transforms.MelScale`.
......@@ -592,6 +616,10 @@ class MelSpectrogram(torch.nn.Module):
class MFCC(torch.nn.Module):
r"""Create the Mel-frequency cepstrum coefficients from an audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
By default, this calculates the MFCC on the DB-scaled Mel spectrogram.
This is not the textbook implementation, but is implemented here to
give consistency with librosa.
......@@ -666,6 +694,10 @@ class MFCC(torch.nn.Module):
class LFCC(torch.nn.Module):
r"""Create the linear-frequency cepstrum coefficients from an audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
By default, this calculates the LFCC on the DB-scaled linear filtered spectrogram.
This is not the textbook implementation, but is implemented here to
give consistency with librosa.
......@@ -762,7 +794,13 @@ class LFCC(torch.nn.Module):
class MuLawEncoding(torch.nn.Module):
r"""Encode signal based on mu-law companding. For more info see the
r"""Encode signal based on mu-law companding.
.. devices:: CPU CUDA
.. properties:: TorchScript
For more info see the
`Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
This algorithm assumes the signal has been scaled to between -1 and 1 and
......@@ -795,7 +833,13 @@ class MuLawEncoding(torch.nn.Module):
class MuLawDecoding(torch.nn.Module):
r"""Decode mu-law encoded signal. For more info see the
r"""Decode mu-law encoded signal.
.. devices:: CPU CUDA
.. properties:: TorchScript
For more info see the
`Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
This expects an input with values between 0 and ``quantization_channels - 1``
......@@ -829,6 +873,10 @@ class MuLawDecoding(torch.nn.Module):
class Resample(torch.nn.Module):
r"""Resample a signal from one frequency to another. A resampling method can be given.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Note:
If resampling on waveforms of higher precision than float32, there may be a small loss of precision
because the kernel is cached once as float32. If high precision resampling is important for your application,
......@@ -909,6 +957,10 @@ class Resample(torch.nn.Module):
class ComputeDeltas(torch.nn.Module):
r"""Compute delta coefficients of a tensor, usually a spectrogram.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
See `torchaudio.functional.compute_deltas` for more details.
Args:
......@@ -936,6 +988,10 @@ class ComputeDeltas(torch.nn.Module):
class TimeStretch(torch.nn.Module):
r"""Stretch stft in time without modifying pitch for a given rate.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Proposed in *SpecAugment* [:footcite:`specaugment`].
Args:
......@@ -1001,6 +1057,10 @@ class TimeStretch(torch.nn.Module):
class Fade(torch.nn.Module):
r"""Add a fade in and/or fade out to a waveform.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
fade_in_len (int, optional): Length of fade-in (time frames). (Default: ``0``)
fade_out_len (int, optional): Length of fade-out (time frames). (Default: ``0``)
......@@ -1114,6 +1174,10 @@ class _AxisMasking(torch.nn.Module):
class FrequencyMasking(_AxisMasking):
r"""Apply masking to a spectrogram in the frequency domain.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Proposed in *SpecAugment* [:footcite:`specaugment`].
Args:
......@@ -1144,6 +1208,10 @@ class FrequencyMasking(_AxisMasking):
class TimeMasking(_AxisMasking):
r"""Apply masking to a spectrogram in the time domain.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Proposed in *SpecAugment* [:footcite:`specaugment`].
Args:
......@@ -1178,6 +1246,10 @@ class TimeMasking(_AxisMasking):
class Vol(torch.nn.Module):
r"""Add a volume to a waveform.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
gain (float): Interpreted according to the given gain_type:
If ``gain_type`` = ``amplitude``, ``gain`` is a positive amplitude ratio.
......@@ -1218,6 +1290,10 @@ class SlidingWindowCmn(torch.nn.Module):
r"""
Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600)
min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start).
......@@ -1250,6 +1326,11 @@ class SlidingWindowCmn(torch.nn.Module):
class Vad(torch.nn.Module):
r"""Voice Activity Detector. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: TorchScript
Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
The algorithm currently uses a simple cepstral power measurement to detect voice,
so may be fooled by other things, especially music.
......@@ -1373,6 +1454,10 @@ class Vad(torch.nn.Module):
class SpectralCentroid(torch.nn.Module):
r"""Compute the spectral centroid for each channel along the time axis.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The spectral centroid is defined as the weighted average of the
frequency values, weighted by their magnitude.
......@@ -1429,6 +1514,10 @@ class SpectralCentroid(torch.nn.Module):
class PitchShift(torch.nn.Module):
r"""Shift the pitch of a waveform by ``n_steps`` steps.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args:
waveform (Tensor): The input waveform of shape `(..., time)`.
sample_rate (int): Sample rate of `waveform`.
......@@ -1493,6 +1582,11 @@ class PitchShift(torch.nn.Module):
class RNNTLoss(torch.nn.Module):
"""Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
[:footcite:`graves2012sequence`].
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The RNN Transducer loss extends the CTC loss by defining a distribution over output
sequences of all lengths, and by jointly modelling both input-output and output-output
dependencies.
......@@ -1575,6 +1669,10 @@ def _get_mat_trace(input: torch.Tensor, dim1: int = -1, dim2: int = -2) -> torch
class PSD(torch.nn.Module):
r"""Compute cross-channel power spectral density (PSD) matrix.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
multi_mask (bool, optional): whether to use multi-channel Time-Frequency masks. (Default: ``False``)
normalize (bool, optional): whether normalize the mask along the time dimension.
......@@ -1622,6 +1720,10 @@ class PSD(torch.nn.Module):
class MVDR(torch.nn.Module):
"""Minimum Variance Distortionless Response (MVDR) module that performs MVDR beamforming with Time-Frequency masks.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Based on https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/beamformer.py
We provide three solutions of MVDR beamforming. One is based on *reference channel selection*
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment