Commit 72ae755a authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Add devices/properties badges (#2321)

Summary:
Add badges of supported properties and devices to functionals and transforms.

This commit adds `.. devices::` and `.. properties::` directives to sphinx.

APIs with these directives will have badges (based off of shields.io) which link to the
page with description of these features.

Continuation of https://github.com/pytorch/audio/issues/2316
Excluded dtypes for further improvement, and actually added badges to most of functional/transforms.

Pull Request resolved: https://github.com/pytorch/audio/pull/2321

Reviewed By: hwangjeff

Differential Revision: D35489063

Pulled By: mthrok

fbshipit-source-id: f68a70ebb22df29d5e9bd171273bd19007a81762
parent eb23a242
...@@ -70,6 +70,7 @@ instance/ ...@@ -70,6 +70,7 @@ instance/
docs/_build/ docs/_build/
docs/src/ docs/src/
docs/source/tutorials docs/source/tutorials
docs/source/gen_images
docs/source/gen_modules docs/source/gen_modules
# PyBuilder # PyBuilder
......
...@@ -9,3 +9,9 @@ dt > em.sig-param:last-of-type::after { ...@@ -9,3 +9,9 @@ dt > em.sig-param:last-of-type::after {
content: "\a"; content: "\a";
white-space: pre; white-space: pre;
} }
/* For shields */
article.pytorch-article img.shield-badge {
width: unset;
margin-top: -18px;
margin-bottom: 9px;
}
...@@ -17,10 +17,10 @@ ...@@ -17,10 +17,10 @@
# add these directories to sys.path here. If the directory is relative to the # add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here. # documentation root, use os.path.abspath to make it absolute, like shown here.
# #
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import os import os
import sys
sys.path.insert(0, os.path.abspath("."))
import re import re
import warnings import warnings
from datetime import datetime from datetime import datetime
...@@ -342,3 +342,13 @@ def inject_minigalleries(app, what, name, obj, options, lines): ...@@ -342,3 +342,13 @@ def inject_minigalleries(app, what, name, obj, options, lines):
def setup(app): def setup(app):
app.connect("autodoc-process-docstring", inject_minigalleries) app.connect("autodoc-process-docstring", inject_minigalleries)
from custom_directives import SupportedDevices, SupportedProperties
# Register custom directives
from docutils.parsers import rst
rst.directives.register_directive("devices", SupportedDevices)
rst.directives.register_directive("properties", SupportedProperties)
import hashlib
from pathlib import Path
from typing import List
from urllib.parse import quote, urlencode
import requests
from docutils import nodes
from docutils.parsers.rst.directives.images import Image
# Directory containing this file; generated badge images are cached beneath it.
_THIS_DIR = Path(__file__).parent
# Color palette from PyTorch Developer Day 2021 Presentation Template
# Values are hex RGB strings without a leading "#", the format shields.io expects.
YELLOW = "F9DB78"
GREEN = "70AD47"
BLUE = "00B0F0"
PINK = "FF71DA"
ORANGE = "FF8300"
TEAL = "00E5D1"
GRAY = "7F7F7F"
def _get_cache_path(key, ext):
    """Return a deterministic cache-file path for ``key``, creating the cache dir.

    Args:
        key (bytes): content used to derive a unique, stable file name.
        ext (str): file extension, including the leading dot.

    Returns:
        Path: location under ``gen_images`` where the item should be stored.
    """
    cache_dir = _THIS_DIR / "gen_images"
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir / f"{hashlib.sha256(key).hexdigest()}{ext}"
def _download(url, path):
    """Fetch ``url`` and write the response body to ``path``.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    resp = requests.get(url)
    resp.raise_for_status()
    with open(path, "wb") as fileobj:
        fileobj.write(resp.content)
def _fetch_image(url):
    """Return the cached badge image for ``url``, downloading it on a cache miss.

    Returns:
        str: path of the SVG file, relative to the docs source directory.
    """
    cached = _get_cache_path(url.encode("utf-8"), ext=".svg")
    if not cached.exists():
        _download(url, cached)
    return str(cached.relative_to(_THIS_DIR))
class BaseShield(Image):
    """Image directive that renders a shields.io badge linking to the feature docs."""

    def run(self, params, alt, section) -> List[nodes.Node]:
        # Build the shields.io "static" badge URL and resolve it to a cached file.
        query = urlencode(params, quote_via=quote)
        badge_path = _fetch_image(f"https://img.shields.io/static/v1?{query}")
        self.arguments = [badge_path]
        self.options["alt"] = alt
        # Tag the image so the custom CSS (img.shield-badge) can style it.
        self.options.setdefault("class", []).append("shield-badge")
        # Clicking the badge jumps to the matching section of the feature page.
        self.options["target"] = f"supported_features.html#{section}"
        return super().run()
def _parse_devices(arg: str):
    """Validate and normalize a whitespace-separated list of device names.

    Args:
        arg: directive argument, e.g. ``"CPU CUDA"``.

    Returns:
        str: the device names, sorted and joined with ``", "``.

    Raises:
        ValueError: if any token is not one of the recognized devices.
    """
    valid_values = {"CPU", "CUDA"}
    devices = sorted(arg.strip().split())
    if any(val not in valid_values for val in devices):
        raise ValueError(
            f"One or more device values are not valid. The valid values are {valid_values}. Given value: '{arg}'"
        )
    # ``devices`` is already sorted above; no need to sort again (the
    # original sorted twice).
    return ", ".join(devices)
def _parse_properties(arg: str):
    """Validate and normalize a whitespace-separated list of property names.

    Args:
        arg: directive argument, e.g. ``"Autograd TorchScript"``.

    Returns:
        str: the property names, sorted and joined with ``", "``.

    Raises:
        ValueError: if any token is not one of the recognized properties.
    """
    valid_values = {"Autograd", "TorchScript"}
    properties = sorted(arg.strip().split())
    if any(val not in valid_values for val in properties):
        raise ValueError(
            "One or more property values are not valid. "
            f"The valid values are {valid_values}. "
            f"Given value: '{arg}'"
        )
    # ``properties`` is already sorted above; the original sorted twice.
    return ", ".join(properties)
class SupportedDevices(BaseShield):
    """Directive rendering a "Devices" badge, e.g. ``.. devices:: CPU CUDA``."""

    required_arguments = 1
    final_argument_whitespace = True

    def run(self) -> List[nodes.Node]:
        supported = _parse_devices(self.arguments[0])
        description = f"This feature supports the following devices: {supported}"
        badge_params = {
            "label": "Devices",
            "message": supported,
            "labelColor": GRAY,
            "color": BLUE,
            "style": "flat-square",
        }
        return super().run(badge_params, description, "devices")
class SupportedProperties(BaseShield):
    """Directive rendering a "Properties" badge, e.g. ``.. properties:: Autograd``."""

    required_arguments = 1
    final_argument_whitespace = True

    def run(self) -> List[nodes.Node]:
        supported = _parse_properties(self.arguments[0])
        description = f"This API supports the following properties: {supported}"
        badge_params = {
            "label": "Properties",
            "message": supported,
            "labelColor": GRAY,
            "color": GREEN,
            "style": "flat-square",
        }
        return super().run(badge_params, description, "properties")
...@@ -29,6 +29,7 @@ Features described in this documentation are classified by release status: ...@@ -29,6 +29,7 @@ Features described in this documentation are classified by release status:
:caption: Torchaudio Documentation :caption: Torchaudio Documentation
Index <self> Index <self>
supported_features
API References API References
-------------- --------------
......
@misc{RESAMPLE,
author = {Julius O. Smith},
title = {Digital Audio Resampling Home Page "Theory of Ideal Bandlimited Interpolation" section},
url = {https://ccrma.stanford.edu/~jos/resample/Theory_Ideal_Bandlimited_Interpolation.html},
month = {September},
year = {2020}
}
@article{voxpopuli, @article{voxpopuli,
author = {Changhan Wang and author = {Changhan Wang and
Morgane Rivi{\`{e}}re and Morgane Rivi{\`{e}}re and
......
Supported Features
==================
Each TorchAudio API supports a subset of PyTorch features, such as
devices and data types.
Supported features are indicated in API references like the following:
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
These badges indicate that the corresponding features are verified through automated testing.
.. note::
Missing feature icons mean that they are not tested, and this can mean
different things, depending on the API.
1. The API is compatible with the feature but not tested.
2. The API is not compatible with the feature.
In case of 2, the API might explicitly raise an error, but that is not guaranteed.
For example, APIs without an Autograd badge might throw an error during backpropagation,
or silently return a wrong gradient.
If you use an API that hasn't been labeled as supporting a feature, you might want to first verify that the
feature works fine.
Devices
-------
CPU
^^^
.. devices:: CPU
TorchAudio APIs that support CPU can perform their computation on CPU tensors.
CUDA
^^^^
.. devices:: CUDA
TorchAudio APIs that support CUDA can perform their computation on CUDA devices.
In case of functions, move the tensor arguments to CUDA device before passing them to a function.
For example:
.. code:: python
cuda = torch.device("cuda")
waveform = waveform.to(cuda)
spectrogram = torchaudio.functional.spectrogram(waveform)
Classes with CUDA support are implemented as subclasses of :py:class:`torch.nn.Module`.
It is also necessary to move the instance to CUDA device, before passing CUDA tensors.
For example:
.. code:: python
cuda = torch.device("cuda")
resampler = torchaudio.transforms.Resample(8000, 16000)
resampler.to(cuda)
    waveform = waveform.to(cuda)
resampled = resampler(waveform)
Properties
----------
Autograd
^^^^^^^^
.. properties:: Autograd
TorchAudio APIs with autograd support can correctly backpropagate gradients.
For the basics of autograd, please refer to this `tutorial <https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html>`_.
.. note::
APIs without this mark may or may not raise an error during backpropagation.
The absence of an error raised during backpropagation does not necessarily mean the gradient is correct.
TorchScript
^^^^^^^^^^^
.. properties:: TorchScript
TorchAudio APIs with TorchScript support can be serialized and executed in non-Python environments.
For details on TorchScript, please refer to the `documentation <https://pytorch.org/docs/stable/jit.html>`_.
...@@ -251,6 +251,33 @@ class Autograd(TestBaseMixin): ...@@ -251,6 +251,33 @@ class Autograd(TestBaseMixin):
Q = torch.tensor(Q) Q = torch.tensor(Q)
self.assert_grad(F.bandreject_biquad, (x, sr, central_freq, Q)) self.assert_grad(F.bandreject_biquad, (x, sr, central_freq, Q))
def test_deemph_biquad(self):
    # Verify gradients of deemph_biquad via gradcheck. Fixed seed keeps the
    # test deterministic; 44100 Hz is one of the two sample rates the API accepts.
    torch.random.manual_seed(2434)
    x = get_whitenoise(sample_rate=22050, duration=0.01, n_channels=1)
    self.assert_grad(F.deemph_biquad, (x, 44100))
def test_flanger(self):
    # Verify gradients of flanger via gradcheck on a short noise clip.
    # NOTE(review): the noise is generated at 8 kHz but flanger is told
    # 44.1 kHz — presumably only the tensor values matter for gradcheck;
    # confirm the mismatch is intentional.
    torch.random.manual_seed(2434)
    x = get_whitenoise(sample_rate=8000, duration=0.01, n_channels=1)
    self.assert_grad(F.flanger, (x, 44100))
def test_gain(self):
    # Verify gradients of gain (dB amplification) via gradcheck.
    torch.random.manual_seed(2434)
    x = get_whitenoise(sample_rate=8000, duration=0.01, n_channels=1)
    self.assert_grad(F.gain, (x, 1.1))
def test_overdrive(self):
    # Verify gradients of overdrive via gradcheck.
    torch.random.manual_seed(2434)
    x = get_whitenoise(sample_rate=8000, duration=0.01, n_channels=1)
    # Bug fix: this test previously called F.gain, so F.overdrive was
    # never actually exercised; call the API the test is named after.
    self.assert_grad(F.overdrive, (x,))
@parameterized.expand([(True,), (False,)])
def test_phaser(self, sinusoidal):
    # Verify gradients of phaser via gradcheck, covering both modulation
    # shapes (sinusoidal=True and triangular=False).
    torch.random.manual_seed(2434)
    sr = 8000
    x = get_whitenoise(sample_rate=sr, duration=0.01, n_channels=1)
    self.assert_grad(F.phaser, (x, sr, sinusoidal))
@parameterized.expand( @parameterized.expand(
[ [
(True,), (True,),
......
...@@ -86,6 +86,10 @@ class Transforms(TestBaseMixin): ...@@ -86,6 +86,10 @@ class Transforms(TestBaseMixin):
tensor = torch.rand((1, 10)) tensor = torch.rand((1, 10))
self._assert_consistency(T.MuLawDecoding(), tensor) self._assert_consistency(T.MuLawDecoding(), tensor)
def test_ComputeDelta(self):
    # Verify that T.ComputeDeltas produces the same output when run eagerly
    # and as a TorchScript module, on an arbitrary (1, 10) input.
    tensor = torch.rand((1, 10))
    self._assert_consistency(T.ComputeDeltas(), tensor)
def test_Fade(self): def test_Fade(self):
waveform = common_utils.get_whitenoise() waveform = common_utils.get_whitenoise()
fade_in_len = 3000 fade_in_len = 3000
......
...@@ -67,6 +67,10 @@ def _generate_wave_table( ...@@ -67,6 +67,10 @@ def _generate_wave_table(
def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor: def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
r"""Design two-pole all-pass filter. Similar to SoX implementation. r"""Design two-pole all-pass filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform(torch.Tensor): audio waveform of dimension of `(..., time)` waveform(torch.Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -107,6 +111,10 @@ def band_biquad( ...@@ -107,6 +111,10 @@ def band_biquad(
) -> Tensor: ) -> Tensor:
r"""Design two-pole band filter. Similar to SoX implementation. r"""Design two-pole band filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -156,6 +164,10 @@ def bandpass_biquad( ...@@ -156,6 +164,10 @@ def bandpass_biquad(
) -> Tensor: ) -> Tensor:
r"""Design two-pole band-pass filter. Similar to SoX implementation. r"""Design two-pole band-pass filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -192,6 +204,10 @@ def bandpass_biquad( ...@@ -192,6 +204,10 @@ def bandpass_biquad(
def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor: def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
r"""Design two-pole band-reject filter. Similar to SoX implementation. r"""Design two-pole band-reject filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -231,6 +247,10 @@ def bass_biquad( ...@@ -231,6 +247,10 @@ def bass_biquad(
) -> Tensor: ) -> Tensor:
r"""Design a bass tone-control effect. Similar to SoX implementation. r"""Design a bass tone-control effect. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -271,7 +291,10 @@ def bass_biquad( ...@@ -271,7 +291,10 @@ def bass_biquad(
def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor: def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor:
r"""Perform a biquad filter of input tensor. Initial conditions set to 0. r"""Perform a biquad filter of input tensor. Initial conditions set to 0.
https://en.wikipedia.org/wiki/Digital_biquad_filter
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
...@@ -284,6 +307,9 @@ def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: flo ...@@ -284,6 +307,9 @@ def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: flo
Returns: Returns:
Tensor: Waveform with dimension of `(..., time)` Tensor: Waveform with dimension of `(..., time)`
Reference:
- https://en.wikipedia.org/wiki/Digital_biquad_filter
""" """
device = waveform.device device = waveform.device
...@@ -306,6 +332,11 @@ def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: flo ...@@ -306,6 +332,11 @@ def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: flo
def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor: def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
r"""Apply contrast effect. Similar to SoX implementation. r"""Apply contrast effect. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Comparable with compression, this effect modifies an audio signal to make it sound louder Comparable with compression, this effect modifies an audio signal to make it sound louder
Args: Args:
...@@ -335,6 +366,11 @@ def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor: ...@@ -335,6 +366,11 @@ def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor: def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor:
r"""Apply a DC shift to the audio. Similar to SoX implementation. r"""Apply a DC shift to the audio. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: TorchScript
This can be useful to remove a DC offset This can be useful to remove a DC offset
(caused perhaps by a hardware problem in the recording chain) from the audio (caused perhaps by a hardware problem in the recording chain) from the audio
...@@ -357,6 +393,8 @@ def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None ...@@ -357,6 +393,8 @@ def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None
if limiter_gain is not None: if limiter_gain is not None:
limiter_threshold = 1.0 - (abs(shift) - limiter_gain) limiter_threshold = 1.0 - (abs(shift) - limiter_gain)
# Note:
# the following index-based update breaks auto-grad support
if limiter_gain is not None and shift > 0: if limiter_gain is not None and shift > 0:
mask = waveform > limiter_threshold mask = waveform > limiter_threshold
temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold) temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold)
...@@ -376,6 +414,10 @@ def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None ...@@ -376,6 +414,10 @@ def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None
def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor: def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation. r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000`` sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000``
...@@ -551,7 +593,13 @@ def _apply_probability_distribution(waveform: Tensor, density_function: str = "T ...@@ -551,7 +593,13 @@ def _apply_probability_distribution(waveform: Tensor, density_function: str = "T
def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor: def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor:
r"""Dither increases the perceived dynamic range of audio stored at a r"""Apply dither
.. devices:: CPU CUDA
.. properties:: TorchScript
Dither increases the perceived dynamic range of audio stored at a
particular bit-depth by eliminating nonlinear truncation distortion particular bit-depth by eliminating nonlinear truncation distortion
(i.e. adding minimally perceived noise to mask distortion caused by quantization). (i.e. adding minimally perceived noise to mask distortion caused by quantization).
...@@ -585,6 +633,10 @@ def equalizer_biquad( ...@@ -585,6 +633,10 @@ def equalizer_biquad(
) -> Tensor: ) -> Tensor:
r"""Design biquad peaking equalizer filter and perform filtering. Similar to SoX implementation. r"""Design biquad peaking equalizer filter and perform filtering. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -622,6 +674,10 @@ def filtfilt( ...@@ -622,6 +674,10 @@ def filtfilt(
) -> Tensor: ) -> Tensor:
r"""Apply an IIR filter forward and backward to a waveform. r"""Apply an IIR filter forward and backward to a waveform.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html
Args: Args:
...@@ -665,6 +721,10 @@ def flanger( ...@@ -665,6 +721,10 @@ def flanger(
) -> Tensor: ) -> Tensor:
r"""Apply a flanger effect to the audio. Similar to SoX implementation. r"""Apply a flanger effect to the audio. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., channel, time)` . waveform (Tensor): audio waveform of dimension of `(..., channel, time)` .
Max 4 channels allowed Max 4 channels allowed
...@@ -808,6 +868,10 @@ def flanger( ...@@ -808,6 +868,10 @@ def flanger(
def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor: def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor:
r"""Apply amplification or attenuation to the whole waveform. r"""Apply amplification or attenuation to the whole waveform.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): Tensor of audio of dimension (..., time). waveform (Tensor): Tensor of audio of dimension (..., time).
gain_db (float, optional) Gain adjustment in decibels (dB) (Default: ``1.0``). gain_db (float, optional) Gain adjustment in decibels (dB) (Default: ``1.0``).
...@@ -826,6 +890,10 @@ def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor: ...@@ -826,6 +890,10 @@ def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor:
def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor: def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation. r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -922,6 +990,10 @@ except RuntimeError as err: ...@@ -922,6 +990,10 @@ except RuntimeError as err:
def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor: def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor:
r"""Perform an IIR filter by evaluating difference equation. r"""Perform an IIR filter by evaluating difference equation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Note: Note:
To avoid numerical problems, small filter order is preferred. To avoid numerical problems, small filter order is preferred.
Using double precision could also minimize numerical precision errors. Using double precision could also minimize numerical precision errors.
...@@ -976,6 +1048,10 @@ def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = ...@@ -976,6 +1048,10 @@ def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool =
def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor: def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation. r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (torch.Tensor): audio waveform of dimension of `(..., time)` waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -1020,6 +1096,11 @@ except RuntimeError as err: ...@@ -1020,6 +1096,11 @@ except RuntimeError as err:
def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor: def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
r"""Apply a overdrive effect to the audio. Similar to SoX implementation. r"""Apply a overdrive effect to the audio. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
This effect applies a non linear distortion to the audio signal. This effect applies a non linear distortion to the audio signal.
Args: Args:
...@@ -1081,6 +1162,10 @@ def phaser( ...@@ -1081,6 +1162,10 @@ def phaser(
) -> Tensor: ) -> Tensor:
r"""Apply a phasing effect to the audio. Similar to SoX implementation. r"""Apply a phasing effect to the audio. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -1161,6 +1246,10 @@ def phaser( ...@@ -1161,6 +1246,10 @@ def phaser(
def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor: def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation. r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz). sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz).
...@@ -1227,6 +1316,10 @@ def treble_biquad( ...@@ -1227,6 +1316,10 @@ def treble_biquad(
) -> Tensor: ) -> Tensor:
r"""Design a treble tone-control effect. Similar to SoX implementation. r"""Design a treble tone-control effect. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): audio waveform of dimension of `(..., time)` waveform (Tensor): audio waveform of dimension of `(..., time)`
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
...@@ -1362,6 +1455,11 @@ def vad( ...@@ -1362,6 +1455,11 @@ def vad(
lp_lifter_freq: float = 2000.0, lp_lifter_freq: float = 2000.0,
) -> Tensor: ) -> Tensor:
r"""Voice Activity Detector. Similar to SoX implementation. r"""Voice Activity Detector. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: TorchScript
Attempts to trim silence and quiet background sounds from the ends of recordings of speech. Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
The algorithm currently uses a simple cepstral power measurement to detect voice, The algorithm currently uses a simple cepstral power measurement to detect voice,
so may be fooled by other things, especially music. so may be fooled by other things, especially music.
......
...@@ -63,6 +63,10 @@ def spectrogram( ...@@ -63,6 +63,10 @@ def spectrogram(
r"""Create a spectrogram or a batch of spectrograms from a raw audio signal. r"""Create a spectrogram or a batch of spectrograms from a raw audio signal.
The spectrogram can be either magnitude-only or complex. The spectrogram can be either magnitude-only or complex.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
waveform (Tensor): Tensor of audio of dimension `(..., time)` waveform (Tensor): Tensor of audio of dimension `(..., time)`
pad (int): Two sided padding of signal pad (int): Two sided padding of signal
...@@ -146,6 +150,10 @@ def inverse_spectrogram( ...@@ -146,6 +150,10 @@ def inverse_spectrogram(
r"""Create an inverse spectrogram or a batch of inverse spectrograms from the provided r"""Create an inverse spectrogram or a batch of inverse spectrograms from the provided
complex-valued spectrogram. complex-valued spectrogram.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time). spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time).
length (int or None): The output length of the waveform. length (int or None): The output length of the waveform.
...@@ -226,6 +234,10 @@ def griffinlim( ...@@ -226,6 +234,10 @@ def griffinlim(
) -> Tensor: ) -> Tensor:
r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Implementation ported from Implementation ported from
*librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`] *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`]. and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
...@@ -312,6 +324,10 @@ def amplitude_to_DB( ...@@ -312,6 +324,10 @@ def amplitude_to_DB(
) -> Tensor: ) -> Tensor:
r"""Turn a spectrogram from the power/amplitude scale to the decibel scale. r"""Turn a spectrogram from the power/amplitude scale to the decibel scale.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The output of each tensor in a batch depends on the maximum value of that tensor, The output of each tensor in a batch depends on the maximum value of that tensor,
and so may return different values for an audio clip split into snippets vs. a full clip. and so may return different values for an audio clip split into snippets vs. a full clip.
...@@ -349,6 +365,10 @@ def amplitude_to_DB( ...@@ -349,6 +365,10 @@ def amplitude_to_DB(
def DB_to_amplitude(x: Tensor, ref: float, power: float) -> Tensor: def DB_to_amplitude(x: Tensor, ref: float, power: float) -> Tensor:
r"""Turn a tensor from the decibel scale to the power/amplitude scale. r"""Turn a tensor from the decibel scale to the power/amplitude scale.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args: Args:
x (Tensor): Input tensor before being converted to power/amplitude scale. x (Tensor): Input tensor before being converted to power/amplitude scale.
ref (float): Reference which the output will be scaled by. ref (float): Reference which the output will be scaled by.
...@@ -464,6 +484,10 @@ def melscale_fbanks( ...@@ -464,6 +484,10 @@ def melscale_fbanks(
) -> Tensor: ) -> Tensor:
r"""Create a frequency bin conversion matrix. r"""Create a frequency bin conversion matrix.
.. devices:: CPU
.. properties:: TorchScript
Note: Note:
For the sake of the numerical compatibility with librosa, not all the coefficients For the sake of the numerical compatibility with librosa, not all the coefficients
in the resulting filter bank has magnitude of 1. in the resulting filter bank has magnitude of 1.
...@@ -530,6 +554,10 @@ def linear_fbanks( ...@@ -530,6 +554,10 @@ def linear_fbanks(
) -> Tensor: ) -> Tensor:
r"""Creates a linear triangular filterbank. r"""Creates a linear triangular filterbank.
.. devices:: CPU
.. properties:: TorchScript
Note: Note:
For the sake of the numerical compatibility with librosa, not all the coefficients For the sake of the numerical compatibility with librosa, not all the coefficients
in the resulting filter bank has magnitude of 1. in the resulting filter bank has magnitude of 1.
...@@ -567,6 +595,10 @@ def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor: ...@@ -567,6 +595,10 @@ def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor:
r"""Create a DCT transformation matrix with shape (``n_mels``, ``n_mfcc``), r"""Create a DCT transformation matrix with shape (``n_mels``, ``n_mfcc``),
normalized depending on norm. normalized depending on norm.
.. devices:: CPU
.. properties:: TorchScript
Args: Args:
n_mfcc (int): Number of mfc coefficients to retain n_mfcc (int): Number of mfc coefficients to retain
n_mels (int): Number of mel filterbanks n_mels (int): Number of mel filterbanks
...@@ -590,7 +622,13 @@ def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor: ...@@ -590,7 +622,13 @@ def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor:
def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor: def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor:
r"""Encode signal based on mu-law companding. For more info see the r"""Encode signal based on mu-law companding.
.. devices:: CPU CUDA
.. properties:: TorchScript
For more info see the
`Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_ `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
This algorithm expects the signal has been scaled to between -1 and 1 and This algorithm expects the signal has been scaled to between -1 and 1 and
...@@ -617,7 +655,13 @@ def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor: ...@@ -617,7 +655,13 @@ def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor:
def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor: def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor:
r"""Decode mu-law encoded signal. For more info see the r"""Decode mu-law encoded signal.
.. devices:: CPU CUDA
.. properties:: TorchScript
For more info see the
`Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_ `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
This expects an input with values between 0 and quantization_channels - 1 This expects an input with values between 0 and quantization_channels - 1
...@@ -640,8 +684,11 @@ def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor: ...@@ -640,8 +684,11 @@ def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor:
def phase_vocoder(complex_specgrams: Tensor, rate: float, phase_advance: Tensor) -> Tensor: def phase_vocoder(complex_specgrams: Tensor, rate: float, phase_advance: Tensor) -> Tensor:
r"""Given a STFT tensor, speed up in time without modifying pitch by a r"""Given a STFT tensor, speed up in time without modifying pitch by a factor of ``rate``.
factor of ``rate``.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
complex_specgrams (Tensor): complex_specgrams (Tensor):
...@@ -724,11 +771,17 @@ def mask_along_axis_iid( ...@@ -724,11 +771,17 @@ def mask_along_axis_iid(
axis: int, axis: int,
p: float = 1.0, p: float = 1.0,
) -> Tensor: ) -> Tensor:
r""" r"""Apply a mask along ``axis``.
Apply a mask along ``axis``. Mask will be applied from indices ``[v_0, v_0 + v)``, where
``v`` is sampled from ``uniform(0, max_v)`` and ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with .. devices:: CPU CUDA
``max_v = mask_param`` when ``p = 1.0`` and ``max_v = min(mask_param, floor(specgrams.size(axis) * p))``
otherwise. .. properties:: Autograd TorchScript
Mask will be applied from indices ``[v_0, v_0 + v)``,
where ``v`` is sampled from ``uniform(0, max_v)`` and
``v_0`` from ``uniform(0, specgrams.size(axis) - v)``,
with ``max_v = mask_param`` when ``p = 1.0`` and
``max_v = min(mask_param, floor(specgrams.size(axis) * p))`` otherwise.
Args: Args:
specgrams (Tensor): Real spectrograms `(batch, channel, freq, time)` specgrams (Tensor): Real spectrograms `(batch, channel, freq, time)`
...@@ -777,11 +830,19 @@ def mask_along_axis( ...@@ -777,11 +830,19 @@ def mask_along_axis(
axis: int, axis: int,
p: float = 1.0, p: float = 1.0,
) -> Tensor: ) -> Tensor:
r""" r"""Apply a mask along ``axis``.
Apply a mask along ``axis``. Mask will be applied from indices ``[v_0, v_0 + v)``, where
``v`` is sampled from ``uniform(0, max_v)`` and ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with .. devices:: CPU CUDA
``max_v = mask_param`` when ``p = 1.0`` and ``max_v = min(mask_param, floor(specgrams.size(axis) * p))``
otherwise. All examples will have the same mask interval. .. properties:: Autograd TorchScript
Mask will be applied from indices ``[v_0, v_0 + v)``,
where ``v`` is sampled from ``uniform(0, max_v)`` and
``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with
``max_v = mask_param`` when ``p = 1.0`` and
``max_v = min(mask_param, floor(specgrams.size(axis) * p))``
otherwise.
All examples will have the same mask interval.
Args: Args:
specgram (Tensor): Real spectrogram `(channel, freq, time)` specgram (Tensor): Real spectrogram `(channel, freq, time)`
...@@ -829,6 +890,10 @@ def mask_along_axis( ...@@ -829,6 +890,10 @@ def mask_along_axis(
def compute_deltas(specgram: Tensor, win_length: int = 5, mode: str = "replicate") -> Tensor: def compute_deltas(specgram: Tensor, win_length: int = 5, mode: str = "replicate") -> Tensor:
r"""Compute delta coefficients of a tensor, usually a spectrogram: r"""Compute delta coefficients of a tensor, usually a spectrogram:
.. devices:: CPU CUDA
.. properties:: TorchScript
.. math:: .. math::
d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2} d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2}
...@@ -989,6 +1054,10 @@ def detect_pitch_frequency( ...@@ -989,6 +1054,10 @@ def detect_pitch_frequency(
) -> Tensor: ) -> Tensor:
r"""Detect pitch frequency. r"""Detect pitch frequency.
.. devices:: CPU CUDA
.. properties:: TorchScript
It is implemented using normalized cross-correlation function and median smoothing. It is implemented using normalized cross-correlation function and median smoothing.
Args: Args:
...@@ -1030,6 +1099,10 @@ def sliding_window_cmn( ...@@ -1030,6 +1099,10 @@ def sliding_window_cmn(
r""" r"""
Apply sliding-window cepstral mean (and optionally variance) normalization per utterance. Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args: Args:
specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)` specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)`
cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600) cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600)
...@@ -1118,8 +1191,11 @@ def spectral_centroid( ...@@ -1118,8 +1191,11 @@ def spectral_centroid(
hop_length: int, hop_length: int,
win_length: int, win_length: int,
) -> Tensor: ) -> Tensor:
r""" r"""Compute the spectral centroid for each channel along the time axis.
Compute the spectral centroid for each channel along the time axis.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The spectral centroid is defined as the weighted average of the The spectral centroid is defined as the weighted average of the
frequency values, weighted by their magnitude. frequency values, weighted by their magnitude.
...@@ -1164,6 +1240,8 @@ def apply_codec( ...@@ -1164,6 +1240,8 @@ def apply_codec(
r""" r"""
Apply codecs as a form of augmentation. Apply codecs as a form of augmentation.
.. devices:: CPU
Args: Args:
waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```. waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```.
sample_rate (int): Sample rate of the audio waveform. sample_rate (int): Sample rate of the audio waveform.
...@@ -1218,6 +1296,10 @@ def compute_kaldi_pitch( ...@@ -1218,6 +1296,10 @@ def compute_kaldi_pitch(
"""Extract pitch based on method described in *A pitch extraction algorithm tuned """Extract pitch based on method described in *A pitch extraction algorithm tuned
for automatic speech recognition* [:footcite:`6854049`]. for automatic speech recognition* [:footcite:`6854049`].
.. devices:: CPU
.. properties:: TorchScript
This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi. This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
Args: Args:
...@@ -1430,9 +1512,11 @@ def resample( ...@@ -1430,9 +1512,11 @@ def resample(
resampling_method: str = "sinc_interpolation", resampling_method: str = "sinc_interpolation",
beta: Optional[float] = None, beta: Optional[float] = None,
) -> Tensor: ) -> Tensor:
r"""Resamples the waveform at the new frequency using bandlimited interpolation. r"""Resamples the waveform at the new frequency using bandlimited interpolation. [:footcite:`RESAMPLE`].
https://ccrma.stanford.edu/~jos/resample/Theory_Ideal_Bandlimited_Interpolation.html .. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Note: Note:
``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in
...@@ -1481,6 +1565,8 @@ def edit_distance(seq1: Sequence, seq2: Sequence) -> int: ...@@ -1481,6 +1565,8 @@ def edit_distance(seq1: Sequence, seq2: Sequence) -> int:
""" """
Calculate the word level edit (Levenshtein) distance between two sequences. Calculate the word level edit (Levenshtein) distance between two sequences.
.. devices:: CPU
The function computes an edit distance allowing deletion, insertion and The function computes an edit distance allowing deletion, insertion and
substitution. The result is an integer. substitution. The result is an integer.
...@@ -1490,8 +1576,6 @@ def edit_distance(seq1: Sequence, seq2: Sequence) -> int: ...@@ -1490,8 +1576,6 @@ def edit_distance(seq1: Sequence, seq2: Sequence) -> int:
output is the edit distance between sentences (word edit distance). Users output is the edit distance between sentences (word edit distance). Users
may want to normalize the output by the length of the reference sequence. may want to normalize the output by the length of the reference sequence.
torchscipt is not supported for this function.
Args: Args:
seq1 (Sequence): the first sequence to compare. seq1 (Sequence): the first sequence to compare.
seq2 (Sequence): the second sequence to compare. seq2 (Sequence): the second sequence to compare.
...@@ -1531,6 +1615,10 @@ def pitch_shift( ...@@ -1531,6 +1615,10 @@ def pitch_shift(
""" """
Shift the pitch of a waveform by ``n_steps`` steps. Shift the pitch of a waveform by ``n_steps`` steps.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args: Args:
waveform (Tensor): The input waveform of shape `(..., time)`. waveform (Tensor): The input waveform of shape `(..., time)`.
sample_rate (int): Sample rate of `waveform`. sample_rate (int): Sample rate of `waveform`.
...@@ -1601,6 +1689,11 @@ def rnnt_loss( ...@@ -1601,6 +1689,11 @@ def rnnt_loss(
): ):
"""Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks* """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
[:footcite:`graves2012sequence`]. [:footcite:`graves2012sequence`].
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The RNN Transducer loss extends the CTC loss by defining a distribution over output The RNN Transducer loss extends the CTC loss by defining a distribution over output
sequences of all lengths, and by jointly modelling both input-output and output-output sequences of all lengths, and by jointly modelling both input-output and output-output
dependencies. dependencies.
...@@ -1650,6 +1743,10 @@ def psd( ...@@ -1650,6 +1743,10 @@ def psd(
) -> Tensor: ) -> Tensor:
"""Compute cross-channel power spectral density (PSD) matrix. """Compute cross-channel power spectral density (PSD) matrix.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
specgram (Tensor): Multi-channel complex-valued spectrum. specgram (Tensor): Multi-channel complex-valued spectrum.
Tensor of dimension `(..., channel, freq, time)` Tensor of dimension `(..., channel, freq, time)`
...@@ -1730,6 +1827,10 @@ def mvdr_weights_souden( ...@@ -1730,6 +1827,10 @@ def mvdr_weights_souden(
r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights
by the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`]. by the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`].
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
.. math:: .. math::
\textbf{w}_{\text{MVDR}}(f) = \textbf{w}_{\text{MVDR}}(f) =
\frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)} \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)}
...@@ -1784,6 +1885,10 @@ def mvdr_weights_rtf( ...@@ -1784,6 +1885,10 @@ def mvdr_weights_rtf(
r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights
based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise. based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
.. math:: .. math::
\textbf{w}_{\text{MVDR}}(f) = \textbf{w}_{\text{MVDR}}(f) =
\frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}} \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}}
...@@ -1836,6 +1941,10 @@ def mvdr_weights_rtf( ...@@ -1836,6 +1941,10 @@ def mvdr_weights_rtf(
def rtf_evd(psd_s: Tensor) -> Tensor: def rtf_evd(psd_s: Tensor) -> Tensor:
r"""Estimate the relative transfer function (RTF) or the steering vector by eigenvalue decomposition. r"""Estimate the relative transfer function (RTF) or the steering vector by eigenvalue decomposition.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args: Args:
psd_s (Tensor): The complex-valued power spectral density (PSD) matrix of target speech. psd_s (Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
Tensor of dimension `(..., freq, channel, channel)` Tensor of dimension `(..., freq, channel, channel)`
...@@ -1852,6 +1961,10 @@ def rtf_evd(psd_s: Tensor) -> Tensor: ...@@ -1852,6 +1961,10 @@ def rtf_evd(psd_s: Tensor) -> Tensor:
def rtf_power(psd_s: Tensor, psd_n: Tensor, reference_channel: Union[int, Tensor], n_iter: int = 3) -> Tensor: def rtf_power(psd_s: Tensor, psd_n: Tensor, reference_channel: Union[int, Tensor], n_iter: int = 3) -> Tensor:
r"""Estimate the relative transfer function (RTF) or the steering vector by the power method. r"""Estimate the relative transfer function (RTF) or the steering vector by the power method.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
psd_s (Tensor): The complex-valued covariance matrix of target speech. psd_s (Tensor): The complex-valued covariance matrix of target speech.
Tensor of dimension `(..., freq, channel, channel)` Tensor of dimension `(..., freq, channel, channel)`
...@@ -1895,6 +2008,10 @@ def rtf_power(psd_s: Tensor, psd_n: Tensor, reference_channel: Union[int, Tensor ...@@ -1895,6 +2008,10 @@ def rtf_power(psd_s: Tensor, psd_n: Tensor, reference_channel: Union[int, Tensor
def apply_beamforming(beamform_weights: Tensor, specgram: Tensor) -> Tensor: def apply_beamforming(beamform_weights: Tensor, specgram: Tensor) -> Tensor:
r"""Apply the beamforming weight to the multi-channel noisy spectrum to obtain the single-channel enhanced spectrum. r"""Apply the beamforming weight to the multi-channel noisy spectrum to obtain the single-channel enhanced spectrum.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
.. math:: .. math::
\hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f) \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f)
where :math:`\textbf{w}_{\text{bf}}(f)` is the beamforming weight for the :math:`f`-th frequency bin, where :math:`\textbf{w}_{\text{bf}}(f)` is the beamforming weight for the :math:`f`-th frequency bin,
......
...@@ -43,6 +43,8 @@ class Hypothesis(NamedTuple): ...@@ -43,6 +43,8 @@ class Hypothesis(NamedTuple):
class LexiconDecoder: class LexiconDecoder:
"""torchaudio.prototype.ctc_decoder.LexiconDecoder() """torchaudio.prototype.ctc_decoder.LexiconDecoder()
.. devices:: CPU
Lexically contrained CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`]. Lexically contrained CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
Note: Note:
......
...@@ -59,6 +59,10 @@ def apply_effects_tensor( ...@@ -59,6 +59,10 @@ def apply_effects_tensor(
) -> Tuple[torch.Tensor, int]: ) -> Tuple[torch.Tensor, int]:
"""Apply sox effects to given Tensor """Apply sox effects to given Tensor
.. devices:: CPU
.. properties:: TorchScript
Note: Note:
This function only works on CPU Tensors. This function only works on CPU Tensors.
This function works in the way very similar to ``sox`` command, however there are slight This function works in the way very similar to ``sox`` command, however there are slight
...@@ -161,6 +165,10 @@ def apply_effects_file( ...@@ -161,6 +165,10 @@ def apply_effects_file(
) -> Tuple[torch.Tensor, int]: ) -> Tuple[torch.Tensor, int]:
"""Apply sox effects to the audio file and load the resulting data as Tensor """Apply sox effects to the audio file and load the resulting data as Tensor
.. devices:: CPU
.. properties:: TorchScript
Note: Note:
This function works in the way very similar to ``sox`` command, however there are slight This function works in the way very similar to ``sox`` command, however there are slight
differences. For example, ``sox`` commnad adds certain effects automatically (such as differences. For example, ``sox`` commnad adds certain effects automatically (such as
......
...@@ -18,6 +18,10 @@ __all__ = [] ...@@ -18,6 +18,10 @@ __all__ = []
class Spectrogram(torch.nn.Module): class Spectrogram(torch.nn.Module):
r"""Create a spectrogram from a audio signal. r"""Create a spectrogram from a audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
win_length (int or None, optional): Window size. (Default: ``n_fft``) win_length (int or None, optional): Window size. (Default: ``n_fft``)
...@@ -112,6 +116,10 @@ class Spectrogram(torch.nn.Module): ...@@ -112,6 +116,10 @@ class Spectrogram(torch.nn.Module):
class InverseSpectrogram(torch.nn.Module): class InverseSpectrogram(torch.nn.Module):
r"""Create an inverse spectrogram to recover an audio signal from a spectrogram. r"""Create an inverse spectrogram to recover an audio signal from a spectrogram.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
win_length (int or None, optional): Window size. (Default: ``n_fft``) win_length (int or None, optional): Window size. (Default: ``n_fft``)
...@@ -193,6 +201,10 @@ class InverseSpectrogram(torch.nn.Module): ...@@ -193,6 +201,10 @@ class InverseSpectrogram(torch.nn.Module):
class GriffinLim(torch.nn.Module): class GriffinLim(torch.nn.Module):
r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Implementation ported from Implementation ported from
*librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`] *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`]. and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
...@@ -277,6 +289,10 @@ class GriffinLim(torch.nn.Module): ...@@ -277,6 +289,10 @@ class GriffinLim(torch.nn.Module):
class AmplitudeToDB(torch.nn.Module): class AmplitudeToDB(torch.nn.Module):
r"""Turn a tensor from the power/amplitude scale to the decibel scale. r"""Turn a tensor from the power/amplitude scale to the decibel scale.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
This output depends on the maximum value in the input tensor, and so This output depends on the maximum value in the input tensor, and so
may return different values for an audio clip split into snippets vs. a may return different values for an audio clip split into snippets vs. a
a full clip. a full clip.
...@@ -315,8 +331,11 @@ class AmplitudeToDB(torch.nn.Module): ...@@ -315,8 +331,11 @@ class AmplitudeToDB(torch.nn.Module):
class MelScale(torch.nn.Module): class MelScale(torch.nn.Module):
r"""Turn a normal STFT into a mel frequency STFT, using a conversion r"""Turn a normal STFT into a mel frequency STFT with triangular filter banks.
matrix. This uses triangular filter banks.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
...@@ -372,8 +391,9 @@ class MelScale(torch.nn.Module): ...@@ -372,8 +391,9 @@ class MelScale(torch.nn.Module):
class InverseMelScale(torch.nn.Module): class InverseMelScale(torch.nn.Module):
r"""Solve for a normal STFT from a mel frequency STFT, using a conversion r"""Estimate a STFT in normal frequency domain from mel frequency domain.
matrix. This uses triangular filter banks.
.. devices:: CPU CUDA
It minimizes the euclidian norm between the input mel-spectrogram and the product between It minimizes the euclidian norm between the input mel-spectrogram and the product between
the estimated spectrogram and the filter banks using SGD. the estimated spectrogram and the filter banks using SGD.
...@@ -483,6 +503,10 @@ class InverseMelScale(torch.nn.Module): ...@@ -483,6 +503,10 @@ class InverseMelScale(torch.nn.Module):
class MelSpectrogram(torch.nn.Module): class MelSpectrogram(torch.nn.Module):
r"""Create MelSpectrogram for a raw audio signal. r"""Create MelSpectrogram for a raw audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and
and :py:func:`torchaudio.transforms.MelScale`. and :py:func:`torchaudio.transforms.MelScale`.
...@@ -592,6 +616,10 @@ class MelSpectrogram(torch.nn.Module): ...@@ -592,6 +616,10 @@ class MelSpectrogram(torch.nn.Module):
class MFCC(torch.nn.Module): class MFCC(torch.nn.Module):
r"""Create the Mel-frequency cepstrum coefficients from an audio signal. r"""Create the Mel-frequency cepstrum coefficients from an audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
By default, this calculates the MFCC on the DB-scaled Mel spectrogram. By default, this calculates the MFCC on the DB-scaled Mel spectrogram.
This is not the textbook implementation, but is implemented here to This is not the textbook implementation, but is implemented here to
give consistency with librosa. give consistency with librosa.
...@@ -666,6 +694,10 @@ class MFCC(torch.nn.Module): ...@@ -666,6 +694,10 @@ class MFCC(torch.nn.Module):
class LFCC(torch.nn.Module): class LFCC(torch.nn.Module):
r"""Create the linear-frequency cepstrum coefficients from an audio signal. r"""Create the linear-frequency cepstrum coefficients from an audio signal.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
By default, this calculates the LFCC on the DB-scaled linear filtered spectrogram. By default, this calculates the LFCC on the DB-scaled linear filtered spectrogram.
This is not the textbook implementation, but is implemented here to This is not the textbook implementation, but is implemented here to
give consistency with librosa. give consistency with librosa.
...@@ -762,7 +794,13 @@ class LFCC(torch.nn.Module): ...@@ -762,7 +794,13 @@ class LFCC(torch.nn.Module):
class MuLawEncoding(torch.nn.Module): class MuLawEncoding(torch.nn.Module):
r"""Encode signal based on mu-law companding. For more info see the r"""Encode signal based on mu-law companding.
.. devices:: CPU CUDA
.. properties:: TorchScript
For more info see the
`Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_ `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
This algorithm assumes the signal has been scaled to between -1 and 1 and This algorithm assumes the signal has been scaled to between -1 and 1 and
...@@ -795,7 +833,13 @@ class MuLawEncoding(torch.nn.Module): ...@@ -795,7 +833,13 @@ class MuLawEncoding(torch.nn.Module):
class MuLawDecoding(torch.nn.Module): class MuLawDecoding(torch.nn.Module):
r"""Decode mu-law encoded signal. For more info see the r"""Decode mu-law encoded signal.
.. devices:: CPU CUDA
.. properties:: TorchScript
For more info see the
`Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_ `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
This expects an input with values between 0 and ``quantization_channels - 1`` This expects an input with values between 0 and ``quantization_channels - 1``
...@@ -829,6 +873,10 @@ class MuLawDecoding(torch.nn.Module): ...@@ -829,6 +873,10 @@ class MuLawDecoding(torch.nn.Module):
class Resample(torch.nn.Module): class Resample(torch.nn.Module):
r"""Resample a signal from one frequency to another. A resampling method can be given. r"""Resample a signal from one frequency to another. A resampling method can be given.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Note: Note:
If resampling on waveforms of higher precision than float32, there may be a small loss of precision If resampling on waveforms of higher precision than float32, there may be a small loss of precision
because the kernel is cached once as float32. If high precision resampling is important for your application, because the kernel is cached once as float32. If high precision resampling is important for your application,
...@@ -909,6 +957,10 @@ class Resample(torch.nn.Module): ...@@ -909,6 +957,10 @@ class Resample(torch.nn.Module):
class ComputeDeltas(torch.nn.Module): class ComputeDeltas(torch.nn.Module):
r"""Compute delta coefficients of a tensor, usually a spectrogram. r"""Compute delta coefficients of a tensor, usually a spectrogram.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
See `torchaudio.functional.compute_deltas` for more details. See `torchaudio.functional.compute_deltas` for more details.
Args: Args:
...@@ -936,6 +988,10 @@ class ComputeDeltas(torch.nn.Module): ...@@ -936,6 +988,10 @@ class ComputeDeltas(torch.nn.Module):
class TimeStretch(torch.nn.Module): class TimeStretch(torch.nn.Module):
r"""Stretch stft in time without modifying pitch for a given rate. r"""Stretch stft in time without modifying pitch for a given rate.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Proposed in *SpecAugment* [:footcite:`specaugment`]. Proposed in *SpecAugment* [:footcite:`specaugment`].
Args: Args:
...@@ -1001,6 +1057,10 @@ class TimeStretch(torch.nn.Module): ...@@ -1001,6 +1057,10 @@ class TimeStretch(torch.nn.Module):
class Fade(torch.nn.Module): class Fade(torch.nn.Module):
r"""Add a fade in and/or fade out to an waveform. r"""Add a fade in and/or fade out to an waveform.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
fade_in_len (int, optional): Length of fade-in (time frames). (Default: ``0``) fade_in_len (int, optional): Length of fade-in (time frames). (Default: ``0``)
fade_out_len (int, optional): Length of fade-out (time frames). (Default: ``0``) fade_out_len (int, optional): Length of fade-out (time frames). (Default: ``0``)
...@@ -1114,6 +1174,10 @@ class _AxisMasking(torch.nn.Module): ...@@ -1114,6 +1174,10 @@ class _AxisMasking(torch.nn.Module):
class FrequencyMasking(_AxisMasking): class FrequencyMasking(_AxisMasking):
r"""Apply masking to a spectrogram in the frequency domain. r"""Apply masking to a spectrogram in the frequency domain.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Proposed in *SpecAugment* [:footcite:`specaugment`]. Proposed in *SpecAugment* [:footcite:`specaugment`].
Args: Args:
...@@ -1144,6 +1208,10 @@ class FrequencyMasking(_AxisMasking): ...@@ -1144,6 +1208,10 @@ class FrequencyMasking(_AxisMasking):
class TimeMasking(_AxisMasking): class TimeMasking(_AxisMasking):
r"""Apply masking to a spectrogram in the time domain. r"""Apply masking to a spectrogram in the time domain.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Proposed in *SpecAugment* [:footcite:`specaugment`]. Proposed in *SpecAugment* [:footcite:`specaugment`].
Args: Args:
...@@ -1178,6 +1246,10 @@ class TimeMasking(_AxisMasking): ...@@ -1178,6 +1246,10 @@ class TimeMasking(_AxisMasking):
class Vol(torch.nn.Module): class Vol(torch.nn.Module):
r"""Add a volume to an waveform. r"""Add a volume to an waveform.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
gain (float): Interpreted according to the given gain_type: gain (float): Interpreted according to the given gain_type:
If ``gain_type`` = ``amplitude``, ``gain`` is a positive amplitude ratio. If ``gain_type`` = ``amplitude``, ``gain`` is a positive amplitude ratio.
...@@ -1218,6 +1290,10 @@ class SlidingWindowCmn(torch.nn.Module): ...@@ -1218,6 +1290,10 @@ class SlidingWindowCmn(torch.nn.Module):
r""" r"""
Apply sliding-window cepstral mean (and optionally variance) normalization per utterance. Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600) cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600)
min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start). min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start).
...@@ -1250,6 +1326,11 @@ class SlidingWindowCmn(torch.nn.Module): ...@@ -1250,6 +1326,11 @@ class SlidingWindowCmn(torch.nn.Module):
class Vad(torch.nn.Module): class Vad(torch.nn.Module):
r"""Voice Activity Detector. Similar to SoX implementation. r"""Voice Activity Detector. Similar to SoX implementation.
.. devices:: CPU CUDA
.. properties:: TorchScript
Attempts to trim silence and quiet background sounds from the ends of recordings of speech. Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
The algorithm currently uses a simple cepstral power measurement to detect voice, The algorithm currently uses a simple cepstral power measurement to detect voice,
so may be fooled by other things, especially music. so may be fooled by other things, especially music.
...@@ -1373,6 +1454,10 @@ class Vad(torch.nn.Module): ...@@ -1373,6 +1454,10 @@ class Vad(torch.nn.Module):
class SpectralCentroid(torch.nn.Module): class SpectralCentroid(torch.nn.Module):
r"""Compute the spectral centroid for each channel along the time axis. r"""Compute the spectral centroid for each channel along the time axis.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The spectral centroid is defined as the weighted average of the The spectral centroid is defined as the weighted average of the
frequency values, weighted by their magnitude. frequency values, weighted by their magnitude.
...@@ -1429,6 +1514,10 @@ class SpectralCentroid(torch.nn.Module): ...@@ -1429,6 +1514,10 @@ class SpectralCentroid(torch.nn.Module):
class PitchShift(torch.nn.Module): class PitchShift(torch.nn.Module):
r"""Shift the pitch of a waveform by ``n_steps`` steps. r"""Shift the pitch of a waveform by ``n_steps`` steps.
.. devices:: CPU CUDA
.. properties:: TorchScript
Args: Args:
waveform (Tensor): The input waveform of shape `(..., time)`. waveform (Tensor): The input waveform of shape `(..., time)`.
sample_rate (int): Sample rate of `waveform`. sample_rate (int): Sample rate of `waveform`.
...@@ -1493,6 +1582,11 @@ class PitchShift(torch.nn.Module): ...@@ -1493,6 +1582,11 @@ class PitchShift(torch.nn.Module):
class RNNTLoss(torch.nn.Module): class RNNTLoss(torch.nn.Module):
"""Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks* """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
[:footcite:`graves2012sequence`]. [:footcite:`graves2012sequence`].
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
The RNN Transducer loss extends the CTC loss by defining a distribution over output The RNN Transducer loss extends the CTC loss by defining a distribution over output
sequences of all lengths, and by jointly modelling both input-output and output-output sequences of all lengths, and by jointly modelling both input-output and output-output
dependencies. dependencies.
...@@ -1575,6 +1669,10 @@ def _get_mat_trace(input: torch.Tensor, dim1: int = -1, dim2: int = -2) -> torch ...@@ -1575,6 +1669,10 @@ def _get_mat_trace(input: torch.Tensor, dim1: int = -1, dim2: int = -2) -> torch
class PSD(torch.nn.Module): class PSD(torch.nn.Module):
r"""Compute cross-channel power spectral density (PSD) matrix. r"""Compute cross-channel power spectral density (PSD) matrix.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args: Args:
multi_mask (bool, optional): whether to use multi-channel Time-Frequency masks. (Default: ``False``) multi_mask (bool, optional): whether to use multi-channel Time-Frequency masks. (Default: ``False``)
normalize (bool, optional): whether normalize the mask along the time dimension. normalize (bool, optional): whether normalize the mask along the time dimension.
...@@ -1622,6 +1720,10 @@ class PSD(torch.nn.Module): ...@@ -1622,6 +1720,10 @@ class PSD(torch.nn.Module):
class MVDR(torch.nn.Module): class MVDR(torch.nn.Module):
"""Minimum Variance Distortionless Response (MVDR) module that performs MVDR beamforming with Time-Frequency masks. """Minimum Variance Distortionless Response (MVDR) module that performs MVDR beamforming with Time-Frequency masks.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Based on https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/beamformer.py Based on https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/beamformer.py
We provide three solutions of MVDR beamforming. One is based on *reference channel selection* We provide three solutions of MVDR beamforming. One is based on *reference channel selection*
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment