Unverified Commit 4d251485 authored by Caroline Chen, committed by GitHub

[release 0.13] Remove prototype (#2749)

parent 84d8ced9
from .functional import add_noise, convolve, fftconvolve
__all__ = ["add_noise", "convolve", "fftconvolve"]
import torch
def _check_convolve_inputs(x: torch.Tensor, y: torch.Tensor) -> None:
    if x.shape[:-1] != y.shape[:-1]:
        raise ValueError(f"Leading dimensions of x and y don't match (got {x.shape} and {y.shape}).")
def fftconvolve(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
r"""
Convolves inputs along their last dimension using FFT. For inputs with large last dimensions, this function
is generally much faster than :meth:`convolve`.
Note that, in contrast to :meth:`torch.nn.functional.conv1d`, which actually applies the valid cross-correlation
operator, this function applies the true `convolution`_ operator.
Also note that this function can only output float tensors (int tensor inputs will be cast to float).
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
x (torch.Tensor): First convolution operand, with shape `(..., N)`.
y (torch.Tensor): Second convolution operand, with shape `(..., M)`
(leading dimensions must match those of ``x``).
Returns:
torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., N + M - 1)`, where
the leading dimensions match those of ``x``.
.. _convolution:
https://en.wikipedia.org/wiki/Convolution
"""
_check_convolve_inputs(x, y)
n = x.size(-1) + y.size(-1) - 1
fresult = torch.fft.rfft(x, n=n) * torch.fft.rfft(y, n=n)
return torch.fft.irfft(fresult, n=n)
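

# Illustrative usage sketch (not part of this commit): convolving a batch of two
# signals with two kernels along the last dimension. `_fftconvolve_example` is a
# hypothetical helper added only to show the N + M - 1 output-length rule.
def _fftconvolve_example() -> None:
    x = torch.rand(2, 1000)  # (..., N)
    y = torch.rand(2, 400)   # (..., M), leading dimensions match those of x
    out = fftconvolve(x, y)
    print(out.shape)         # torch.Size([2, 1399]), i.e. (..., N + M - 1)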
def convolve(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
r"""
Convolves inputs along their last dimension using the direct method.
Note that, in contrast to :meth:`torch.nn.functional.conv1d`, which actually applies the valid cross-correlation
operator, this function applies the true `convolution`_ operator.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
x (torch.Tensor): First convolution operand, with shape `(..., N)`.
y (torch.Tensor): Second convolution operand, with shape `(..., M)`
(leading dimensions must match those of ``x``).
Returns:
torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., N + M - 1)`, where
the leading dimensions match those of ``x``.
.. _convolution:
https://en.wikipedia.org/wiki/Convolution
"""
_check_convolve_inputs(x, y)
if x.size(-1) < y.size(-1):
x, y = y, x
num_signals = torch.tensor(x.shape[:-1]).prod()
reshaped_x = x.reshape((int(num_signals), x.size(-1)))
reshaped_y = y.reshape((int(num_signals), y.size(-1)))
output = torch.nn.functional.conv1d(
input=reshaped_x,
weight=reshaped_y.flip(-1).unsqueeze(1),
stride=1,
groups=reshaped_x.size(0),
padding=reshaped_y.size(-1) - 1,
)
output_shape = x.shape[:-1] + (-1,)
return output.reshape(output_shape)
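

# Illustrative sketch (not part of this commit): the direct method agrees with
# fftconvolve up to floating-point error and is typically preferable when the
# last dimension is short. `_convolve_example` is a hypothetical helper.
def _convolve_example() -> None:
    x = torch.rand(3, 64)
    y = torch.rand(3, 16)
    direct = convolve(x, y)
    via_fft = fftconvolve(x, y)
    print(direct.shape)  # torch.Size([3, 79])
    print(torch.allclose(direct, via_fft, atol=1e-4))  # True, within tolerance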
def add_noise(waveform: torch.Tensor, noise: torch.Tensor, lengths: torch.Tensor, snr: torch.Tensor) -> torch.Tensor:
r"""Scales and adds noise to waveform per signal-to-noise ratio.
Specifically, for each pair of waveform vector :math:`x \in \mathbb{R}^L` and noise vector
:math:`n \in \mathbb{R}^L`, the function computes output :math:`y` as
.. math::
y = x + a n \, \text{,}
where
.. math::
a = \sqrt{ \frac{ ||x||_{2}^{2} }{ ||n||_{2}^{2} } \cdot 10^{-\frac{\text{SNR}}{10}} } \, \text{,}
with :math:`\text{SNR}` being the desired signal-to-noise ratio between :math:`x` and :math:`n`, in dB.
Note that this function broadcasts singleton leading dimensions in its inputs in a manner that is
consistent with the above formulae and PyTorch's broadcasting semantics.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (torch.Tensor): Input waveform, with shape `(..., L)`.
noise (torch.Tensor): Noise, with shape `(..., L)` (same shape as ``waveform``).
lengths (torch.Tensor): Valid lengths of signals in ``waveform`` and ``noise``, with shape `(...,)`
(leading dimensions must match those of ``waveform``).
snr (torch.Tensor): Signal-to-noise ratios in dB, with shape `(...,)`.
Returns:
torch.Tensor: Result of scaling and adding ``noise`` to ``waveform``, with shape `(..., L)`
(same shape as ``waveform``).
"""
if not (waveform.ndim - 1 == noise.ndim - 1 == lengths.ndim == snr.ndim):
raise ValueError("Input leading dimensions don't match.")
L = waveform.size(-1)
if L != noise.size(-1):
raise ValueError(f"Length dimensions of waveform and noise don't match (got {L} and {noise.size(-1)}).")
# compute scale
mask = torch.arange(0, L, device=lengths.device).expand(waveform.shape) < lengths.unsqueeze(
-1
) # (*, L) < (*, 1) = (*, L)
energy_signal = torch.linalg.vector_norm(waveform * mask, ord=2, dim=-1) ** 2 # (*,)
energy_noise = torch.linalg.vector_norm(noise * mask, ord=2, dim=-1) ** 2 # (*,)
original_snr_db = 10 * (torch.log10(energy_signal) - torch.log10(energy_noise))
scale = 10 ** ((original_snr_db - snr) / 20.0) # (*,)
# scale noise
scaled_noise = scale.unsqueeze(-1) * noise # (*, 1) * (*, L) = (*, L)
return waveform + scaled_noise # (*, L)
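

# Illustrative sketch (not part of this commit): mixing a batch of two 16 kHz
# signals with noise at per-example SNRs. `_add_noise_example` is a hypothetical
# helper; shapes follow the docstring above and the values are placeholders.
def _add_noise_example() -> None:
    waveform = torch.rand(2, 16000)        # (..., L)
    noise = torch.rand(2, 16000)           # same shape as waveform
    lengths = torch.tensor([16000, 8000])  # valid samples per example
    snr = torch.tensor([10.0, 3.0])        # desired SNR in dB
    noisy = add_noise(waveform, noise, lengths, snr)
    print(noisy.shape)                     # torch.Size([2, 16000]), same shape as waveform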
from .conv_emformer import ConvEmformer
from .rnnt import conformer_rnnt_base, conformer_rnnt_model
__all__ = [
"conformer_rnnt_base",
"conformer_rnnt_model",
"ConvEmformer",
]
import torchaudio
functions = ["HDemucs", "hdemucs_high", "hdemucs_medium", "hdemucs_low"]
def __getattr__(name: str):
    if name in functions:
        import warnings

        warnings.warn(
            f"{__name__}.{name} has been moved to torchaudio.models.hdemucs",
            DeprecationWarning,
        )
        return getattr(torchaudio.models, name)
    raise AttributeError(f"module {__name__} has no attribute {name}")


def __dir__():
    return functions
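

# Illustrative sketch (not part of this commit): the shim above uses module-level
# __getattr__ (PEP 562), so attribute access on the module itself is intercepted.
# The shim's import path is not visible in this diff, so `shim_module_name` below
# is a hypothetical placeholder.
def _deprecation_shim_example(shim_module_name: str) -> None:
    import importlib
    import warnings

    shim = importlib.import_module(shim_module_name)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        model_cls = shim.HDemucs  # triggers __getattr__ above, forwards to torchaudio.models
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    print(model_cls)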
from typing import List, Optional, Tuple
import torch
from torchaudio.models import Conformer, RNNT
from torchaudio.models.rnnt import _Joiner, _Predictor, _TimeReduction, _Transcriber
class _ConformerEncoder(torch.nn.Module, _Transcriber):
    def __init__(
        self,
        *,
        input_dim: int,
        output_dim: int,
        time_reduction_stride: int,
        conformer_input_dim: int,
        conformer_ffn_dim: int,
        conformer_num_layers: int,
        conformer_num_heads: int,
        conformer_depthwise_conv_kernel_size: int,
        conformer_dropout: float,
    ) -> None:
        super().__init__()
        self.time_reduction = _TimeReduction(time_reduction_stride)
        self.input_linear = torch.nn.Linear(input_dim * time_reduction_stride, conformer_input_dim)
        self.conformer = Conformer(
            num_layers=conformer_num_layers,
            input_dim=conformer_input_dim,
            ffn_dim=conformer_ffn_dim,
            num_heads=conformer_num_heads,
            depthwise_conv_kernel_size=conformer_depthwise_conv_kernel_size,
            dropout=conformer_dropout,
            use_group_norm=True,
            convolution_first=True,
        )
        self.output_linear = torch.nn.Linear(conformer_input_dim, output_dim)
        self.layer_norm = torch.nn.LayerNorm(output_dim)

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        time_reduction_out, time_reduction_lengths = self.time_reduction(input, lengths)
        input_linear_out = self.input_linear(time_reduction_out)
        x, lengths = self.conformer(input_linear_out, time_reduction_lengths)
        output_linear_out = self.output_linear(x)
        layer_norm_out = self.layer_norm(output_linear_out)
        return layer_norm_out, lengths

    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        raise RuntimeError("Conformer does not support streaming inference.")
def conformer_rnnt_model(
    *,
    input_dim: int,
    encoding_dim: int,
    time_reduction_stride: int,
    conformer_input_dim: int,
    conformer_ffn_dim: int,
    conformer_num_layers: int,
    conformer_num_heads: int,
    conformer_depthwise_conv_kernel_size: int,
    conformer_dropout: float,
    num_symbols: int,
    symbol_embedding_dim: int,
    num_lstm_layers: int,
    lstm_hidden_dim: int,
    lstm_layer_norm: bool,
    lstm_layer_norm_epsilon: float,
    lstm_dropout: float,
    joiner_activation: str,
) -> RNNT:
r"""Builds Conformer-based recurrent neural network transducer (RNN-T) model.
Args:
input_dim (int): dimension of input sequence frames passed to transcription network.
encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
passed to joint network.
time_reduction_stride (int): factor by which to reduce length of input sequence.
conformer_input_dim (int): dimension of Conformer input.
conformer_ffn_dim (int): hidden layer dimension of each Conformer layer's feedforward network.
conformer_num_layers (int): number of Conformer layers to instantiate.
conformer_num_heads (int): number of attention heads in each Conformer layer.
conformer_depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
conformer_dropout (float): Conformer dropout probability.
num_symbols (int): cardinality of set of target tokens.
symbol_embedding_dim (int): dimension of each target token embedding.
num_lstm_layers (int): number of LSTM layers to instantiate.
lstm_hidden_dim (int): output dimension of each LSTM layer.
lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
lstm_dropout (float): LSTM dropout probability.
joiner_activation (str): activation function to use in the joiner.
Must be one of ("relu", "tanh"). (Default: "relu")
Returns:
RNNT:
Conformer RNN-T model.
"""
    encoder = _ConformerEncoder(
        input_dim=input_dim,
        output_dim=encoding_dim,
        time_reduction_stride=time_reduction_stride,
        conformer_input_dim=conformer_input_dim,
        conformer_ffn_dim=conformer_ffn_dim,
        conformer_num_layers=conformer_num_layers,
        conformer_num_heads=conformer_num_heads,
        conformer_depthwise_conv_kernel_size=conformer_depthwise_conv_kernel_size,
        conformer_dropout=conformer_dropout,
    )
    predictor = _Predictor(
        num_symbols=num_symbols,
        output_dim=encoding_dim,
        symbol_embedding_dim=symbol_embedding_dim,
        num_lstm_layers=num_lstm_layers,
        lstm_hidden_dim=lstm_hidden_dim,
        lstm_layer_norm=lstm_layer_norm,
        lstm_layer_norm_epsilon=lstm_layer_norm_epsilon,
        lstm_dropout=lstm_dropout,
    )
    joiner = _Joiner(encoding_dim, num_symbols, activation=joiner_activation)
    return RNNT(encoder, predictor, joiner)
def conformer_rnnt_base() -> RNNT:
r"""Builds basic version of Conformer RNN-T model.
Returns:
RNNT:
Conformer RNN-T model.
"""
return conformer_rnnt_model(
input_dim=80,
encoding_dim=1024,
time_reduction_stride=4,
conformer_input_dim=256,
conformer_ffn_dim=1024,
conformer_num_layers=16,
conformer_num_heads=4,
conformer_depthwise_conv_kernel_size=31,
conformer_dropout=0.1,
num_symbols=1024,
symbol_embedding_dim=256,
num_lstm_layers=2,
lstm_hidden_dim=512,
lstm_layer_norm=True,
lstm_layer_norm_epsilon=1e-5,
lstm_dropout=0.3,
joiner_activation="tanh",
)
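

# Illustrative sketch (not part of this commit): building the base Conformer RNN-T
# and running its transcription network on a batch of 80-dim features. This assumes
# the RNNT.transcribe(sources, source_lengths) interface from torchaudio.models;
# the feature values and lengths are placeholders.
def _conformer_rnnt_example() -> None:
    model = conformer_rnnt_base()
    features = torch.rand(2, 100, 80)          # (batch, frames, input_dim)
    feature_lengths = torch.tensor([100, 80])
    encodings, encoding_lengths = model.transcribe(features, feature_lengths)
    print(encodings.shape)   # expected torch.Size([2, 25, 1024]) after stride-4 time reduction
    print(encoding_lengths)  # expected tensor([25, 20])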
from .rnnt_pipeline import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3
__all__ = [
"EMFORMER_RNNT_BASE_MUSTC",
"EMFORMER_RNNT_BASE_TEDLIUM3",
]
from functools import partial
from torchaudio.models import emformer_rnnt_base
from torchaudio.pipelines import RNNTBundle
EMFORMER_RNNT_BASE_MUSTC = RNNTBundle(
    _rnnt_path="models/emformer_rnnt_base_mustc.pt",
    _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
    _global_stats_path="pipeline-assets/global_stats_rnnt_mustc.json",
    _sp_model_path="pipeline-assets/spm_bpe_500_mustc.model",
    _right_padding=4,
    _blank=500,
    _sample_rate=16000,
    _n_fft=400,
    _n_mels=80,
    _hop_length=160,
    _segment_length=16,
    _right_context_length=4,
)
EMFORMER_RNNT_BASE_MUSTC.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both streaming and non-streaming inference.
The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base` and utilizes weights
trained on *MuST-C release v2.0* :cite:`CATTONI2021101155` dataset using training script ``train.py``
`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__ with ``num_symbols=501``.
Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
"""
EMFORMER_RNNT_BASE_TEDLIUM3 = RNNTBundle(
    _rnnt_path="models/emformer_rnnt_base_tedlium3.pt",
    _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
    _global_stats_path="pipeline-assets/global_stats_rnnt_tedlium3.json",
    _sp_model_path="pipeline-assets/spm_bpe_500_tedlium3.model",
    _right_padding=4,
    _blank=500,
    _sample_rate=16000,
    _n_fft=400,
    _n_mels=80,
    _hop_length=160,
    _segment_length=16,
    _right_context_length=4,
)
EMFORMER_RNNT_BASE_TEDLIUM3.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both streaming and non-streaming inference.
The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
and utilizes weights trained on TED-LIUM Release 3 dataset using training script ``train.py``
`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__ with ``num_symbols=501``.
Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
"""