Unverified Commit 4d251485 authored by Caroline Chen, committed by GitHub

[release 0.13] Remove prototype (#2749)

parent 84d8ced9
from .functional import add_noise, convolve, fftconvolve
__all__ = ["add_noise", "convolve", "fftconvolve"]
import torch
def _check_convolve_inputs(x: torch.Tensor, y: torch.Tensor) -> None:
    if x.shape[:-1] != y.shape[:-1]:
        raise ValueError(f"Leading dimensions of x and y don't match (got {x.shape} and {y.shape}).")
def fftconvolve(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
r"""
Convolves inputs along their last dimension using FFT. For inputs with large last dimensions, this function
is generally much faster than :meth:`convolve`.
Note that, in contrast to :meth:`torch.nn.functional.conv1d`, which actually applies the valid cross-correlation
operator, this function applies the true `convolution`_ operator.
Also note that this function can only output float tensors (int tensor inputs will be cast to float).
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
x (torch.Tensor): First convolution operand, with shape `(..., N)`.
y (torch.Tensor): Second convolution operand, with shape `(..., M)`
(leading dimensions must match those of ``x``).
Returns:
torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., N + M - 1)`, where
the leading dimensions match those of ``x``.
.. _convolution:
https://en.wikipedia.org/wiki/Convolution
"""
_check_convolve_inputs(x, y)
n = x.size(-1) + y.size(-1) - 1
fresult = torch.fft.rfft(x, n=n) * torch.fft.rfft(y, n=n)
return torch.fft.irfft(fresult, n=n)
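

# Illustrative usage sketch (not part of this commit): convolving a batch of two
# signals with two kernels along the last dimension. `_fftconvolve_example` is a
# hypothetical helper added only to show the N + M - 1 output-length rule.
def _fftconvolve_example() -> None:
    x = torch.rand(2, 1000)  # (..., N)
    y = torch.rand(2, 400)   # (..., M), leading dimensions match those of x
    out = fftconvolve(x, y)
    print(out.shape)         # torch.Size([2, 1399]), i.e. (..., N + M - 1)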
def convolve(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
r"""
Convolves inputs along their last dimension using the direct method.
Note that, in contrast to :meth:`torch.nn.functional.conv1d`, which actually applies the valid cross-correlation
operator, this function applies the true `convolution`_ operator.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
x (torch.Tensor): First convolution operand, with shape `(..., N)`.
y (torch.Tensor): Second convolution operand, with shape `(..., M)`
(leading dimensions must match those of ``x``).
Returns:
torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., N + M - 1)`, where
the leading dimensions match those of ``x``.
.. _convolution:
https://en.wikipedia.org/wiki/Convolution
"""
_check_convolve_inputs(x, y)
if x.size(-1) < y.size(-1):
x, y = y, x
num_signals = torch.tensor(x.shape[:-1]).prod()
reshaped_x = x.reshape((int(num_signals), x.size(-1)))
reshaped_y = y.reshape((int(num_signals), y.size(-1)))
output = torch.nn.functional.conv1d(
input=reshaped_x,
weight=reshaped_y.flip(-1).unsqueeze(1),
stride=1,
groups=reshaped_x.size(0),
padding=reshaped_y.size(-1) - 1,
)
output_shape = x.shape[:-1] + (-1,)
return output.reshape(output_shape)
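

# Illustrative sketch (not part of this commit): the direct method agrees with
# fftconvolve up to floating-point error and is typically preferable when the
# last dimension is short. `_convolve_example` is a hypothetical helper.
def _convolve_example() -> None:
    x = torch.rand(3, 64)
    y = torch.rand(3, 16)
    direct = convolve(x, y)
    via_fft = fftconvolve(x, y)
    print(direct.shape)  # torch.Size([3, 79])
    print(torch.allclose(direct, via_fft, atol=1e-4))  # True, within tolerance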
def add_noise(waveform: torch.Tensor, noise: torch.Tensor, lengths: torch.Tensor, snr: torch.Tensor) -> torch.Tensor:
r"""Scales and adds noise to waveform per signal-to-noise ratio.
Specifically, for each pair of waveform vector :math:`x \in \mathbb{R}^L` and noise vector
:math:`n \in \mathbb{R}^L`, the function computes output :math:`y` as
.. math::
y = x + a n \, \text{,}
where
.. math::
a = \sqrt{ \frac{ ||x||_{2}^{2} }{ ||n||_{2}^{2} } \cdot 10^{-\frac{\text{SNR}}{10}} } \, \text{,}
with :math:`\text{SNR}` being the desired signal-to-noise ratio between :math:`x` and :math:`n`, in dB.
Note that this function broadcasts singleton leading dimensions in its inputs in a manner that is
consistent with the above formulae and PyTorch's broadcasting semantics.
.. devices:: CPU CUDA
.. properties:: Autograd TorchScript
Args:
waveform (torch.Tensor): Input waveform, with shape `(..., L)`.
noise (torch.Tensor): Noise, with shape `(..., L)` (same shape as ``waveform``).
lengths (torch.Tensor): Valid lengths of signals in ``waveform`` and ``noise``, with shape `(...,)`
(leading dimensions must match those of ``waveform``).
snr (torch.Tensor): Signal-to-noise ratios in dB, with shape `(...,)`.
Returns:
torch.Tensor: Result of scaling and adding ``noise`` to ``waveform``, with shape `(..., L)`
(same shape as ``waveform``).
"""
if not (waveform.ndim - 1 == noise.ndim - 1 == lengths.ndim == snr.ndim):
raise ValueError("Input leading dimensions don't match.")
L = waveform.size(-1)
if L != noise.size(-1):
raise ValueError(f"Length dimensions of waveform and noise don't match (got {L} and {noise.size(-1)}).")
# compute scale
mask = torch.arange(0, L, device=lengths.device).expand(waveform.shape) < lengths.unsqueeze(
-1
) # (*, L) < (*, 1) = (*, L)
energy_signal = torch.linalg.vector_norm(waveform * mask, ord=2, dim=-1) ** 2 # (*,)
energy_noise = torch.linalg.vector_norm(noise * mask, ord=2, dim=-1) ** 2 # (*,)
original_snr_db = 10 * (torch.log10(energy_signal) - torch.log10(energy_noise))
scale = 10 ** ((original_snr_db - snr) / 20.0) # (*,)
# scale noise
scaled_noise = scale.unsqueeze(-1) * noise # (*, 1) * (*, L) = (*, L)
return waveform + scaled_noise # (*, L)
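

# Illustrative sketch (not part of this commit): mixing a batch of two 16 kHz
# signals with noise at per-example SNRs. `_add_noise_example` is a hypothetical
# helper; shapes follow the docstring above and the values are placeholders.
def _add_noise_example() -> None:
    waveform = torch.rand(2, 16000)        # (..., L)
    noise = torch.rand(2, 16000)           # same shape as waveform
    lengths = torch.tensor([16000, 8000])  # valid samples per example
    snr = torch.tensor([10.0, 3.0])        # desired SNR in dB
    noisy = add_noise(waveform, noise, lengths, snr)
    print(noisy.shape)                     # torch.Size([2, 16000]), same shape as waveform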
from .conv_emformer import ConvEmformer
from .rnnt import conformer_rnnt_base, conformer_rnnt_model
__all__ = [
"conformer_rnnt_base",
"conformer_rnnt_model",
"ConvEmformer",
]
import torchaudio
functions = ["HDemucs", "hdemucs_high", "hdemucs_medium", "hdemucs_low"]
def __getattr__(name: str):
    if name in functions:
        import warnings

        warnings.warn(
            f"{__name__}.{name} has been moved to torchaudio.models.hdemucs",
            DeprecationWarning,
        )
        return getattr(torchaudio.models, name)
    raise AttributeError(f"module {__name__} has no attribute {name}")


def __dir__():
    return functions
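

# Illustrative sketch (not part of this commit): the shim above uses module-level
# __getattr__ (PEP 562), so attribute access on the module itself is intercepted.
# The shim's import path is not visible in this diff, so `shim_module_name` below
# is a hypothetical placeholder.
def _deprecation_shim_example(shim_module_name: str) -> None:
    import importlib
    import warnings

    shim = importlib.import_module(shim_module_name)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        model_cls = shim.HDemucs  # triggers __getattr__ above, forwards to torchaudio.models
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    print(model_cls)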
from typing import List, Optional, Tuple
import torch
from torchaudio.models import Conformer, RNNT
from torchaudio.models.rnnt import _Joiner, _Predictor, _TimeReduction, _Transcriber
class _ConformerEncoder(torch.nn.Module, _Transcriber):
    def __init__(
        self,
        *,
        input_dim: int,
        output_dim: int,
        time_reduction_stride: int,
        conformer_input_dim: int,
        conformer_ffn_dim: int,
        conformer_num_layers: int,
        conformer_num_heads: int,
        conformer_depthwise_conv_kernel_size: int,
        conformer_dropout: float,
    ) -> None:
        super().__init__()
        self.time_reduction = _TimeReduction(time_reduction_stride)
        self.input_linear = torch.nn.Linear(input_dim * time_reduction_stride, conformer_input_dim)
        self.conformer = Conformer(
            num_layers=conformer_num_layers,
            input_dim=conformer_input_dim,
            ffn_dim=conformer_ffn_dim,
            num_heads=conformer_num_heads,
            depthwise_conv_kernel_size=conformer_depthwise_conv_kernel_size,
            dropout=conformer_dropout,
            use_group_norm=True,
            convolution_first=True,
        )
        self.output_linear = torch.nn.Linear(conformer_input_dim, output_dim)
        self.layer_norm = torch.nn.LayerNorm(output_dim)

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        time_reduction_out, time_reduction_lengths = self.time_reduction(input, lengths)
        input_linear_out = self.input_linear(time_reduction_out)
        x, lengths = self.conformer(input_linear_out, time_reduction_lengths)
        output_linear_out = self.output_linear(x)
        layer_norm_out = self.layer_norm(output_linear_out)
        return layer_norm_out, lengths

    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        raise RuntimeError("Conformer does not support streaming inference.")
def conformer_rnnt_model(
    *,
    input_dim: int,
    encoding_dim: int,
    time_reduction_stride: int,
    conformer_input_dim: int,
    conformer_ffn_dim: int,
    conformer_num_layers: int,
    conformer_num_heads: int,
    conformer_depthwise_conv_kernel_size: int,
    conformer_dropout: float,
    num_symbols: int,
    symbol_embedding_dim: int,
    num_lstm_layers: int,
    lstm_hidden_dim: int,
    lstm_layer_norm: bool,
    lstm_layer_norm_epsilon: float,
    lstm_dropout: float,
    joiner_activation: str,
) -> RNNT:
r"""Builds Conformer-based recurrent neural network transducer (RNN-T) model.
Args:
input_dim (int): dimension of input sequence frames passed to transcription network.
encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
passed to joint network.
time_reduction_stride (int): factor by which to reduce length of input sequence.
conformer_input_dim (int): dimension of Conformer input.
conformer_ffn_dim (int): hidden layer dimension of each Conformer layer's feedforward network.
conformer_num_layers (int): number of Conformer layers to instantiate.
conformer_num_heads (int): number of attention heads in each Conformer layer.
conformer_depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
conformer_dropout (float): Conformer dropout probability.
num_symbols (int): cardinality of set of target tokens.
symbol_embedding_dim (int): dimension of each target token embedding.
num_lstm_layers (int): number of LSTM layers to instantiate.
lstm_hidden_dim (int): output dimension of each LSTM layer.
lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
lstm_dropout (float): LSTM dropout probability.
joiner_activation (str): activation function to use in the joiner.
Must be one of ("relu", "tanh"). (Default: "relu")
Returns:
RNNT:
Conformer RNN-T model.
"""
    encoder = _ConformerEncoder(
        input_dim=input_dim,
        output_dim=encoding_dim,
        time_reduction_stride=time_reduction_stride,
        conformer_input_dim=conformer_input_dim,
        conformer_ffn_dim=conformer_ffn_dim,
        conformer_num_layers=conformer_num_layers,
        conformer_num_heads=conformer_num_heads,
        conformer_depthwise_conv_kernel_size=conformer_depthwise_conv_kernel_size,
        conformer_dropout=conformer_dropout,
    )
    predictor = _Predictor(
        num_symbols=num_symbols,
        output_dim=encoding_dim,
        symbol_embedding_dim=symbol_embedding_dim,
        num_lstm_layers=num_lstm_layers,
        lstm_hidden_dim=lstm_hidden_dim,
        lstm_layer_norm=lstm_layer_norm,
        lstm_layer_norm_epsilon=lstm_layer_norm_epsilon,
        lstm_dropout=lstm_dropout,
    )
    joiner = _Joiner(encoding_dim, num_symbols, activation=joiner_activation)
    return RNNT(encoder, predictor, joiner)
def conformer_rnnt_base() -> RNNT:
r"""Builds basic version of Conformer RNN-T model.
Returns:
RNNT:
Conformer RNN-T model.
"""
return conformer_rnnt_model(
input_dim=80,
encoding_dim=1024,
time_reduction_stride=4,
conformer_input_dim=256,
conformer_ffn_dim=1024,
conformer_num_layers=16,
conformer_num_heads=4,
conformer_depthwise_conv_kernel_size=31,
conformer_dropout=0.1,
num_symbols=1024,
symbol_embedding_dim=256,
num_lstm_layers=2,
lstm_hidden_dim=512,
lstm_layer_norm=True,
lstm_layer_norm_epsilon=1e-5,
lstm_dropout=0.3,
joiner_activation="tanh",
)
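

# Illustrative sketch (not part of this commit): building the base Conformer RNN-T
# and running its transcription network on a batch of 80-dim features. This assumes
# the RNNT.transcribe(sources, source_lengths) interface from torchaudio.models;
# the feature values and lengths are placeholders.
def _conformer_rnnt_example() -> None:
    model = conformer_rnnt_base()
    features = torch.rand(2, 100, 80)          # (batch, frames, input_dim)
    feature_lengths = torch.tensor([100, 80])
    encodings, encoding_lengths = model.transcribe(features, feature_lengths)
    print(encodings.shape)   # expected torch.Size([2, 25, 1024]) after stride-4 time reduction
    print(encoding_lengths)  # expected tensor([25, 20])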
from .rnnt_pipeline import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3
__all__ = [
"EMFORMER_RNNT_BASE_MUSTC",
"EMFORMER_RNNT_BASE_TEDLIUM3",
]
from functools import partial
from torchaudio.models import emformer_rnnt_base
from torchaudio.pipelines import RNNTBundle
EMFORMER_RNNT_BASE_MUSTC = RNNTBundle(
    _rnnt_path="models/emformer_rnnt_base_mustc.pt",
    _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
    _global_stats_path="pipeline-assets/global_stats_rnnt_mustc.json",
    _sp_model_path="pipeline-assets/spm_bpe_500_mustc.model",
    _right_padding=4,
    _blank=500,
    _sample_rate=16000,
    _n_fft=400,
    _n_mels=80,
    _hop_length=160,
    _segment_length=16,
    _right_context_length=4,
)
EMFORMER_RNNT_BASE_MUSTC.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both streaming and non-streaming inference.
The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base` and utilizes weights
trained on *MuST-C release v2.0* :cite:`CATTONI2021101155` dataset using training script ``train.py``
`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__ with ``num_symbols=501``.
Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
"""
EMFORMER_RNNT_BASE_TEDLIUM3 = RNNTBundle(
    _rnnt_path="models/emformer_rnnt_base_tedlium3.pt",
    _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
    _global_stats_path="pipeline-assets/global_stats_rnnt_tedlium3.json",
    _sp_model_path="pipeline-assets/spm_bpe_500_tedlium3.model",
    _right_padding=4,
    _blank=500,
    _sample_rate=16000,
    _n_fft=400,
    _n_mels=80,
    _hop_length=160,
    _segment_length=16,
    _right_context_length=4,
)
EMFORMER_RNNT_BASE_TEDLIUM3.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both streaming and non-streaming inference.
The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
and utilizes weights trained on TED-LIUM Release 3 dataset using training script ``train.py``
`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__ with ``num_symbols=501``.
Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
"""