Unverified commit 420e84ee authored by moto, committed by GitHub

Update models/pipelines doc (#1894)

1. Override the signatures in docstrings so that Sphinx shows the exported symbols
   (output model types and input torch.nn.Module)
2. Tweak docs for Tacotron2TTSBundle interfaces
3. Fix for HUBERT_ASR_XLARGE
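
The signature overrides rely on Sphinx autodoc's docstring-signature handling (the `autodoc_docstring_signature` option, enabled by default): when the first line of a callable's docstring looks like `name(args) -> return_type`, autodoc displays that line as the signature and strips it from the rendered body. A minimal sketch of the pattern, with a hypothetical function and package name:

    def build_model(num_out: int) -> "Model":
        # Overriding the signature so that the fully-qualified return type
        # is what Sphinx renders; autodoc picks up the docstring's first line.
        """build_model(num_out: int) -> mypackage.models.Model

        Build a model. Everything after the first line is the doc body.
        """
        ...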
parent cb40dd72
@@ -167,7 +167,7 @@ HUBERT_ASR_XLARGE
 .. container:: py attribute

    .. autodata:: HUBERT_ASR_XLARGE
+      :no-value:

 Tacotron2 Text-To-Speech
 ------------------------
...
@@ -8,7 +8,9 @@ from . import components

 class Wav2Vec2Model(Module):
-    """Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
+    """torchaudio.models.Wav2Vec2Model(feature_extractor: torch.nn.Module, encoder: torch.nn.Module, aux: Optional[torch.nn.Module] = None)
+
+    Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].

     Note:
         To build the model, please use one of the factory functions.
@@ -23,7 +25,7 @@ class Wav2Vec2Model(Module):
         aux (torch.nn.Module or None, optional):
             Auxiliary module. If provided, the output from encoder is passed to this module.
-    """
+    """  # noqa: E501

     def __init__(
         self,
         feature_extractor: Module,
@@ -132,7 +134,10 @@ def wav2vec2_model(
         encoder_layer_drop: float,
         aux_num_out: Optional[int],
 ) -> Wav2Vec2Model:
-    """Build a custom Wav2Vec2Model
+    # Overriding the signature so that the return type is correct on Sphinx
+    """wav2vec2_model(extractor_mode: str, extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]], extractor_conv_bias: bool, encoder_embed_dim: int, encoder_projection_dropout: float, encoder_pos_conv_kernel: int, encoder_pos_conv_groups: int, encoder_num_layers: int, encoder_num_heads: int, encoder_attention_dropout: float, encoder_ff_interm_features: int, encoder_ff_interm_dropout: float, encoder_dropout: float, encoder_layer_norm_first: bool, encoder_layer_drop: float, aux_num_out: Optional[int]) -> torchaudio.models.Wav2Vec2Model
+
+    Build a custom Wav2Vec2Model

     Note:
         The "feature extractor" below corresponds to
@@ -287,7 +292,10 @@ def wav2vec2_base(
         encoder_layer_drop: float = 0.1,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """wav2vec2_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]

     Args:
         encoder_projection_dropout (float):
@@ -306,7 +314,7 @@ def wav2vec2_base(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode="group_norm",
         extractor_conv_layer_config=None,
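
The pre-configured builders reduce the call above to a few dropout knobs. A quick smoke test of the "base" builder (weights are randomly initialized; pretrained weights come from the pipeline bundles):

    import torch
    from torchaudio.models import wav2vec2_base

    model = wav2vec2_base()
    waveforms = torch.randn(2, 16000)     # two dummy 1-second clips at 16 kHz
    features, lengths = model(waveforms)  # without aux: (batch, frames, 768)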
@@ -335,7 +343,10 @@ def wav2vec2_large(
         encoder_layer_drop: float = 0.1,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """wav2vec2_large(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]

     Args:
         encoder_projection_dropout (float):
@@ -354,7 +365,7 @@ def wav2vec2_large(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode="group_norm",
         extractor_conv_layer_config=None,
@@ -383,7 +394,10 @@ def wav2vec2_large_lv60k(
         encoder_layer_drop: float = 0.1,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """wav2vec2_large_lv60k(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]

     Args:
         encoder_projection_dropout (float):
@@ -402,7 +416,7 @@ def wav2vec2_large_lv60k(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode="layer_norm",
         extractor_conv_layer_config=None,
@@ -431,7 +445,10 @@ def hubert_base(
         encoder_layer_drop: float = 0.05,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build HuBERT model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """hubert_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build HuBERT model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

     Args:
         encoder_projection_dropout (float):
@@ -450,7 +467,7 @@ def hubert_base(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode='group_norm',
         extractor_conv_layer_config=None,
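
For ASR fine-tuning, `aux_num_out` attaches a linear read-out layer on top of the encoder; a sketch in which 29 is a hypothetical CTC vocabulary size:

    import torch
    from torchaudio.models import hubert_base

    model = hubert_base(aux_num_out=29)  # 29 output labels, e.g. for CTC
    waveforms = torch.randn(1, 16000)
    logits, lengths = model(waveforms)   # logits: (batch, frames, 29)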
@@ -479,7 +496,10 @@ def hubert_large(
         encoder_layer_drop: float = 0.0,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build HuBERT model with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """hubert_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build HuBERT model with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

     Args:
         encoder_projection_dropout (float):
@@ -498,7 +518,7 @@ def hubert_large(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode='layer_norm',
         extractor_conv_layer_config=None,
@@ -527,7 +547,10 @@ def hubert_xlarge(
         encoder_layer_drop: float = 0.0,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build HuBERT model with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """hubert_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build HuBERT model with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

     Args:
         encoder_projection_dropout (float):
@@ -546,7 +569,7 @@ def hubert_xlarge(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode='layer_norm',
         extractor_conv_layer_config=None,
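
Since every builder returns the same Wav2Vec2Model class, comparing the variants is straightforward; a rough sketch (note that instantiating the xlarge variant allocates on the order of a billion parameters):

    from torchaudio.models import hubert_base, hubert_large, hubert_xlarge

    for build in (hubert_base, hubert_large, hubert_xlarge):
        model = build()
        n_params = sum(p.numel() for p in model.parameters())
        print(f"{build.__name__}: {n_params / 1e6:.0f}M parameters")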
...
@@ -126,7 +126,10 @@ def _convert_state_dict(state_dict):

 def import_fairseq_model(original: Module) -> Wav2Vec2Model:
-    """Build Wav2Vec2Model from pretrained parameters published by `fairseq`_.
+    # Overriding the signature so that the types are correct on Sphinx
+    """import_fairseq_model(original: torch.nn.Module) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model from the corresponding model object of `fairseq`_.

     Args:
         original (torch.nn.Module):
...
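
A sketch of the fairseq import path; the checkpoint filename is illustrative, and loading assumes fairseq's usual checkpoint utilities:

    import fairseq
    from torchaudio.models.wav2vec2.utils import import_fairseq_model

    # Load the original fairseq model object (path is hypothetical).
    models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        ["wav2vec_small.pt"]
    )
    imported = import_fairseq_model(models[0])
    imported.eval()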
@@ -50,7 +50,9 @@ def _build(config, original):

 def import_huggingface_model(original: Module) -> Wav2Vec2Model:
-    """Import wav2vec2 model from Hugging Face's `Transformers`_.
+    """import_huggingface_model(original: torch.nn.Module) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model from the corresponding model object of Hugging Face's `Transformers`_.

     Args:
         original (torch.nn.Module): An instance of ``Wav2Vec2ForCTC`` from ``transformers``.
...
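
Likewise for the Transformers path; the checkpoint name below is one published wav2vec2 CTC model and serves only as an example:

    from transformers import Wav2Vec2ForCTC
    from torchaudio.models.wav2vec2.utils import import_huggingface_model

    original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    model = import_huggingface_model(original)
    model.eval()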
@@ -7,13 +7,13 @@ from torchaudio.models import Tacotron2

 class _TextProcessor(ABC):
-    """Interface of the text processing part of Tacotron2TTS pipeline"""
     @property
     @abstractmethod
     def tokens(self):
         """The tokens that the each value in the processed tensor represent.

+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.

         :type: List[str]
         """
@@ -21,6 +21,8 @@ class _TextProcessor(ABC):
     def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
         """Encode the given (batch of) texts into numerical tensors

+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
+
         Args:
             text (str or list of str): The input texts.
@@ -34,13 +36,13 @@ class _TextProcessor(ABC):

 class _Vocoder(ABC):
-    """Interface of the vocoder part of Tacotron2TTS pipeline"""
     @property
     @abstractmethod
     def sample_rate(self):
         """The sample rate of the resulting waveform

+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.

         :type: float
         """
@@ -48,6 +50,8 @@ class _Vocoder(ABC):
     def __call__(self, specgrams: Tensor, lengths: Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]:
         """Generate waveform from the given input, such as spectrogram

+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
+
         Args:
             specgrams (Tensor):
                 The input spectrogram. Shape: `(batch, frequency bins, time)`.
@@ -146,14 +150,21 @@ class Tacotron2TTSBundle(ABC):
     # new text processing and vocoder will be added in the future, so we want to make these
     # interfaces specific to this Tacotron2TTS pipeline.
     class TextProcessor(_TextProcessor):
-        pass
+        """Interface of the text processing part of Tacotron2TTS pipeline
+
+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
+        """

     class Vocoder(_Vocoder):
-        pass
+        """Interface of the vocoder part of Tacotron2TTS pipeline
+
+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
+        """

     @abstractmethod
     def get_text_processor(self, *, dl_kwargs=None) -> TextProcessor:
-        """get_text_processor(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.TextProcessor:
+        # Overriding the signature so that the return type is correct on Sphinx
+        """get_text_processor(self, *, dl_kwargs=None) -> torchaudio.pipelines.Tacotron2TTSBundle.TextProcessor

         Create a text processor
@@ -177,7 +188,7 @@ class Tacotron2TTSBundle(ABC):
         Example - Character-based

             >>> text = [
-            >>>     "Hello, T T S !",
+            >>>     "Hello World!",
             >>>     "Text-to-speech!",
             >>> ]
             >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
@@ -192,9 +203,9 @@ class Tacotron2TTSBundle(ABC):
             >>> print(lengths)
             tensor([12, 15], dtype=torch.int32)
             >>>
-            >>> print([processor.tokens[i] for i in input[0]])
-            ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', '_', '_', '_']
-            >>> print([processor.tokens[i] for i in input[1]])
+            >>> print([processor.tokens[i] for i in input[0, :lengths[0]]])
+            ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
+            >>> print([processor.tokens[i] for i in input[1, :lengths[1]]])
             ['t', 'e', 'x', 't', '-', 't', 'o', '-', 's', 'p', 'e', 'e', 'c', 'h', '!']

         Example - Phoneme-based
@@ -224,7 +235,8 @@ class Tacotron2TTSBundle(ABC):
     @abstractmethod
     def get_vocoder(self, *, dl_kwargs=None) -> Vocoder:
-        """get_vocoder(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.Vocoder:
+        # Overriding the signature so that the return type is correct on Sphinx
+        """get_vocoder(self, *, dl_kwargs=None) -> torchaudio.pipelines.Tacotron2TTSBundle.Vocoder

         Create a vocoder module, based off of either WaveRNN or GriffinLim.
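
A sketch of driving the vocoder half of a bundle on its own; the spectrogram is random and exists only to show the expected shapes:

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
    vocoder = bundle.get_vocoder()

    specgram = torch.randn(1, 80, 100)  # (batch, frequency bins, time)
    lengths = torch.tensor([100])
    with torch.inference_mode():
        waveforms, out_lengths = vocoder(specgram, lengths)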
@@ -244,7 +256,10 @@ class Tacotron2TTSBundle(ABC):
     @abstractmethod
     def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
-        """Create a Tacotron2 model with pre-trained weight.
+        # Overriding the signature so that the return type is correct on Sphinx
+        """get_tacotron2(self, *, dl_kwargs=None) -> torchaudio.models.Tacotron2
+
+        Create a Tacotron2 model with pre-trained weight.

         Args:
             dl_kwargs (dictionary of keyword arguments):
...
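
Putting the three pieces together, a sketch of the full text-to-speech pipeline these interfaces describe (the output filename is arbitrary):

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
    processor = bundle.get_text_processor()
    tacotron2 = bundle.get_tacotron2()
    vocoder = bundle.get_vocoder()

    with torch.inference_mode():
        tokens, token_lengths = processor("Hello World!")
        specgram, spec_lengths, _ = tacotron2.infer(tokens, token_lengths)
        waveforms, _ = vocoder(specgram, spec_lengths)

    torchaudio.save("output.wav", waveforms[0:1], int(vocoder.sample_rate))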
@@ -53,6 +53,7 @@ class Wav2Vec2Bundle:
         return self._sample_rate

     def get_model(self, *, dl_kwargs=None) -> Wav2Vec2Model:
+        # Overriding the signature so that the return type is correct on Sphinx
         """get_model(self, *, dl_kwargs=None) -> torchaudio.models.Wav2Vec2Model

         Construct the model and load the pretrained weight.
...
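
And a sketch of the corresponding ASR side, where get_model returns the Wav2Vec2Model whose signature is overridden above (the audio path is hypothetical):

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
    model = bundle.get_model()

    waveform, sample_rate = torchaudio.load("speech.wav")
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, int(bundle.sample_rate))

    with torch.inference_mode():
        emissions, _ = model(waveform)  # (batch, frames, number of labels)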