Unverified commit 420e84ee authored by moto, committed by GitHub

Update models/pipelines doc (#1894)

1. Override the return type so that Sphinx shows the exported symbols
   (output model types and input torch.nn.Module).
2. Tweak docs for Tacotron2TTSBundle interfaces
3. Fix for HUBERT_ASR_XLARGE
parent cb40dd72
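For reference, a minimal sketch of the trick behind item 1 (illustrative, not part of this patch): when the first line of a docstring parses as a signature, Sphinx's autodoc uses it in place of the introspected one (the default ``autodoc_docstring_signature`` behavior), so a hand-written return annotation can point at the public alias rather than the internal module path.

# Sketch only: autodoc picks up the docstring's first line as the signature,
# so the rendered return type becomes torchaudio.models.Wav2Vec2Model.
def build_model() -> "Wav2Vec2Model":
    """build_model() -> torchaudio.models.Wav2Vec2Model

    Build the model.
    """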
......@@ -167,7 +167,7 @@ HUBERT_ASR_XLARGE
.. container:: py attribute
.. autodata:: HUBERT_ASR_XLARGE
:no-value:
Tacotron2 Text-To-Speech
------------------------
......
......@@ -8,7 +8,9 @@ from . import components
class Wav2Vec2Model(Module):
"""Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
"""torchaudio.models.Wav2Vec2Model(feature_extractor: torch.nn.Module, encoder: torch.nn.Module, aux: Optional[torch.nn.Module] = None)
Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
Note:
To build the model, please use one of the factory functions.
......@@ -23,7 +25,7 @@ class Wav2Vec2Model(Module):
aux (torch.nn.Module or None, optional):
Auxiliary module. If provided, the output from encoder is passed to this module.
"""
""" # noqa: E501
def __init__(
self,
feature_extractor: Module,
......@@ -132,7 +134,10 @@ def wav2vec2_model(
encoder_layer_drop: float,
aux_num_out: Optional[int],
) -> Wav2Vec2Model:
"""Build a custom Wav2Vec2Model
# Overriding the signature so that the return type is correct on Sphinx
"""wav2vec2_model(extractor_mode: str, extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]], extractor_conv_bias: bool, encoder_embed_dim: int, encoder_projection_dropout: float, encoder_pos_conv_kernel: int, encoder_pos_conv_groups: int, encoder_num_layers: int, encoder_num_heads: int, encoder_attention_dropout: float, encoder_ff_interm_features: int, encoder_ff_interm_dropout: float, encoder_dropout: float, encoder_layer_norm_first: bool, encoder_layer_drop: float, aux_num_out: Optional[int]) -> torchaudio.models.Wav2Vec2Model
Build a custom Wav2Vec2Model
Note:
The "feature extractor" below corresponds to
......@@ -287,7 +292,10 @@ def wav2vec2_base(
encoder_layer_drop: float = 0.1,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
# Overriding the signature so that the return type is correct on Sphinx
"""wav2vec2_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
Args:
encoder_projection_dropout (float):
......@@ -306,7 +314,7 @@ def wav2vec2_base(
Returns:
Wav2Vec2Model:
The resulting model.
"""
""" # noqa: E501
return wav2vec2_model(
extractor_mode="group_norm",
extractor_conv_layer_config=None,
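The same call pattern applies to every preset factory in this file (wav2vec2_base/large/large_lv60k and the hubert_* variants below). A minimal sketch, assuming 16 kHz input and a hypothetical 29-class aux head:

>>> import torch
>>> import torchaudio
>>> model = torchaudio.models.wav2vec2_base(aux_num_out=29)
>>> waveforms = torch.randn(2, 16000)  # (batch, time), dummy 16 kHz audio
>>> logits, lengths = model(waveforms)  # logits: (batch, frame, aux_num_out); lengths is None when not given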
......@@ -335,7 +343,10 @@ def wav2vec2_large(
encoder_layer_drop: float = 0.1,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
# Overriding the signature so that the return type is correct on Sphinx
"""wav2vec2_large(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
Args:
encoder_projection_dropout (float):
......@@ -354,7 +365,7 @@ def wav2vec2_large(
Returns:
Wav2Vec2Model:
The resulting model.
"""
""" # noqa: E501
return wav2vec2_model(
extractor_mode="group_norm",
extractor_conv_layer_config=None,
......@@ -383,7 +394,10 @@ def wav2vec2_large_lv60k(
encoder_layer_drop: float = 0.1,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
# Overriding the signature so that the return type is correct on Sphinx
"""wav2vec2_large_lv60k( encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
Args:
encoder_projection_dropout (float):
......@@ -402,7 +416,7 @@ def wav2vec2_large_lv60k(
Returns:
Wav2Vec2Model:
The resulting model.
"""
""" # noqa: E501
return wav2vec2_model(
extractor_mode="layer_norm",
extractor_conv_layer_config=None,
......@@ -431,7 +445,10 @@ def hubert_base(
encoder_layer_drop: float = 0.05,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build HuBERT model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
# Overriding the signature so that the return type is correct on Sphinx
"""hubert_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
Build HuBERT model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
Args:
encoder_projection_dropout (float):
......@@ -450,7 +467,7 @@ def hubert_base(
Returns:
Wav2Vec2Model:
The resulting model.
"""
""" # noqa: E501
return wav2vec2_model(
extractor_mode='group_norm',
extractor_conv_layer_config=None,
......@@ -479,7 +496,10 @@ def hubert_large(
encoder_layer_drop: float = 0.0,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build HuBERT model with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
# Overriding the signature so that the return type is correct on Sphinx
"""hubert_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
Build HuBERT model with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
Args:
encoder_projection_dropout (float):
......@@ -498,7 +518,7 @@ def hubert_large(
Returns:
Wav2Vec2Model:
The resulting model.
"""
""" # noqa: E501
return wav2vec2_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
......@@ -527,7 +547,10 @@ def hubert_xlarge(
encoder_layer_drop: float = 0.0,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build HuBERT model with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
# Overriding the signature so that the return type is correct on Sphinx
"""hubert_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
Build HuBERT model with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
Args:
encoder_projection_dropout (float):
......@@ -546,7 +569,7 @@ def hubert_xlarge(
Returns:
Wav2Vec2Model:
The resulting model.
"""
""" # noqa: E501
return wav2vec2_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
......
......@@ -126,7 +126,10 @@ def _convert_state_dict(state_dict):
def import_fairseq_model(original: Module) -> Wav2Vec2Model:
"""Build Wav2Vec2Model from pretrained parameters published by `fairseq`_.
# Overriding the signature so that the types are correct on Sphinx
"""import_fairseq_model(original: torch.nn.Module) -> torchaudio.models.Wav2Vec2Model
Build Wav2Vec2Model from the corresponding model object of `fairseq`_.
Args:
original (torch.nn.Module):
......
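A hedged sketch of the fairseq import path this hunk documents; the checkpoint file name is hypothetical and ``fairseq`` must be installed separately:

>>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
>>> from fairseq import checkpoint_utils
>>> models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
...     ["wav2vec_small_960h.pt"]  # hypothetical checkpoint path
... )
>>> imported = import_fairseq_model(models[0])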
......@@ -50,7 +50,9 @@ def _build(config, original):
def import_huggingface_model(original: Module) -> Wav2Vec2Model:
"""Import wav2vec2 model from Hugging Face's `Transformers`_.
"""import_huggingface_model(original: torch.nn.Module) -> torchaudio.models.Wav2Vec2Model
Build Wav2Vec2Model from the corresponding model object of Hugging Face's `Transformers`_.
Args:
original (torch.nn.Module): An instance of ``Wav2Vec2ForCTC`` from ``transformers``.
......
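Correspondingly, a sketch of the Transformers import; ``facebook/wav2vec2-base-960h`` is a published Hugging Face checkpoint, but any ``Wav2Vec2ForCTC`` instance should work the same way:

>>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
>>> from transformers import Wav2Vec2ForCTC
>>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = import_huggingface_model(original)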
......@@ -7,13 +7,13 @@ from torchaudio.models import Tacotron2
class _TextProcessor(ABC):
"""Interface of the text processing part of Tacotron2TTS pipeline"""
@property
@abstractmethod
def tokens(self):
"""The tokens that the each value in the processed tensor represent.
See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
:type: List[str]
"""
......@@ -21,6 +21,8 @@ class _TextProcessor(ABC):
def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
"""Encode the given (batch of) texts into numerical tensors
See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
Args:
text (str or list of str): The input texts.
......@@ -34,13 +36,13 @@ class _TextProcessor(ABC):
class _Vocoder(ABC):
"""Interface of the vocoder part of Tacotron2TTS pipeline"""
@property
@abstractmethod
def sample_rate(self):
"""The sample rate of the resulting waveform
See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
:type: float
"""
......@@ -48,6 +50,8 @@ class _Vocoder(ABC):
def __call__(self, specgrams: Tensor, lengths: Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]:
"""Generate waveform from the given input, such as spectrogram
See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
Args:
specgrams (Tensor):
The input spectrogram. Shape: `(batch, frequency bins, time)`.
......@@ -146,14 +150,21 @@ class Tacotron2TTSBundle(ABC):
# new text processing and vocoder will be added in the future, so we want to make these
# interfaces specific to this Tacotron2TTS pipeline.
class TextProcessor(_TextProcessor):
pass
"""Interface of the text processing part of Tacotron2TTS pipeline
See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
"""
class Vocoder(_Vocoder):
pass
"""Interface of the vocoder part of Tacotron2TTS pipeline
See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
"""
@abstractmethod
def get_text_processor(self, *, dl_kwargs=None) -> TextProcessor:
"""get_text_processor(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.TextProcessor:
# Overriding the signature so that the return type is correct on Sphinx
"""get_text_processor(self, *, dl_kwargs=None) -> torchaudio.pipelines.Tacotron2TTSBundle.TextProcessor
Create a text processor
......@@ -177,7 +188,7 @@ class Tacotron2TTSBundle(ABC):
Example - Character-based
>>> text = [
>>> "Hello, T T S !",
>>> "Hello World!",
>>> "Text-to-speech!",
>>> ]
>>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
......@@ -192,9 +203,9 @@ class Tacotron2TTSBundle(ABC):
>>> print(lengths)
tensor([12, 15], dtype=torch.int32)
>>>
>>> print([processor.tokens[i] for i in input[0]])
['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', '_', '_', '_']
>>> print([processor.tokens[i] for i in input[1]])
>>> print([processor.tokens[i] for i in input[0, :lengths[0]]])
['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
>>> print([processor.tokens[i] for i in input[1, :lengths[1]]])
['t', 'e', 'x', 't', '-', 't', 'o', '-', 's', 'p', 'e', 'e', 'c', 'h', '!']
Example - Phoneme-based
......@@ -224,7 +235,8 @@ class Tacotron2TTSBundle(ABC):
@abstractmethod
def get_vocoder(self, *, dl_kwargs=None) -> Vocoder:
"""get_vocoder(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.Vocoder:
# Overriding the signature so that the return type is correct on Sphinx
"""get_vocoder(self, *, dl_kwargs=None) -> torchaudio.pipelines.Tacotron2TTSBundle.Vocoder
Create a vocoder module, based on either WaveRNN or GriffinLim.
......@@ -244,7 +256,10 @@ class Tacotron2TTSBundle(ABC):
@abstractmethod
def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
"""Create a Tacotron2 model with pre-trained weight.
# Overriding the signature so that the return type is correct on Sphinx
"""get_tacotron2(self, *, dl_kwargs=None) -> torchaudio.models.Tacotron2
Create a Tacotron2 model with pre-trained weight.
Args:
dl_kwargs (dictionary of keyword arguments):
......
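Putting the three accessors documented above together, a hedged end-to-end sketch using the character-based bundle from the earlier example:

>>> import torch
>>> import torchaudio
>>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
>>> processor = bundle.get_text_processor()
>>> tacotron2 = bundle.get_tacotron2()
>>> vocoder = bundle.get_vocoder()
>>> tokens, lengths = processor("Hello World!")
>>> with torch.inference_mode():
...     specgrams, spec_lengths, _ = tacotron2.infer(tokens, lengths)
...     waveforms, wave_lengths = vocoder(specgrams, spec_lengths)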
......@@ -53,6 +53,7 @@ class Wav2Vec2Bundle:
return self._sample_rate
def get_model(self, *, dl_kwargs=None) -> Wav2Vec2Model:
# Overriding the signature so that the return type is correct on Sphinx
"""get_model(self, *, dl_kwargs=None) -> torchaudio.models.Wav2Vec2Model
Construct the model and load the pretrained weights.
......
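For context, a sketch of the bundle API this docstring belongs to, using the HUBERT_ASR_XLARGE bundle fixed earlier in this commit; the one-second dummy waveform is illustrative:

>>> import torch
>>> import torchaudio
>>> bundle = torchaudio.pipelines.HUBERT_ASR_XLARGE
>>> model = bundle.get_model()
>>> waveform = torch.randn(1, int(bundle.sample_rate))  # 1 s of dummy audio
>>> emissions, _ = model(waveform)  # frame-level label logits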