Unverified commit 420e84ee authored by moto, committed by GitHub

Update models/pipelines doc (#1894)

1. Override the signatures in docstrings so that Sphinx shows the exported symbols
   (output model types and input torch.nn.Module)
2. Tweak docs for Tacotron2TTSBundle interfaces
3. Fix for HUBERT_ASR_XLARGE
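
The signature overrides rely on Sphinx autodoc's docstring-signature handling (the `autodoc_docstring_signature` option, enabled by default): when the first line of a callable's docstring looks like `name(args) -> return_type`, autodoc displays that line as the signature and strips it from the rendered body. A minimal sketch of the pattern, with a hypothetical function and package name:

    def build_model(num_out: int) -> "Model":
        # Overriding the signature so that the fully-qualified return type
        # is what Sphinx renders; autodoc picks up the docstring's first line.
        """build_model(num_out: int) -> mypackage.models.Model

        Build a model. Everything after the first line is the doc body.
        """
        ...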
parent cb40dd72
@@ -167,7 +167,7 @@ HUBERT_ASR_XLARGE
 .. container:: py attribute

    .. autodata:: HUBERT_ASR_XLARGE
+      :no-value:

 Tacotron2 Text-To-Speech
 ------------------------
...
@@ -8,7 +8,9 @@ from . import components

 class Wav2Vec2Model(Module):
-    """Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
+    """torchaudio.models.Wav2Vec2Model(feature_extractor: torch.nn.Module, encoder: torch.nn.Module, aux: Optional[torch.nn.Module] = None)
+
+    Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].

     Note:
         To build the model, please use one of the factory functions.
@@ -23,7 +25,7 @@ class Wav2Vec2Model(Module):
         aux (torch.nn.Module or None, optional):
             Auxiliary module. If provided, the output from encoder is passed to this module.
-    """
+    """  # noqa: E501

     def __init__(
         self,
         feature_extractor: Module,
@@ -132,7 +134,10 @@ def wav2vec2_model(
         encoder_layer_drop: float,
         aux_num_out: Optional[int],
 ) -> Wav2Vec2Model:
-    """Build a custom Wav2Vec2Model
+    # Overriding the signature so that the return type is correct on Sphinx
+    """wav2vec2_model(extractor_mode: str, extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]], extractor_conv_bias: bool, encoder_embed_dim: int, encoder_projection_dropout: float, encoder_pos_conv_kernel: int, encoder_pos_conv_groups: int, encoder_num_layers: int, encoder_num_heads: int, encoder_attention_dropout: float, encoder_ff_interm_features: int, encoder_ff_interm_dropout: float, encoder_dropout: float, encoder_layer_norm_first: bool, encoder_layer_drop: float, aux_num_out: Optional[int]) -> torchaudio.models.Wav2Vec2Model
+
+    Build a custom Wav2Vec2Model

     Note:
         The "feature extractor" below corresponds to
@@ -287,7 +292,10 @@ def wav2vec2_base(
         encoder_layer_drop: float = 0.1,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """wav2vec2_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]

     Args:
         encoder_projection_dropout (float):
@@ -306,7 +314,7 @@ def wav2vec2_base(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode="group_norm",
         extractor_conv_layer_config=None,
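
The pre-configured builders reduce the call above to a few dropout knobs. A quick smoke test of the "base" builder (weights are randomly initialized; pretrained weights come from the pipeline bundles):

    import torch
    from torchaudio.models import wav2vec2_base

    model = wav2vec2_base()
    waveforms = torch.randn(2, 16000)     # two dummy 1-second clips at 16 kHz
    features, lengths = model(waveforms)  # without aux: (batch, frames, 768)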
@@ -335,7 +343,10 @@ def wav2vec2_large(
         encoder_layer_drop: float = 0.1,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """wav2vec2_large(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]

     Args:
         encoder_projection_dropout (float):
@@ -354,7 +365,7 @@ def wav2vec2_large(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode="group_norm",
         extractor_conv_layer_config=None,
@@ -383,7 +394,10 @@ def wav2vec2_large_lv60k(
         encoder_layer_drop: float = 0.1,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """wav2vec2_large_lv60k(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.1, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.1, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]

     Args:
         encoder_projection_dropout (float):
@@ -402,7 +416,7 @@ def wav2vec2_large_lv60k(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode="layer_norm",
         extractor_conv_layer_config=None,
@@ -431,7 +445,10 @@ def hubert_base(
         encoder_layer_drop: float = 0.05,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build HuBERT model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """hubert_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build HuBERT model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

     Args:
         encoder_projection_dropout (float):
@@ -450,7 +467,7 @@ def hubert_base(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode='group_norm',
         extractor_conv_layer_config=None,
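
For ASR fine-tuning, `aux_num_out` attaches a linear read-out layer on top of the encoder; a sketch in which 29 is a hypothetical CTC vocabulary size:

    import torch
    from torchaudio.models import hubert_base

    model = hubert_base(aux_num_out=29)  # 29 output labels, e.g. for CTC
    waveforms = torch.randn(1, 16000)
    logits, lengths = model(waveforms)   # logits: (batch, frames, 29)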
@@ -479,7 +496,10 @@ def hubert_large(
         encoder_layer_drop: float = 0.0,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build HuBERT model with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """hubert_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build HuBERT model with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

     Args:
         encoder_projection_dropout (float):
@@ -498,7 +518,7 @@ def hubert_large(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode='layer_norm',
         extractor_conv_layer_config=None,
@@ -527,7 +547,10 @@ def hubert_xlarge(
         encoder_layer_drop: float = 0.0,
         aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-    """Build HuBERT model with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+    # Overriding the signature so that the return type is correct on Sphinx
+    """hubert_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, aux_num_out: Optional[int] = None) -> torchaudio.models.Wav2Vec2Model
+
+    Build HuBERT model with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

     Args:
         encoder_projection_dropout (float):
@@ -546,7 +569,7 @@ def hubert_xlarge(
     Returns:
         Wav2Vec2Model:
             The resulting model.
-    """
+    """  # noqa: E501

     return wav2vec2_model(
         extractor_mode='layer_norm',
         extractor_conv_layer_config=None,
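
Since every builder returns the same Wav2Vec2Model class, comparing the variants is straightforward; a rough sketch (note that instantiating the xlarge variant allocates on the order of a billion parameters):

    from torchaudio.models import hubert_base, hubert_large, hubert_xlarge

    for build in (hubert_base, hubert_large, hubert_xlarge):
        model = build()
        n_params = sum(p.numel() for p in model.parameters())
        print(f"{build.__name__}: {n_params / 1e6:.0f}M parameters")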
...
@@ -126,7 +126,10 @@ def _convert_state_dict(state_dict):

 def import_fairseq_model(original: Module) -> Wav2Vec2Model:
-    """Build Wav2Vec2Model from pretrained parameters published by `fairseq`_.
+    # Overriding the signature so that the types are correct on Sphinx
+    """import_fairseq_model(original: torch.nn.Module) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model from the corresponding model object of `fairseq`_.

     Args:
         original (torch.nn.Module):
...
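
A sketch of the fairseq import path; the checkpoint filename is illustrative, and loading assumes fairseq's usual checkpoint utilities:

    import fairseq
    from torchaudio.models.wav2vec2.utils import import_fairseq_model

    # Load the original fairseq model object (path is hypothetical).
    models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        ["wav2vec_small.pt"]
    )
    imported = import_fairseq_model(models[0])
    imported.eval()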
@@ -50,7 +50,9 @@ def _build(config, original):

 def import_huggingface_model(original: Module) -> Wav2Vec2Model:
-    """Import wav2vec2 model from Hugging Face's `Transformers`_.
+    """import_huggingface_model(original: torch.nn.Module) -> torchaudio.models.Wav2Vec2Model
+
+    Build Wav2Vec2Model from the corresponding model object of Hugging Face's `Transformers`_.

     Args:
         original (torch.nn.Module): An instance of ``Wav2Vec2ForCTC`` from ``transformers``.
...
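
Likewise for the Transformers path; the checkpoint name below is one published wav2vec2 CTC model and serves only as an example:

    from transformers import Wav2Vec2ForCTC
    from torchaudio.models.wav2vec2.utils import import_huggingface_model

    original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    model = import_huggingface_model(original)
    model.eval()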
@@ -7,13 +7,13 @@ from torchaudio.models import Tacotron2

 class _TextProcessor(ABC):
-    """Interface of the text processing part of Tacotron2TTS pipeline"""
     @property
     @abstractmethod
     def tokens(self):
         """The tokens that the each value in the processed tensor represent.

+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.

         :type: List[str]
         """
@@ -21,6 +21,8 @@ class _TextProcessor(ABC):
     def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
         """Encode the given (batch of) texts into numerical tensors

+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
+
         Args:
             text (str or list of str): The input texts.
@@ -34,13 +36,13 @@ class _TextProcessor(ABC):

 class _Vocoder(ABC):
-    """Interface of the vocoder part of Tacotron2TTS pipeline"""
     @property
     @abstractmethod
     def sample_rate(self):
         """The sample rate of the resulting waveform

+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.

         :type: float
         """
@@ -48,6 +50,8 @@ class _Vocoder(ABC):
     def __call__(self, specgrams: Tensor, lengths: Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]:
         """Generate waveform from the given input, such as spectrogram

+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
+
         Args:
             specgrams (Tensor):
                 The input spectrogram. Shape: `(batch, frequency bins, time)`.
@@ -146,14 +150,21 @@ class Tacotron2TTSBundle(ABC):
     # new text processing and vocoder will be added in the future, so we want to make these
     # interfaces specific to this Tacotron2TTS pipeline.
     class TextProcessor(_TextProcessor):
-        pass
+        """Interface of the text processing part of Tacotron2TTS pipeline
+
+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
+        """

     class Vocoder(_Vocoder):
-        pass
+        """Interface of the vocoder part of Tacotron2TTS pipeline
+
+        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
+        """

     @abstractmethod
     def get_text_processor(self, *, dl_kwargs=None) -> TextProcessor:
-        """get_text_processor(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.TextProcessor:
+        # Overriding the signature so that the return type is correct on Sphinx
+        """get_text_processor(self, *, dl_kwargs=None) -> torchaudio.pipelines.Tacotron2TTSBundle.TextProcessor

         Create a text processor
@@ -177,7 +188,7 @@ class Tacotron2TTSBundle(ABC):
         Example - Character-based

             >>> text = [
-            >>>     "Hello, T T S !",
+            >>>     "Hello World!",
             >>>     "Text-to-speech!",
             >>> ]
             >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
@@ -192,9 +203,9 @@ class Tacotron2TTSBundle(ABC):
             >>> print(lengths)
             tensor([12, 15], dtype=torch.int32)
             >>>
-            >>> print([processor.tokens[i] for i in input[0]])
-            ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', '_', '_', '_']
-            >>> print([processor.tokens[i] for i in input[1]])
+            >>> print([processor.tokens[i] for i in input[0, :lengths[0]]])
+            ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
+            >>> print([processor.tokens[i] for i in input[1, :lengths[1]]])
             ['t', 'e', 'x', 't', '-', 't', 'o', '-', 's', 'p', 'e', 'e', 'c', 'h', '!']

         Example - Phoneme-based
@@ -224,7 +235,8 @@ class Tacotron2TTSBundle(ABC):
     @abstractmethod
     def get_vocoder(self, *, dl_kwargs=None) -> Vocoder:
-        """get_vocoder(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.Vocoder:
+        # Overriding the signature so that the return type is correct on Sphinx
+        """get_vocoder(self, *, dl_kwargs=None) -> torchaudio.pipelines.Tacotron2TTSBundle.Vocoder

         Create a vocoder module, based off of either WaveRNN or GriffinLim.
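
A sketch of driving the vocoder half of a bundle on its own; the spectrogram is random and exists only to show the expected shapes:

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
    vocoder = bundle.get_vocoder()

    specgram = torch.randn(1, 80, 100)  # (batch, frequency bins, time)
    lengths = torch.tensor([100])
    with torch.inference_mode():
        waveforms, out_lengths = vocoder(specgram, lengths)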
@@ -244,7 +256,10 @@ class Tacotron2TTSBundle(ABC):
     @abstractmethod
     def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
-        """Create a Tacotron2 model with pre-trained weight.
+        # Overriding the signature so that the return type is correct on Sphinx
+        """get_tacotron2(self, *, dl_kwargs=None) -> torchaudio.models.Tacotron2
+
+        Create a Tacotron2 model with pre-trained weight.

         Args:
             dl_kwargs (dictionary of keyword arguments):
...
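
Putting the three pieces together, a sketch of the full text-to-speech pipeline these interfaces describe (the output filename is arbitrary):

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
    processor = bundle.get_text_processor()
    tacotron2 = bundle.get_tacotron2()
    vocoder = bundle.get_vocoder()

    with torch.inference_mode():
        tokens, token_lengths = processor("Hello World!")
        specgram, spec_lengths, _ = tacotron2.infer(tokens, token_lengths)
        waveforms, _ = vocoder(specgram, spec_lengths)

    torchaudio.save("output.wav", waveforms[0:1], int(vocoder.sample_rate))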
@@ -53,6 +53,7 @@ class Wav2Vec2Bundle:
         return self._sample_rate

     def get_model(self, *, dl_kwargs=None) -> Wav2Vec2Model:
+        # Overriding the signature so that the return type is correct on Sphinx
         """get_model(self, *, dl_kwargs=None) -> torchaudio.models.Wav2Vec2Model

         Construct the model and load the pretrained weight.
...
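
And a sketch of the corresponding ASR side, where get_model returns the Wav2Vec2Model whose signature is overridden above (the audio path is hypothetical):

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
    model = bundle.get_model()

    waveform, sample_rate = torchaudio.load("speech.wav")
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, int(bundle.sample_rate))

    with torch.inference_mode():
        emissions, _ = model(waveform)  # (batch, frames, number of labels)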