Commit 486022e9 authored by hwangjeff, committed by hwangjeff

Update Tacotron2 docs (#1840)

parent 9bbd4600
@@ -31,8 +31,8 @@ Tacotron2
 Model
 -----
-Tacotoron2
-^^^^^^^^^^
+Tacotron2
+^^^^^^^^^
 .. autoclass:: Tacotron2
@@ -1091,21 +1091,23 @@ class Tacotron2(nn.Module):
         The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``.
         Args:
-            text (Tensor): The input text to Tacotron2 with shape (n_batch, max of ``text_lengths``).
-            text_lengths (Tensor): The length of each text with shape (n_batch).
+            text (Tensor): The input text to Tacotron2 with shape `(n_batch, max of text_lengths)`.
+            text_lengths (Tensor): The length of each text with shape `(n_batch, )`.
             mel_specgram (Tensor): The target mel spectrogram
-                with shape (n_batch, n_mels, max of ``mel_specgram_lengths``).
-            mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape (n_batch).
+                with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
+            mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.
         Returns:
-            mel_specgram (Tensor): Mel spectrogram before Postnet
-                with shape (n_batch, n_mels, max of ``mel_specgram_lengths``).
-            mel_specgram_postnet (Tensor): Mel spectrogram after Postnet
-                with shape (n_batch, n_mels, max of ``mel_specgram_lengths``).
-            stop_token (Tensor): The output for stop token at each time step
-                with shape (n_batch, max of ``mel_specgram_lengths``).
-            alignment (Tensor): Sequence of attention weights from the decoder
-                with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``).
+            Tensor, Tensor, Tensor, and Tensor:
+                Tensor
+                    Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
+                Tensor
+                    Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
+                Tensor
+                    The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`.
+                Tensor
+                    Sequence of attention weights from the decoder with
+                    shape `(n_batch, max of mel_specgram_lengths, max of text_lengths)`.
         """
         embedded_inputs = self.embedding(text).transpose(1, 2)
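The "padded with zeros to length max of ``text_lengths``" convention from the docstring above can be sketched in plain Python (`pad_batch` is an illustrative helper, not part of torchaudio):

```python
def pad_batch(seqs, pad_value=0):
    """Right-pad variable-length sequences to a common length and
    return the padded batch plus the original lengths (text_lengths)."""
    lengths = [len(s) for s in seqs]
    max_len = max(lengths)
    padded = [list(s) + [pad_value] * (max_len - len(s)) for s in seqs]
    return padded, lengths

# A batch of two token sequences of lengths 3 and 1 is padded to length 3:
padded, lengths = pad_batch([[7, 8, 9], [5]])
# padded  -> [[7, 8, 9], [5, 0, 0]]
# lengths -> [3, 1]
```

In torchaudio itself, both ``text`` and ``mel_specgram`` are Tensors padded this way, with the per-item lengths carried separately in ``text_lengths`` and ``mel_specgram_lengths``.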
@@ -1139,17 +1141,19 @@ class Tacotron2(nn.Module):
         The input `text` should be padded with zeros to length max of ``text_lengths``.
         Args:
-            text (Tensor): The input text to Tacotron2 with shape (n_batch, max of ``text_lengths``).
+            text (Tensor): The input text to Tacotron2 with shape `(n_batch, max of text_lengths)`.
             text_lengths (Tensor or None, optional): The length of each text with shape `(n_batch, )`.
                 If ``None``, it is assumed that all the texts are valid. Default: ``None``
-        Return:
-            mel_specgram (Tensor): The predicted mel spectrogram
-                with shape (n_batch, n_mels, max of ``mel_specgram_lengths.max()``).
-            mel_specgram_lengths (Tensor): The length of the predicted mel spectrogram
-                with shape (n_batch, ).
-            alignments (Tensor): Sequence of attention weights from the decoder
-                with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``).
+        Returns:
+            Tensor, Tensor, and Tensor:
+                Tensor
+                    The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
+                Tensor
+                    The length of the predicted mel spectrogram with shape `(n_batch, )`.
+                Tensor
+                    Sequence of attention weights from the decoder with shape
+                    `(n_batch, max of mel_specgram_lengths, max of text_lengths)`.
         """
         n_batch, max_length = text.shape
         if text_lengths is None:
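The ``text_lengths is None`` fallback documented above (and begun in the ``n_batch, max_length = text.shape`` context lines) can be sketched in plain Python; `resolve_text_lengths` is an illustrative stand-in for the Tensor-based logic in `infer`:

```python
def resolve_text_lengths(text_shape, text_lengths=None):
    """Mimic Tacotron2.infer's fallback: when no lengths are given,
    assume every token in each padded row of `text` is valid, i.e.
    every sequence has the full padded length."""
    n_batch, max_length = text_shape
    if text_lengths is None:
        text_lengths = [max_length] * n_batch
    return text_lengths

# With no lengths supplied, a (2, 5) batch is treated as two
# fully-valid sequences of length 5:
resolve_text_lengths((2, 5))          # -> [5, 5]
resolve_text_lengths((2, 5), [3, 5])  # -> [3, 5] (explicit lengths pass through)
```

In the actual implementation this would be a Tensor (e.g. filled with ``max_length``) rather than a Python list, but the control flow is the same.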