Update descriptions of `lengths` parameters (#1890)

211270db · moto · GitHub · 89aeb686 · 211270db · 211270db
Unverified Commit 211270db authored Oct 16, 2021 by moto Committed by GitHub Oct 16, 2021
Showing with 33 additions and 12 deletions

torchaudio/models/tacotron2.py torchaudio/models/tacotron2.py +1 -1

torchaudio/models/wav2vec2/model.py torchaudio/models/wav2vec2/model.py +21 -7

torchaudio/models/wavernn.py torchaudio/models/wavernn.py +11 -4

No files found.
--- a/torchaudio/models/tacotron2.py
+++ b/torchaudio/models/tacotron2.py
@@ -1080,7 +1080,7 @@ class Tacotron2(nn.Module):
                If ``None``, it is assumed that all the tokens are valid. Default: ``None``

        Returns:
-            Tensor, Tensor, and Tensor:
+            (Tensor, Tensor, Tensor):
                Tensor
                    The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor

--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -50,8 +50,14 @@ class Wav2Vec2Model(Module):
        Args:
            waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
            lengths (Tensor or None, optional):
-                Indicates the valid length of each audio sample in the batch.
+                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
+                When the ``waveforms`` contains audios with different durations,
+                by providing ``lengths`` argument, the model will compute
+                the corresponding valid output lengths and apply proper mask in
+                transformer attention layer.
+                If ``None``, it is assumed that the entire audio waveform
+                length is valid.
            num_layers (int or None, optional):
                If given, limit the number of intermediate layers to go through.
                Providing `1` will stop the computation after going through one
@@ -59,13 +65,14 @@ class Wav2Vec2Model(Module):
                intermediate layers are returned.

        Returns:
-            List of Tensors and an optional Tensor:
+            (List[Tensor], Optional[Tensor]):
            List of Tensors
                Features from requested layers.
-                Each Tensor is of shape: `(batch, frames, feature dimention)`
+                Each Tensor is of shape: `(batch, time frame, feature dimension)`
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
-                is retuned. It indicates the valid length of each feature in the batch.
+                is returned.
+                It indicates the valid length in time axis of each feature Tensor.
        """
        x, lengths = self.feature_extractor(waveforms, lengths)
        x = self.encoder.extract_features(x, lengths, num_layers)
@@ -81,17 +88,24 @@ class Wav2Vec2Model(Module):
        Args:
            waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
            lengths (Tensor or None, optional):
-                Indicates the valid length of each audio sample in the batch.
+                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
+                When the ``waveforms`` contains audios with different durations,
+                by providing ``lengths`` argument, the model will compute
+                the corresponding valid output lengths and apply proper mask in
+                transformer attention layer.
+                If ``None``, it is assumed that all the audio in ``waveforms``
+                have valid length. Default: ``None``.

        Returns:
-            Tensor and an optional Tensor:
+            (Tensor, Optional[Tensor]):
            Tensor
                The sequences of probability distribution (in logit) over labels.
                Shape: `(batch, frames, num labels)`.
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
-                is retuned. It indicates the valid length of each feature in the batch.
+                is returned.
+                It indicates the valid length in time axis of the output Tensor.
        """
        x, lengths = self.feature_extractor(waveforms, lengths)
        x = self.encoder(x, lengths)

--- a/torchaudio/models/wavernn.py
+++ b/torchaudio/models/wavernn.py
@@ -341,16 +341,23 @@ class WaveRNN(nn.Module):
            specgram (Tensor):
                Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
            lengths (Tensor or None, optional):
-                Indicates the valid length in of each spectrogram in time axis.
-                Shape: `(n_batch, )`.
+                Indicates the valid length of each spectrogram in the batch.
+                Shape: `(n_batch, )`.
+                When the ``specgram`` contains spectrograms with different durations,
+                by providing ``lengths`` argument, the model will compute
+                the corresponding valid output lengths.
+                If ``None``, it is assumed that all the spectrograms in ``specgram``
+                have valid length. Default: ``None``.

        Returns:
-            Tensor and optional Tensor:
+            (Tensor, Optional[Tensor]):
            Tensor
                The inferred waveform of size `(n_batch, 1, n_time)`.
                1 stands for a single channel.
            Tensor or None
-                The valid lengths of each waveform in the batch. Size `(n_batch, )`.
+                If ``lengths`` argument was provided, a Tensor of shape `(n_batch, )`
+                is returned.
+                It indicates the valid length in time axis of the output Tensor.
        """

        device = specgram.device