[DOC] Standardization and minor fixes (#1892)

cb40dd72 · Caroline Chen · GitHub · 955cdbdc · cb40dd72 · cb40dd72
Unverified Commit cb40dd72 authored Oct 18, 2021 by Caroline Chen Committed by GitHub Oct 18, 2021
20 changed files
--- a/torchaudio/backend/soundfile_backend.py
+++ b/torchaudio/backend/soundfile_backend.py
@@ -146,7 +146,7 @@ def load(
        * SPHERE

    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
-    ``float32`` dtype and the shape of ``[channel, time]``.
+    ``float32`` dtype and the shape of `[channel, time]`.
    The samples are normalized to fit in the range of ``[-1.0, 1.0]``.

    When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
@@ -182,16 +182,16 @@ def load(
            integer type.
            This argument has no effect for formats other than integer WAV type.
        channels_first (bool, optional):
-            When True, the returned Tensor has dimension ``[channel, time]``.
-            Otherwise, the returned Tensor's dimension is ``[time, channel]``.
+            When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
-        Tuple[torch.Tensor, int]: Resulting Tensor and sample rate.
+        (torch.Tensor, int): Resulting Tensor and sample rate.
            If the input file has integer wav format and normalization is off, then it has
            integer type, else ``float32`` type. If ``channels_first=True``, it has
-            ``[channel, time]`` else ``[time, channel]``.
+            `[channel, time]` else `[time, channel]`.
    """
    with soundfile.SoundFile(filepath, "r") as file_:
        if file_.format != "WAV" or normalize:
@@ -335,8 +335,8 @@ def save(
        filepath (str or pathlib.Path): Path to audio file.
        src (torch.Tensor): Audio data to save. must be 2D tensor.
        sample_rate (int): sampling rate
-        channels_first (bool, optional): If ``True``, the given tensor is interpreted as ``[channel, time]``,
-            otherwise ``[time, channel]``.
+        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
+            otherwise `[time, channel]`.
        compression (float or None, optional): Not used.
            It is here only for interface compatibility reason with "sox_io" backend.
        format (str or None, optional): Override the audio format.

--- a/torchaudio/backend/sox_io_backend.py
+++ b/torchaudio/backend/sox_io_backend.py
@@ -89,7 +89,7 @@ def load(
        and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc.

    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
-    ``float32`` dtype and the shape of ``[channel, time]``.
+    ``float32`` dtype and the shape of `[channel, time]`.
    The samples are normalized to fit in the range of ``[-1.0, 1.0]``.

    When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
@@ -131,18 +131,18 @@ def load(
            integer type.
            This argument has no effect for formats other than integer WAV type.
        channels_first (bool, optional):
-            When True, the returned Tensor has dimension ``[channel, time]``.
-            Otherwise, the returned Tensor's dimension is ``[time, channel]``.
+            When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Override the format detection with the given format.
            Providing the argument might help when libsox can not infer the format
            from header or extension.

    Returns:
-        Tuple[torch.Tensor, int]: Resulting Tensor and sample rate.
+        (torch.Tensor, int): Resulting Tensor and sample rate.
            If the input file has integer wav format and normalization is off, then it has
            integer type, else ``float32`` type. If ``channels_first=True``, it has
-            ``[channel, time]`` else ``[time, channel]``.
+            `[channel, time]` else `[time, channel]`.
    """
    if not torch.jit.is_scripting():
        if hasattr(filepath, 'read'):
@@ -172,8 +172,8 @@ def save(
            as ``str`` for TorchScript compiler compatibility.
        src (torch.Tensor): Audio data to save. must be 2D tensor.
        sample_rate (int): sampling rate
-        channels_first (bool, optional): If ``True``, the given tensor is interpreted as ``[channel, time]``,
-            otherwise ``[time, channel]``.
+        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
+            otherwise `[time, channel]`.
        compression (float or None, optional): Used for formats other than WAV.
            This corresponds to ``-C`` option of ``sox`` command.


--- a/torchaudio/datasets/cmuarctic.py
+++ b/torchaudio/datasets/cmuarctic.py
@@ -164,7 +164,7 @@ class CMUARCTIC(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, transcript, utterance_id)``
+            (Tensor, int, str, str): ``(waveform, sample_rate, transcript, utterance_id)``
        """
        line = self._walker[n]
        return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio)

--- a/torchaudio/datasets/cmudict.py
+++ b/torchaudio/datasets/cmudict.py
@@ -167,7 +167,7 @@ class CMUDict(Dataset):
            n (int): The index of the sample to be loaded.

        Returns:
-            tuple: The corresponding word and phonemes ``(word, [phonemes])``.
+            (str, List[str]): The corresponding word and phonemes ``(word, [phonemes])``.

        """
        return self._dictionary[n]

--- a/torchaudio/datasets/commonvoice.py
+++ b/torchaudio/datasets/commonvoice.py
@@ -65,8 +65,8 @@ class COMMONVOICE(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, dictionary)``,  where dictionary is built
-            from the TSV file with the following keys: ``client_id``, ``path``, ``sentence``,
+            (Tensor, int, Dict[str, str]): ``(waveform, sample_rate, dictionary)``,  where dictionary
+            is built from the TSV file with the following keys: ``client_id``, ``path``, ``sentence``,
            ``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``.
        """
        line = self._walker[n]

--- a/torchaudio/datasets/dr_vctk.py
+++ b/torchaudio/datasets/dr_vctk.py
@@ -107,8 +107,9 @@ class DR_VCTK(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform_clean, sample_rate_clean, waveform_noisy, sample_rate_noisy, speaker_id, utterance_id,\
-                source, channel_id)``
+            (Tensor, int, Tensor, int, str, str, str, int):
+            ``(waveform_clean, sample_rate_clean, waveform_noisy, sample_rate_noisy, speaker_id,\
+                utterance_id, source, channel_id)``
        """
        filename = self._filename_list[n]
        return self._load_dr_vctk_item(filename)

--- a/torchaudio/datasets/gtzan.py
+++ b/torchaudio/datasets/gtzan.py
@@ -1102,7 +1102,7 @@ class GTZAN(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, label)``
+            (Tensor, int, str): ``(waveform, sample_rate, label)``
        """
        fileid = self._walker[n]
        item = load_gtzan_item(fileid, self._path, self._ext_audio)

--- a/torchaudio/datasets/librimix.py
+++ b/torchaudio/datasets/librimix.py
@@ -84,6 +84,6 @@ class LibriMix(Dataset):
        Args:
            key (int): The index of the sample to be loaded
        Returns:
-            tuple: ``(sample_rate, mix_waveform, list_of_source_waveforms)``
+            (int, Tensor, List[Tensor]): ``(sample_rate, mix_waveform, list_of_source_waveforms)``
        """
        return self._load_sample(self.files[key])
--- a/torchaudio/datasets/librispeech.py
+++ b/torchaudio/datasets/librispeech.py
@@ -133,7 +133,8 @@ class LIBRISPEECH(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
+            (Tensor, int, str, int, int, int):
+            ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
        """
        fileid = self._walker[n]
        return load_librispeech_item(fileid, self._path, self._ext_audio, self._ext_txt)

--- a/torchaudio/datasets/libritts.py
+++ b/torchaudio/datasets/libritts.py
@@ -134,8 +134,8 @@ class LIBRITTS(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, original_text, normalized_text, speaker_id,
-            chapter_id, utterance_id)``
+            (Tensor, int, str, str, int, int, str):
+            ``(waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id)``
        """
        fileid = self._walker[n]
        return load_libritts_item(

--- a/torchaudio/datasets/ljspeech.py
+++ b/torchaudio/datasets/ljspeech.py
@@ -68,7 +68,8 @@ class LJSPEECH(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, transcript, normalized_transcript)``
+            (Tensor, int, str, str):
+            ``(waveform, sample_rate, transcript, normalized_transcript)``
        """
        line = self._flist[n]
        fileid, transcript, normalized_transcript = line

--- a/torchaudio/datasets/speechcommands.py
+++ b/torchaudio/datasets/speechcommands.py
@@ -138,7 +138,8 @@ class SPEECHCOMMANDS(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, label, speaker_id, utterance_number)``
+            (Tensor, int, str, str, int):
+            ``(waveform, sample_rate, label, speaker_id, utterance_number)``
        """
        fileid = self._walker[n]
        return load_speechcommands_item(fileid, self._path)

--- a/torchaudio/datasets/tedlium.py
+++ b/torchaudio/datasets/tedlium.py
@@ -127,7 +127,8 @@ class TEDLIUM(Dataset):
            path (str): Dataset root path

        Returns:
-            tuple: ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``
+            (Tensor, int, str, int, int, int):
+            ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``
        """
        transcript_path = os.path.join(path, "stm", fileid)
        with open(transcript_path + ".stm") as f:

--- a/torchaudio/datasets/utils.py
+++ b/torchaudio/datasets/utils.py
@@ -151,7 +151,7 @@ def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo
        overwrite (bool, optional): overwrite existing files (Default: ``False``)

    Returns:
-        list: List of paths to extracted files even if not overwritten.
+        List[str]: List of paths to extracted files even if not overwritten.

    Examples:
        >>> url = 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz'

--- a/torchaudio/datasets/vctk.py
+++ b/torchaudio/datasets/vctk.py
@@ -134,7 +134,8 @@ class VCTK_092(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, transcript, speaker_id, utterance_id)``
+            (Tensor, int, str, str, str):
+            ``(waveform, sample_rate, transcript, speaker_id, utterance_id)``
        """
        speaker_id, utterance_id = self._sample_ids[n]
        return self._load_sample(speaker_id, utterance_id, self._mic_id)

--- a/torchaudio/datasets/yesno.py
+++ b/torchaudio/datasets/yesno.py
@@ -77,7 +77,7 @@ class YESNO(Dataset):
            n (int): The index of the sample to be loaded

        Returns:
-            tuple: ``(waveform, sample_rate, labels)``
+            (Tensor, int, List[int]): ``(waveform, sample_rate, labels)``
        """
        fileid = self._walker[n]
        item = self._load_item(fileid, self._path)

--- a/torchaudio/functional/filtering.py
+++ b/torchaudio/functional/filtering.py
@@ -663,7 +663,7 @@ def filtfilt(

    Returns:
        Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
-                are 2D Tensors, or `(..., time)` otherwise.
+        are 2D Tensors, or `(..., time)` otherwise.
    """
    forward_filtered = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True)
    backward_filtered = lfilter(
@@ -987,7 +987,7 @@ def lfilter(

    Returns:
        Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
-                are 2D Tensors, or `(..., time)` otherwise.
+        are 2D Tensors, or `(..., time)` otherwise.
    """
    assert a_coeffs.size() == b_coeffs.size()
    assert a_coeffs.ndim <= 2
@@ -1474,7 +1474,7 @@ def vad(
            in the detector algorithm. (Default: 2000.0)

    Returns:
-        Tensor: Tensor of audio of dimension (..., time).
+        Tensor: Tensor of audio of dimension `(..., time)`.

    Reference:
        - http://sox.sourceforge.net/sox.html

--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -263,7 +263,7 @@ def griffinlim(
        rand_init (bool): Initializes phase randomly if True, to zero otherwise.

    Returns:
-        torch.Tensor: waveform of `(..., time)`, where time equals the ``length`` parameter if given.
+        Tensor: waveform of `(..., time)`, where time equals the ``length`` parameter if given.
    """
    assert momentum < 1, 'momentum={} > 1 can be unstable'.format(momentum)
    assert momentum >= 0, 'momentum={} < 0'.format(momentum)
@@ -1369,7 +1369,7 @@ def apply_codec(
            For more details see :py:func:`torchaudio.backend.sox_io_backend.save`.

    Returns:
-        torch.Tensor: Resulting Tensor.
+        Tensor: Resulting Tensor.
        If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`.
    """
    bytes = io.BytesIO()

--- a/torchaudio/models/conv_tasnet.py
+++ b/torchaudio/models/conv_tasnet.py
@@ -154,7 +154,7 @@ class MaskGenerator(torch.nn.Module):
            input (torch.Tensor): 3D Tensor with shape [batch, features, frames]

        Returns:
-            torch.Tensor: shape [batch, num_sources, features, frames]
+            Tensor: shape [batch, num_sources, features, frames]
        """
        batch_size = input.shape[0]
        feats = self.input_norm(input)
@@ -264,7 +264,7 @@ class ConvTasNet(torch.nn.Module):
            input (torch.Tensor): 3D Tensor with shape (batch_size, channels==1, frames)

        Returns:
-            torch.Tensor: Padded Tensor
+            Tensor: Padded Tensor
            int: Number of paddings performed
        """
        batch_size, num_channels, num_frames = input.shape
@@ -291,7 +291,7 @@ class ConvTasNet(torch.nn.Module):
            input (torch.Tensor): 3D Tensor with shape [batch, channel==1, frames]

        Returns:
-            torch.Tensor: 3D Tensor with shape [batch, channel==num_sources, frames]
+            Tensor: 3D Tensor with shape [batch, channel==num_sources, frames]
        """
        if input.ndim != 3 or input.shape[1] != 1:
            raise ValueError(

--- a/torchaudio/models/tacotron2.py
+++ b/torchaudio/models/tacotron2.py
@@ -1031,7 +1031,7 @@ class Tacotron2(nn.Module):
            mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.

        Returns:
-            Tensor, Tensor, Tensor, and Tensor:
+            (Tensor, Tensor, Tensor, Tensor):
                Tensor
                    Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor