Unverified Commit cb40dd72 authored by Caroline Chen's avatar Caroline Chen Committed by GitHub
Browse files

[DOC] Standardization and minor fixes (#1892)

parent 955cdbdc
......@@ -146,7 +146,7 @@ def load(
* SPHERE
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
``float32`` dtype and the shape of ``[channel, time]``.
``float32`` dtype and the shape of `[channel, time]`.
The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
......@@ -182,16 +182,16 @@ def load(
integer type.
This argument has no effect for formats other than integer WAV type.
channels_first (bool, optional):
When True, the returned Tensor has dimension ``[channel, time]``.
Otherwise, the returned Tensor's dimension is ``[time, channel]``.
When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
Not used. PySoundFile does not accept format hint.
Returns:
Tuple[torch.Tensor, int]: Resulting Tensor and sample rate.
(torch.Tensor, int): Resulting Tensor and sample rate.
If the input file has integer wav format and normalization is off, then it has
integer type, else ``float32`` type. If ``channels_first=True``, it has
``[channel, time]`` else ``[time, channel]``.
`[channel, time]` else `[time, channel]`.
"""
with soundfile.SoundFile(filepath, "r") as file_:
if file_.format != "WAV" or normalize:
......@@ -335,8 +335,8 @@ def save(
filepath (str or pathlib.Path): Path to audio file.
src (torch.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate
channels_first (bool, optional): If ``True``, the given tensor is interpreted as ``[channel, time]``,
otherwise ``[time, channel]``.
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
compression (float of None, optional): Not used.
It is here only for interface compatibility reson with "sox_io" backend.
format (str or None, optional): Override the audio format.
......
......@@ -89,7 +89,7 @@ def load(
and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc.
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
``float32`` dtype and the shape of ``[channel, time]``.
``float32`` dtype and the shape of `[channel, time]`.
The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
......@@ -131,18 +131,18 @@ def load(
integer type.
This argument has no effect for formats other than integer WAV type.
channels_first (bool, optional):
When True, the returned Tensor has dimension ``[channel, time]``.
Otherwise, the returned Tensor's dimension is ``[time, channel]``.
When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
Override the format detection with the given format.
Providing the argument might help when libsox can not infer the format
from header or extension,
Returns:
Tuple[torch.Tensor, int]: Resulting Tensor and sample rate.
(torch.Tensor, int): Resulting Tensor and sample rate.
If the input file has integer wav format and normalization is off, then it has
integer type, else ``float32`` type. If ``channels_first=True``, it has
``[channel, time]`` else ``[time, channel]``.
`[channel, time]` else `[time, channel]`.
"""
if not torch.jit.is_scripting():
if hasattr(filepath, 'read'):
......@@ -172,8 +172,8 @@ def save(
as ``str`` for TorchScript compiler compatibility.
src (torch.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate
channels_first (bool, optional): If ``True``, the given tensor is interpreted as ``[channel, time]``,
otherwise ``[time, channel]``.
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
compression (float or None, optional): Used for formats other than WAV.
This corresponds to ``-C`` option of ``sox`` command.
......
......@@ -164,7 +164,7 @@ class CMUARCTIC(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, transcript, utterance_id)``
(Tensor, int, str, str): ``(waveform, sample_rate, transcript, utterance_id)``
"""
line = self._walker[n]
return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio)
......
......@@ -167,7 +167,7 @@ class CMUDict(Dataset):
n (int): The index of the sample to be loaded.
Returns:
tuple: The corresponding word and phonemes ``(word, [phonemes])``.
(str, List[str]): The corresponding word and phonemes ``(word, [phonemes])``.
"""
return self._dictionary[n]
......
......@@ -65,8 +65,8 @@ class COMMONVOICE(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, dictionary)``, where dictionary is built
from the TSV file with the following keys: ``client_id``, ``path``, ``sentence``,
(Tensor, int, Dict[str, str]): ``(waveform, sample_rate, dictionary)``, where dictionary
is built from the TSV file with the following keys: ``client_id``, ``path``, ``sentence``,
``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``.
"""
line = self._walker[n]
......
......@@ -107,8 +107,9 @@ class DR_VCTK(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform_clean, sample_rate_clean, waveform_noisy, sample_rate_noisy, speaker_id, utterance_id,\
source, channel_id)``
(Tensor, int, Tensor, int, str, str, str, int):
``(waveform_clean, sample_rate_clean, waveform_noisy, sample_rate_noisy, speaker_id,\
utterance_id, source, channel_id)``
"""
filename = self._filename_list[n]
return self._load_dr_vctk_item(filename)
......
......@@ -1102,7 +1102,7 @@ class GTZAN(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, label)``
(Tensor, int, str): ``(waveform, sample_rate, label)``
"""
fileid = self._walker[n]
item = load_gtzan_item(fileid, self._path, self._ext_audio)
......
......@@ -84,6 +84,6 @@ class LibriMix(Dataset):
Args:
key (int): The index of the sample to be loaded
Returns:
tuple: ``(sample_rate, mix_waveform, list_of_source_waveforms)``
(int, Tensor, List[Tensor]): ``(sample_rate, mix_waveform, list_of_source_waveforms)``
"""
return self._load_sample(self.files[key])
......@@ -133,7 +133,8 @@ class LIBRISPEECH(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
(Tensor, int, str, int, int, int):
``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
"""
fileid = self._walker[n]
return load_librispeech_item(fileid, self._path, self._ext_audio, self._ext_txt)
......
......@@ -134,8 +134,8 @@ class LIBRITTS(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, original_text, normalized_text, speaker_id,
chapter_id, utterance_id)``
(Tensor, int, str, str, str, int, int, str):
``(waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id)``
"""
fileid = self._walker[n]
return load_libritts_item(
......
......@@ -68,7 +68,8 @@ class LJSPEECH(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, transcript, normalized_transcript)``
(Tensor, int, str, str):
``(waveform, sample_rate, transcript, normalized_transcript)``
"""
line = self._flist[n]
fileid, transcript, normalized_transcript = line
......
......@@ -138,7 +138,8 @@ class SPEECHCOMMANDS(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, label, speaker_id, utterance_number)``
(Tensor, int, str, str, int):
``(waveform, sample_rate, label, speaker_id, utterance_number)``
"""
fileid = self._walker[n]
return load_speechcommands_item(fileid, self._path)
......
......@@ -127,7 +127,8 @@ class TEDLIUM(Dataset):
path (str): Dataset root path
Returns:
tuple: ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``
(Tensor, int, str, int, int, int):
``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``
"""
transcript_path = os.path.join(path, "stm", fileid)
with open(transcript_path + ".stm") as f:
......
......@@ -151,7 +151,7 @@ def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo
overwrite (bool, optional): overwrite existing files (Default: ``False``)
Returns:
list: List of paths to extracted files even if not overwritten.
List[str]: List of paths to extracted files even if not overwritten.
Examples:
>>> url = 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz'
......
......@@ -134,7 +134,8 @@ class VCTK_092(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, transcript, speaker_id, utterance_id)``
(Tensor, int, str, str, str):
``(waveform, sample_rate, transcript, speaker_id, utterance_id)``
"""
speaker_id, utterance_id = self._sample_ids[n]
return self._load_sample(speaker_id, utterance_id, self._mic_id)
......
......@@ -77,7 +77,7 @@ class YESNO(Dataset):
n (int): The index of the sample to be loaded
Returns:
tuple: ``(waveform, sample_rate, labels)``
(Tensor, int, List[int]): ``(waveform, sample_rate, labels)``
"""
fileid = self._walker[n]
item = self._load_item(fileid, self._path)
......
......@@ -663,7 +663,7 @@ def filtfilt(
Returns:
Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
are 2D Tensors, or `(..., time)` otherwise.
are 2D Tensors, or `(..., time)` otherwise.
"""
forward_filtered = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True)
backward_filtered = lfilter(
......@@ -987,7 +987,7 @@ def lfilter(
Returns:
Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
are 2D Tensors, or `(..., time)` otherwise.
are 2D Tensors, or `(..., time)` otherwise.
"""
assert a_coeffs.size() == b_coeffs.size()
assert a_coeffs.ndim <= 2
......@@ -1474,7 +1474,7 @@ def vad(
in the detector algorithm. (Default: 2000.0)
Returns:
Tensor: Tensor of audio of dimension (..., time).
Tensor: Tensor of audio of dimension `(..., time)`.
Reference:
- http://sox.sourceforge.net/sox.html
......
......@@ -263,7 +263,7 @@ def griffinlim(
rand_init (bool): Initializes phase randomly if True, to zero otherwise.
Returns:
torch.Tensor: waveform of `(..., time)`, where time equals the ``length`` parameter if given.
Tensor: waveform of `(..., time)`, where time equals the ``length`` parameter if given.
"""
assert momentum < 1, 'momentum={} > 1 can be unstable'.format(momentum)
assert momentum >= 0, 'momentum={} < 0'.format(momentum)
......@@ -1369,7 +1369,7 @@ def apply_codec(
For more details see :py:func:`torchaudio.backend.sox_io_backend.save`.
Returns:
torch.Tensor: Resulting Tensor.
Tensor: Resulting Tensor.
If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`.
"""
bytes = io.BytesIO()
......
......@@ -154,7 +154,7 @@ class MaskGenerator(torch.nn.Module):
input (torch.Tensor): 3D Tensor with shape [batch, features, frames]
Returns:
torch.Tensor: shape [batch, num_sources, features, frames]
Tensor: shape [batch, num_sources, features, frames]
"""
batch_size = input.shape[0]
feats = self.input_norm(input)
......@@ -264,7 +264,7 @@ class ConvTasNet(torch.nn.Module):
input (torch.Tensor): 3D Tensor with shape (batch_size, channels==1, frames)
Returns:
torch.Tensor: Padded Tensor
Tensor: Padded Tensor
int: Number of paddings performed
"""
batch_size, num_channels, num_frames = input.shape
......@@ -291,7 +291,7 @@ class ConvTasNet(torch.nn.Module):
input (torch.Tensor): 3D Tensor with shape [batch, channel==1, frames]
Returns:
torch.Tensor: 3D Tensor with shape [batch, channel==num_sources, frames]
Tensor: 3D Tensor with shape [batch, channel==num_sources, frames]
"""
if input.ndim != 3 or input.shape[1] != 1:
raise ValueError(
......
......@@ -1031,7 +1031,7 @@ class Tacotron2(nn.Module):
mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.
Returns:
Tensor, Tensor, Tensor, and Tensor:
[Tensor, Tensor, Tensor, Tensor]:
Tensor
Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
Tensor
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment