Commit 476ab9ab authored by moto, committed by Facebook GitHub Bot

Consolidate bibliography / reference (#2676)

Summary:
Preparation for the adoption of `autosummary`.

Replace `:footcite:` with `:cite:` and introduce a dedicated reference page, as `:footcite:` does not work well with `autosummary`.

Example:

https://output.circle-artifacts.com/output/job/4da47ba6-d9c7-418e-b5b0-e9f8a146a6c3/artifacts/0/docs/datasets.html#cmuarctic

https://output.circle-artifacts.com/output/job/4da47ba6-d9c7-418e-b5b0-e9f8a146a6c3/artifacts/0/docs/references.html
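
For context, a minimal sketch of the Sphinx setup this relies on is below. The `:cite:` role comes from `sphinxcontrib-bibtex`, and the dedicated references page collects all entries through a single `.. bibliography::` directive on `references.rst`; `:footcite:`, by contrast, needs a per-page `.. footbibliography::` block, which presumably is what interacts badly with `autosummary`-generated stub pages. The file names and option values here are illustrative assumptions, not taken from this commit:

```python
# Hypothetical docs/source/conf.py excerpt (a sketch, not the actual torchaudio config).
# sphinxcontrib-bibtex provides the :cite: role used throughout the docstrings below.
extensions = [
    "sphinx.ext.autosummary",
    "sphinxcontrib.bibtex",
]
# BibTeX database backing the :cite: keys (e.g. ljspeech17, baevski2020wav2vec).
bibtex_bibfiles = ["refs.bib"]
```

The dedicated page (e.g. `docs/source/references.rst`) would then contain only a heading and a `.. bibliography::` directive, so every `:cite:` key resolves to a single consolidated bibliography instead of per-page footnote lists.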

Pull Request resolved: https://github.com/pytorch/audio/pull/2676

Reviewed By: carolineechen

Differential Revision: D39509431

Pulled By: mthrok

fbshipit-source-id: e6003dd01ec3eff3d598054690f61de8ee31ac9a
parent 50c66721
......@@ -20,7 +20,7 @@ _RELEASE_CONFIGS = {
class LJSPEECH(Dataset):
"""Create a Dataset for *LJSpeech-1.1* [:footcite:`ljspeech17`].
"""Create a Dataset for *LJSpeech-1.1* :cite:`ljspeech17`.
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
......
......@@ -31,7 +31,7 @@ _VALIDATION_SET = [
class MUSDB_HQ(Dataset):
"""Create *MUSDB_HQ* [:footcite:`MUSDB18HQ`] Dataset
"""Create *MUSDB_HQ* :cite:`MUSDB18HQ` Dataset
Args:
root (str or Path): Root directory where the dataset's top level directory is found
......
......@@ -23,7 +23,7 @@ _LANGUAGES = [
class QUESST14(Dataset):
"""Create *QUESST14* [:footcite:`Mir2015QUESST2014EQ`] Dataset
"""Create *QUESST14* :cite:`Mir2015QUESST2014EQ` Dataset
Args:
root (str or Path): Root directory where the dataset's top level directory is found
......
......@@ -49,7 +49,7 @@ def load_speechcommands_item(filepath: str, path: str) -> Tuple[Tensor, int, str
class SPEECHCOMMANDS(Dataset):
"""Create a Dataset for *Speech Commands* [:footcite:`speechcommandsv2`].
"""Create a Dataset for *Speech Commands* :cite:`speechcommandsv2`.
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
......
......@@ -42,7 +42,7 @@ _RELEASE_CONFIGS = {
class TEDLIUM(Dataset):
"""
Create a Dataset for *Tedlium* [:footcite:`rousseau2012tedlium`]. It supports releases 1,2 and 3.
Create a Dataset for *Tedlium* :cite:`rousseau2012tedlium`. It supports releases 1,2 and 3.
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
......
......@@ -17,7 +17,7 @@ SampleType = Tuple[Tensor, int, str, str, str]
class VCTK_092(Dataset):
"""Create *VCTK 0.92* [:footcite:`yamagishi2019vctk`] Dataset
"""Create *VCTK 0.92* :cite:`yamagishi2019vctk` Dataset
Args:
root (str): Root directory where the dataset's top level directory is found.
......
......@@ -90,7 +90,7 @@ def _get_file_id(file_path: str, _ext_audio: str):
class VoxCeleb1(Dataset):
"""Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset.
"""Create *VoxCeleb1* :cite:`nagrani2017voxceleb` Dataset.
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
......@@ -119,7 +119,7 @@ class VoxCeleb1(Dataset):
class VoxCeleb1Identification(VoxCeleb1):
"""Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset for speaker identification task.
"""Create *VoxCeleb1* :cite:`nagrani2017voxceleb` Dataset for speaker identification task.
Each data sample contains the waveform, sample rate, speaker id, and the file id.
Args:
......@@ -167,7 +167,7 @@ class VoxCeleb1Identification(VoxCeleb1):
class VoxCeleb1Verification(VoxCeleb1):
"""Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset for speaker verification task.
"""Create *VoxCeleb1* :cite:`nagrani2017voxceleb` Dataset for speaker verification task.
Each data sample contains a pair of waveforms, sample rate, the label indicating if they are
from the same speaker, and the file ids.
......
......@@ -19,7 +19,7 @@ _RELEASE_CONFIGS = {
class YESNO(Dataset):
"""Create a Dataset for *YesNo* [:footcite:`YesNo`].
"""Create a Dataset for *YesNo* :cite:`YesNo`.
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
......
......@@ -269,8 +269,8 @@ def griffinlim(
.. properties:: Autograd TorchScript
Implementation ported from
*librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
*librosa* :cite:`brian_mcfee-proc-scipy-2015`, *A fast Griffin-Lim algorithm* :cite:`6701851`
and *Signal estimation from modified short-time Fourier transform* :cite:`1172092`.
Args:
specgram (Tensor): A magnitude-only STFT spectrogram of dimension `(..., freq, frames)`
......@@ -1332,7 +1332,7 @@ def compute_kaldi_pitch(
snip_edges: bool = True,
) -> torch.Tensor:
"""Extract pitch based on method described in *A pitch extraction algorithm tuned
for automatic speech recognition* [:footcite:`6854049`].
for automatic speech recognition* :cite:`6854049`.
.. devices:: CPU
......@@ -1552,7 +1552,7 @@ def resample(
resampling_method: str = "sinc_interpolation",
beta: Optional[float] = None,
) -> Tensor:
r"""Resamples the waveform at the new frequency using bandlimited interpolation. [:footcite:`RESAMPLE`].
r"""Resamples the waveform at the new frequency using bandlimited interpolation. :cite:`RESAMPLE`.
.. devices:: CPU CUDA
......@@ -1840,7 +1840,7 @@ def rnnt_loss(
reduction: str = "mean",
):
"""Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
[:footcite:`graves2012sequence`].
:cite:`graves2012sequence`.
.. devices:: CPU CUDA
......@@ -2009,8 +2009,8 @@ def mvdr_weights_souden(
diag_eps: float = 1e-7,
eps: float = 1e-8,
) -> Tensor:
r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights
by the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`].
r"""Compute the Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) beamforming weights
by the method proposed by *Souden et, al.* :cite:`souden2009optimal`.
.. devices:: CPU CUDA
......@@ -2072,7 +2072,7 @@ def mvdr_weights_rtf(
diag_eps: float = 1e-7,
eps: float = 1e-8,
) -> Tensor:
r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights
r"""Compute the Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) beamforming weights
based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise.
.. devices:: CPU CUDA
......
......@@ -300,7 +300,7 @@ class _HDecLayer(torch.nn.Module):
class HDemucs(torch.nn.Module):
r"""
Hybrid Demucs model from *Hybrid Spectrogram and Waveform Source Separation* [:footcite:`defossez2021hybrid`].
Hybrid Demucs model from *Hybrid Spectrogram and Waveform Source Separation* :cite:`defossez2021hybrid`.
Args:
sources (List[str]): list of source names. List can contain the following source
......
......@@ -215,7 +215,7 @@ class ConformerLayer(torch.nn.Module):
class Conformer(torch.nn.Module):
r"""Implements the Conformer architecture introduced in
*Conformer: Convolution-augmented Transformer for Speech Recognition*
[:footcite:`gulati2020conformer`].
:cite:`gulati2020conformer`.
Args:
input_dim (int): input dimension.
......
......@@ -162,7 +162,7 @@ class MaskGenerator(torch.nn.Module):
class ConvTasNet(torch.nn.Module):
"""Conv-TasNet: a fully-convolutional time-domain audio separation network
*Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
[:footcite:`Luo_2019`].
:cite:`Luo_2019`.
Args:
num_sources (int, optional): The number of sources to split.
......@@ -304,7 +304,7 @@ class ConvTasNet(torch.nn.Module):
def conv_tasnet_base(num_sources: int = 2) -> ConvTasNet:
r"""Builds the non-causal version of ConvTasNet in
*Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
[:footcite:`Luo_2019`].
:cite:`Luo_2019`.
The parameter settings follow the ones with the highest Si-SNR metirc score in the paper,
except the mask activation function is changed from "sigmoid" to "relu" for performance improvement.
......
......@@ -197,7 +197,7 @@ class CTCDecoder:
"""
.. devices:: CPU
CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
CTC beam search decoder from *Flashlight* :cite:`kahn2022flashlight`.
Note:
To build the decoder, please use the factory function :py:func:`ctc_decoder`.
......@@ -349,7 +349,7 @@ def ctc_decoder(
unk_word: str = "<unk>",
) -> CTCDecoder:
"""
Builds CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
Builds CTC beam search decoder from *Flashlight* :cite:`kahn2022flashlight`.
Args:
lexicon (str or None): lexicon file containing the possible words and corresponding spellings.
......
......@@ -28,7 +28,7 @@ class FullyConnected(torch.nn.Module):
class DeepSpeech(torch.nn.Module):
"""
DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition*
[:footcite:`hannun2014deep`].
:cite:`hannun2014deep`.
Args:
n_feature: Number of input features
......
......@@ -806,7 +806,7 @@ class _EmformerImpl(torch.nn.Module):
class Emformer(_EmformerImpl):
r"""Implements the Emformer architecture introduced in
*Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
[:footcite:`shi2021emformer`].
:cite:`shi2021emformer`.
Args:
input_dim (int): input dimension.
......
......@@ -872,7 +872,7 @@ class Tacotron2(nn.Module):
The original implementation was introduced in
*Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions*
[:footcite:`shen2018natural`].
:cite:`shen2018natural`.
Args:
mask_padding (bool, optional): Use mask padding (Default: ``False``).
......
......@@ -7,7 +7,7 @@ __all__ = [
class Wav2Letter(nn.Module):
r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
Recognition System* [:footcite:`collobert2016wav2letter`].
Recognition System* :cite:`collobert2016wav2letter`.
:math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
......
......@@ -10,7 +10,7 @@ from . import components
class Wav2Vec2Model(Module):
"""torchaudio.models.Wav2Vec2Model(feature_extractor: torch.nn.Module, encoder: torch.nn.Module, aux: Optional[torch.nn.Module] = None)
Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
Encoder model used in *wav2vec 2.0* :cite:`baevski2020wav2vec`.
Note:
To build the model, please use one of the factory functions.
......@@ -244,7 +244,7 @@ def wav2vec2_model(
`ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__
in the original ``fairseq`` implementation.
This is referred as "(convolutional) feature encoder" in the *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`] paper.
:cite:`baevski2020wav2vec` paper.
The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__,
and this is referred as "Transformer" in the paper.
......@@ -393,7 +393,7 @@ def wav2vec2_base(
encoder_layer_drop: float = 0.1,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
"""Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* :cite:`baevski2020wav2vec`
Args:
encoder_projection_dropout (float):
......@@ -441,7 +441,7 @@ def wav2vec2_large(
encoder_layer_drop: float = 0.1,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
"""Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* :cite:`baevski2020wav2vec`
Args:
encoder_projection_dropout (float):
......@@ -489,7 +489,7 @@ def wav2vec2_large_lv60k(
encoder_layer_drop: float = 0.1,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
"""Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* :cite:`baevski2020wav2vec`
Args:
encoder_projection_dropout (float):
......@@ -537,7 +537,7 @@ def hubert_base(
encoder_layer_drop: float = 0.05,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build HuBERT model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
"""Build HuBERT model with "base" architecture from *HuBERT* :cite:`hsu2021hubert`
Args:
encoder_projection_dropout (float):
......@@ -585,7 +585,7 @@ def hubert_large(
encoder_layer_drop: float = 0.0,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build HuBERT model with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
"""Build HuBERT model with "large" architecture from *HuBERT* :cite:`hsu2021hubert`
Args:
encoder_projection_dropout (float):
......@@ -633,7 +633,7 @@ def hubert_xlarge(
encoder_layer_drop: float = 0.0,
aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
"""Build HuBERT model with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
"""Build HuBERT model with "extra large" architecture from *HuBERT* :cite:`hsu2021hubert`
Args:
encoder_projection_dropout (float):
......@@ -714,7 +714,7 @@ def hubert_pretrain_model(
`ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__
in the original ``fairseq`` implementation.
This is referred as "(convolutional) feature encoder" in the *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`] paper.
:cite:`baevski2020wav2vec` paper.
The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__,
and this is referred as "Transformer" in the paper.
......@@ -975,7 +975,7 @@ def hubert_pretrain_base(
feature_grad_mult: Optional[float] = 0.1,
num_classes: int = 100,
) -> HuBERTPretrainModel:
"""Build HuBERTPretrainModel model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
"""Build HuBERTPretrainModel model with "base" architecture from *HuBERT* :cite:`hsu2021hubert`
Args:
encoder_projection_dropout (float):
......@@ -1050,7 +1050,7 @@ def hubert_pretrain_large(
mask_channel_length: int = 10,
feature_grad_mult: Optional[float] = None,
) -> HuBERTPretrainModel:
"""Build HuBERTPretrainModel model for pre-training with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
"""Build HuBERTPretrainModel model for pre-training with "large" architecture from *HuBERT* :cite:`hsu2021hubert`
Args:
encoder_projection_dropout (float):
......@@ -1123,7 +1123,7 @@ def hubert_pretrain_xlarge(
mask_channel_length: int = 10,
feature_grad_mult: Optional[float] = None,
) -> HuBERTPretrainModel:
"""Build HuBERTPretrainModel model for pre-training with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
"""Build HuBERTPretrainModel model for pre-training with "extra large" architecture from *HuBERT* :cite:`hsu2021hubert`
Args:
encoder_projection_dropout (float):
......
......@@ -15,7 +15,7 @@ __all__ = [
class ResBlock(nn.Module):
r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`].
r"""ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.
Args:
n_freq: the number of bins in a spectrogram. (Default: ``128``)
......@@ -200,7 +200,7 @@ class WaveRNN(nn.Module):
r"""WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.
The original implementation was introduced in *Efficient Neural Audio Synthesis*
[:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1.
:cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
The product of `upsample_scales` must equal `hop_length`.
Args:
......
......@@ -66,8 +66,8 @@ CONVTASNET_BASE_LIBRI2MIX = SourceSeparationBundle(
_model_factory_func=partial(conv_tasnet_base, num_sources=2),
_sample_rate=8000,
)
CONVTASNET_BASE_LIBRI2MIX.__doc__ = """Pre-trained Source Separation pipeline with *ConvTasNet* [:footcite:`Luo_2019`] trained on
*Libri2Mix dataset* [:footcite:`cosentino2020librimix`].
CONVTASNET_BASE_LIBRI2MIX.__doc__ = """Pre-trained Source Separation pipeline with *ConvTasNet* :cite:`Luo_2019` trained on
*Libri2Mix dataset* :cite:`cosentino2020librimix`.
The source separation model is constructed by :py:func:`torchaudio.models.conv_tasnet_base`
and is trained using the training script ``lightning_train.py``
......@@ -83,8 +83,8 @@ HDEMUCS_HIGH_MUSDB_PLUS = SourceSeparationBundle(
_model_factory_func=partial(hdemucs_high, sources=["drums", "bass", "other", "vocals"]),
_sample_rate=44100,
)
HDEMUCS_HIGH_MUSDB_PLUS.__doc__ = """Pre-trained *Hybrid Demucs* [:footcite:`defossez2021hybrid`] pipeline for music
source separation trained on MUSDB-HQ [:footcite:`MUSDB18HQ`] and additional internal training data.
HDEMUCS_HIGH_MUSDB_PLUS.__doc__ = """Pre-trained *Hybrid Demucs* :cite:`defossez2021hybrid` pipeline for music
source separation trained on MUSDB-HQ :cite:`MUSDB18HQ` and additional internal training data.
The model is constructed by :py:func:`torchaudio.prototype.models.hdemucs_high`.
Training was performed in the original HDemucs repository `here <https://github.com/facebookresearch/demucs/>`__.
......@@ -98,8 +98,8 @@ HDEMUCS_HIGH_MUSDB = SourceSeparationBundle(
_model_factory_func=partial(hdemucs_high, sources=["drums", "bass", "other", "vocals"]),
_sample_rate=44100,
)
HDEMUCS_HIGH_MUSDB.__doc__ = """Pre-trained *Hybrid Demucs* [:footcite:`defossez2021hybrid`] pipeline for music
source separation trained on MUSDB-HQ [:footcite:`MUSDB18HQ`].
HDEMUCS_HIGH_MUSDB.__doc__ = """Pre-trained *Hybrid Demucs* :cite:`defossez2021hybrid` pipeline for music
source separation trained on MUSDB-HQ :cite:`MUSDB18HQ`.
The model is constructed by :py:func:`torchaudio.prototype.models.hdemucs_high`.
Training was performed in the original HDemucs repository `here <https://github.com/facebookresearch/demucs/>`__.
......