Commit 476ab9ab authored by moto, committed by Facebook GitHub Bot

Consolidate bibliography / reference (#2676)

Summary:
Preparation for the adoption of `autosummary`.

Replace `:footcite:` with `:cite:` and introduce a dedicated reference page, as `:footcite:` does not work well with `autosummary`.

Example:

https://output.circle-artifacts.com/output/job/4da47ba6-d9c7-418e-b5b0-e9f8a146a6c3/artifacts/0/docs/datasets.html#cmuarctic

https://output.circle-artifacts.com/output/job/4da47ba6-d9c7-418e-b5b0-e9f8a146a6c3/artifacts/0/docs/references.html
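
For context, both `:footcite:` and `:cite:` are provided by the sphinxcontrib-bibtex extension; the difference is that `:cite:` resolves every reference against one shared bibliography page (the references.html artifact linked above) instead of emitting per-page footnotes. A minimal sketch of such a page, assuming a hypothetical `docs/source/references.rst` backed by a `refs.bib` registered through `bibtex_bibfiles` in `conf.py` (the actual file names and options in this PR may differ):

    References
    ==========

    .. bibliography::

Docstrings then cite entries with the role directly, e.g. :cite:`ljspeech17`, which renders as a numbered citation linking into that page.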

Pull Request resolved: https://github.com/pytorch/audio/pull/2676

Reviewed By: carolineechen

Differential Revision: D39509431

Pulled By: mthrok

fbshipit-source-id: e6003dd01ec3eff3d598054690f61de8ee31ac9a
parent 50c66721
@@ -20,7 +20,7 @@ _RELEASE_CONFIGS = {
 class LJSPEECH(Dataset):
-"""Create a Dataset for *LJSpeech-1.1* [:footcite:`ljspeech17`].
+"""Create a Dataset for *LJSpeech-1.1* :cite:`ljspeech17`.
 Args:
 root (str or Path): Path to the directory where the dataset is found or downloaded.
...
@@ -31,7 +31,7 @@ _VALIDATION_SET = [
 class MUSDB_HQ(Dataset):
-"""Create *MUSDB_HQ* [:footcite:`MUSDB18HQ`] Dataset
+"""Create *MUSDB_HQ* :cite:`MUSDB18HQ` Dataset
 Args:
 root (str or Path): Root directory where the dataset's top level directory is found
...
@@ -23,7 +23,7 @@ _LANGUAGES = [
 class QUESST14(Dataset):
-"""Create *QUESST14* [:footcite:`Mir2015QUESST2014EQ`] Dataset
+"""Create *QUESST14* :cite:`Mir2015QUESST2014EQ` Dataset
 Args:
 root (str or Path): Root directory where the dataset's top level directory is found
...
@@ -49,7 +49,7 @@ def load_speechcommands_item(filepath: str, path: str) -> Tuple[Tensor, int, str
 class SPEECHCOMMANDS(Dataset):
-"""Create a Dataset for *Speech Commands* [:footcite:`speechcommandsv2`].
+"""Create a Dataset for *Speech Commands* :cite:`speechcommandsv2`.
 Args:
 root (str or Path): Path to the directory where the dataset is found or downloaded.
...
@@ -42,7 +42,7 @@ _RELEASE_CONFIGS = {
 class TEDLIUM(Dataset):
 """
-Create a Dataset for *Tedlium* [:footcite:`rousseau2012tedlium`]. It supports releases 1,2 and 3.
+Create a Dataset for *Tedlium* :cite:`rousseau2012tedlium`. It supports releases 1,2 and 3.
 Args:
 root (str or Path): Path to the directory where the dataset is found or downloaded.
...
@@ -17,7 +17,7 @@ SampleType = Tuple[Tensor, int, str, str, str]
 class VCTK_092(Dataset):
-"""Create *VCTK 0.92* [:footcite:`yamagishi2019vctk`] Dataset
+"""Create *VCTK 0.92* :cite:`yamagishi2019vctk` Dataset
 Args:
 root (str): Root directory where the dataset's top level directory is found.
...
@@ -90,7 +90,7 @@ def _get_file_id(file_path: str, _ext_audio: str):
 class VoxCeleb1(Dataset):
-"""Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset.
+"""Create *VoxCeleb1* :cite:`nagrani2017voxceleb` Dataset.
 Args:
 root (str or Path): Path to the directory where the dataset is found or downloaded.
@@ -119,7 +119,7 @@ class VoxCeleb1(Dataset):
 class VoxCeleb1Identification(VoxCeleb1):
-"""Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset for speaker identification task.
+"""Create *VoxCeleb1* :cite:`nagrani2017voxceleb` Dataset for speaker identification task.
 Each data sample contains the waveform, sample rate, speaker id, and the file id.
 Args:
@@ -167,7 +167,7 @@ class VoxCeleb1Identification(VoxCeleb1):
 class VoxCeleb1Verification(VoxCeleb1):
-"""Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset for speaker verification task.
+"""Create *VoxCeleb1* :cite:`nagrani2017voxceleb` Dataset for speaker verification task.
 Each data sample contains a pair of waveforms, sample rate, the label indicating if they are
 from the same speaker, and the file ids.
...
@@ -19,7 +19,7 @@ _RELEASE_CONFIGS = {
 class YESNO(Dataset):
-"""Create a Dataset for *YesNo* [:footcite:`YesNo`].
+"""Create a Dataset for *YesNo* :cite:`YesNo`.
 Args:
 root (str or Path): Path to the directory where the dataset is found or downloaded.
...
@@ -269,8 +269,8 @@ def griffinlim(
 .. properties:: Autograd TorchScript
 Implementation ported from
-*librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
-and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
+*librosa* :cite:`brian_mcfee-proc-scipy-2015`, *A fast Griffin-Lim algorithm* :cite:`6701851`
+and *Signal estimation from modified short-time Fourier transform* :cite:`1172092`.
 Args:
 specgram (Tensor): A magnitude-only STFT spectrogram of dimension `(..., freq, frames)`
@@ -1332,7 +1332,7 @@ def compute_kaldi_pitch(
 snip_edges: bool = True,
 ) -> torch.Tensor:
 """Extract pitch based on method described in *A pitch extraction algorithm tuned
-for automatic speech recognition* [:footcite:`6854049`].
+for automatic speech recognition* :cite:`6854049`.
 .. devices:: CPU
@@ -1552,7 +1552,7 @@ def resample(
 resampling_method: str = "sinc_interpolation",
 beta: Optional[float] = None,
 ) -> Tensor:
-r"""Resamples the waveform at the new frequency using bandlimited interpolation. [:footcite:`RESAMPLE`].
+r"""Resamples the waveform at the new frequency using bandlimited interpolation. :cite:`RESAMPLE`.
 .. devices:: CPU CUDA
@@ -1840,7 +1840,7 @@ def rnnt_loss(
 reduction: str = "mean",
 ):
 """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
-[:footcite:`graves2012sequence`].
+:cite:`graves2012sequence`.
 .. devices:: CPU CUDA
@@ -2009,8 +2009,8 @@ def mvdr_weights_souden(
 diag_eps: float = 1e-7,
 eps: float = 1e-8,
 ) -> Tensor:
-r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights
-by the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`].
+r"""Compute the Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) beamforming weights
+by the method proposed by *Souden et, al.* :cite:`souden2009optimal`.
 .. devices:: CPU CUDA
@@ -2072,7 +2072,7 @@ def mvdr_weights_rtf(
 diag_eps: float = 1e-7,
 eps: float = 1e-8,
 ) -> Tensor:
-r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights
+r"""Compute the Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) beamforming weights
 based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise.
 .. devices:: CPU CUDA
...
@@ -300,7 +300,7 @@ class _HDecLayer(torch.nn.Module):
 class HDemucs(torch.nn.Module):
 r"""
-Hybrid Demucs model from *Hybrid Spectrogram and Waveform Source Separation* [:footcite:`defossez2021hybrid`].
+Hybrid Demucs model from *Hybrid Spectrogram and Waveform Source Separation* :cite:`defossez2021hybrid`.
 Args:
 sources (List[str]): list of source names. List can contain the following source
...
@@ -215,7 +215,7 @@ class ConformerLayer(torch.nn.Module):
 class Conformer(torch.nn.Module):
 r"""Implements the Conformer architecture introduced in
 *Conformer: Convolution-augmented Transformer for Speech Recognition*
-[:footcite:`gulati2020conformer`].
+:cite:`gulati2020conformer`.
 Args:
 input_dim (int): input dimension.
...
@@ -162,7 +162,7 @@ class MaskGenerator(torch.nn.Module):
 class ConvTasNet(torch.nn.Module):
 """Conv-TasNet: a fully-convolutional time-domain audio separation network
 *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
-[:footcite:`Luo_2019`].
+:cite:`Luo_2019`.
 Args:
 num_sources (int, optional): The number of sources to split.
@@ -304,7 +304,7 @@ class ConvTasNet(torch.nn.Module):
 def conv_tasnet_base(num_sources: int = 2) -> ConvTasNet:
 r"""Builds the non-causal version of ConvTasNet in
 *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
-[:footcite:`Luo_2019`].
+:cite:`Luo_2019`.
 The parameter settings follow the ones with the highest Si-SNR metirc score in the paper,
 except the mask activation function is changed from "sigmoid" to "relu" for performance improvement.
...
@@ -197,7 +197,7 @@ class CTCDecoder:
 """
 .. devices:: CPU
-CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
+CTC beam search decoder from *Flashlight* :cite:`kahn2022flashlight`.
 Note:
 To build the decoder, please use the factory function :py:func:`ctc_decoder`.
@@ -349,7 +349,7 @@ def ctc_decoder(
 unk_word: str = "<unk>",
 ) -> CTCDecoder:
 """
-Builds CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
+Builds CTC beam search decoder from *Flashlight* :cite:`kahn2022flashlight`.
 Args:
 lexicon (str or None): lexicon file containing the possible words and corresponding spellings.
...
@@ -28,7 +28,7 @@ class FullyConnected(torch.nn.Module):
 class DeepSpeech(torch.nn.Module):
 """
 DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition*
-[:footcite:`hannun2014deep`].
+:cite:`hannun2014deep`.
 Args:
 n_feature: Number of input features
...
@@ -806,7 +806,7 @@ class _EmformerImpl(torch.nn.Module):
 class Emformer(_EmformerImpl):
 r"""Implements the Emformer architecture introduced in
 *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
-[:footcite:`shi2021emformer`].
+:cite:`shi2021emformer`.
 Args:
 input_dim (int): input dimension.
...
@@ -872,7 +872,7 @@ class Tacotron2(nn.Module):
 The original implementation was introduced in
 *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions*
-[:footcite:`shen2018natural`].
+:cite:`shen2018natural`.
 Args:
 mask_padding (bool, optional): Use mask padding (Default: ``False``).
...
@@ -7,7 +7,7 @@ __all__ = [
 class Wav2Letter(nn.Module):
 r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
-Recognition System* [:footcite:`collobert2016wav2letter`].
+Recognition System* :cite:`collobert2016wav2letter`.
 :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
...
@@ -10,7 +10,7 @@ from . import components
 class Wav2Vec2Model(Module):
 """torchaudio.models.Wav2Vec2Model(feature_extractor: torch.nn.Module, encoder: torch.nn.Module, aux: Optional[torch.nn.Module] = None)
-Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
+Encoder model used in *wav2vec 2.0* :cite:`baevski2020wav2vec`.
 Note:
 To build the model, please use one of the factory functions.
@@ -244,7 +244,7 @@ def wav2vec2_model(
 `ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__
 in the original ``fairseq`` implementation.
 This is referred as "(convolutional) feature encoder" in the *wav2vec 2.0*
-[:footcite:`baevski2020wav2vec`] paper.
+:cite:`baevski2020wav2vec` paper.
 The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__,
 and this is referred as "Transformer" in the paper.
@@ -393,7 +393,7 @@ def wav2vec2_base(
 encoder_layer_drop: float = 0.1,
 aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-"""Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+"""Build Wav2Vec2Model with "base" architecture from *wav2vec 2.0* :cite:`baevski2020wav2vec`
 Args:
 encoder_projection_dropout (float):
@@ -441,7 +441,7 @@ def wav2vec2_large(
 encoder_layer_drop: float = 0.1,
 aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-"""Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+"""Build Wav2Vec2Model with "large" architecture from *wav2vec 2.0* :cite:`baevski2020wav2vec`
 Args:
 encoder_projection_dropout (float):
@@ -489,7 +489,7 @@ def wav2vec2_large_lv60k(
 encoder_layer_drop: float = 0.1,
 aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-"""Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`]
+"""Build Wav2Vec2Model with "large lv-60k" architecture from *wav2vec 2.0* :cite:`baevski2020wav2vec`
 Args:
 encoder_projection_dropout (float):
@@ -537,7 +537,7 @@ def hubert_base(
 encoder_layer_drop: float = 0.05,
 aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-"""Build HuBERT model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+"""Build HuBERT model with "base" architecture from *HuBERT* :cite:`hsu2021hubert`
 Args:
 encoder_projection_dropout (float):
@@ -585,7 +585,7 @@ def hubert_large(
 encoder_layer_drop: float = 0.0,
 aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-"""Build HuBERT model with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+"""Build HuBERT model with "large" architecture from *HuBERT* :cite:`hsu2021hubert`
 Args:
 encoder_projection_dropout (float):
@@ -633,7 +633,7 @@ def hubert_xlarge(
 encoder_layer_drop: float = 0.0,
 aux_num_out: Optional[int] = None,
 ) -> Wav2Vec2Model:
-"""Build HuBERT model with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+"""Build HuBERT model with "extra large" architecture from *HuBERT* :cite:`hsu2021hubert`
 Args:
 encoder_projection_dropout (float):
@@ -714,7 +714,7 @@ def hubert_pretrain_model(
 `ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__
 in the original ``fairseq`` implementation.
 This is referred as "(convolutional) feature encoder" in the *wav2vec 2.0*
-[:footcite:`baevski2020wav2vec`] paper.
+:cite:`baevski2020wav2vec` paper.
 The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__,
 and this is referred as "Transformer" in the paper.
@@ -975,7 +975,7 @@ def hubert_pretrain_base(
 feature_grad_mult: Optional[float] = 0.1,
 num_classes: int = 100,
 ) -> HuBERTPretrainModel:
-"""Build HuBERTPretrainModel model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+"""Build HuBERTPretrainModel model with "base" architecture from *HuBERT* :cite:`hsu2021hubert`
 Args:
 encoder_projection_dropout (float):
@@ -1050,7 +1050,7 @@ def hubert_pretrain_large(
 mask_channel_length: int = 10,
 feature_grad_mult: Optional[float] = None,
 ) -> HuBERTPretrainModel:
-"""Build HuBERTPretrainModel model for pre-training with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+"""Build HuBERTPretrainModel model for pre-training with "large" architecture from *HuBERT* :cite:`hsu2021hubert`
 Args:
 encoder_projection_dropout (float):
@@ -1123,7 +1123,7 @@ def hubert_pretrain_xlarge(
 mask_channel_length: int = 10,
 feature_grad_mult: Optional[float] = None,
 ) -> HuBERTPretrainModel:
-"""Build HuBERTPretrainModel model for pre-training with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
+"""Build HuBERTPretrainModel model for pre-training with "extra large" architecture from *HuBERT* :cite:`hsu2021hubert`
 Args:
 encoder_projection_dropout (float):
...
@@ -15,7 +15,7 @@ __all__ = [
 class ResBlock(nn.Module):
-r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`].
+r"""ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.
 Args:
 n_freq: the number of bins in a spectrogram. (Default: ``128``)
@@ -200,7 +200,7 @@ class WaveRNN(nn.Module):
 r"""WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.
 The original implementation was introduced in *Efficient Neural Audio Synthesis*
-[:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1.
+:cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
 The product of `upsample_scales` must equal `hop_length`.
 Args:
...
@@ -66,8 +66,8 @@ CONVTASNET_BASE_LIBRI2MIX = SourceSeparationBundle(
 _model_factory_func=partial(conv_tasnet_base, num_sources=2),
 _sample_rate=8000,
 )
-CONVTASNET_BASE_LIBRI2MIX.__doc__ = """Pre-trained Source Separation pipeline with *ConvTasNet* [:footcite:`Luo_2019`] trained on
-*Libri2Mix dataset* [:footcite:`cosentino2020librimix`].
+CONVTASNET_BASE_LIBRI2MIX.__doc__ = """Pre-trained Source Separation pipeline with *ConvTasNet* :cite:`Luo_2019` trained on
+*Libri2Mix dataset* :cite:`cosentino2020librimix`.
 The source separation model is constructed by :py:func:`torchaudio.models.conv_tasnet_base`
 and is trained using the training script ``lightning_train.py``
@@ -83,8 +83,8 @@ HDEMUCS_HIGH_MUSDB_PLUS = SourceSeparationBundle(
 _model_factory_func=partial(hdemucs_high, sources=["drums", "bass", "other", "vocals"]),
 _sample_rate=44100,
 )
-HDEMUCS_HIGH_MUSDB_PLUS.__doc__ = """Pre-trained *Hybrid Demucs* [:footcite:`defossez2021hybrid`] pipeline for music
-source separation trained on MUSDB-HQ [:footcite:`MUSDB18HQ`] and additional internal training data.
+HDEMUCS_HIGH_MUSDB_PLUS.__doc__ = """Pre-trained *Hybrid Demucs* :cite:`defossez2021hybrid` pipeline for music
+source separation trained on MUSDB-HQ :cite:`MUSDB18HQ` and additional internal training data.
 The model is constructed by :py:func:`torchaudio.prototype.models.hdemucs_high`.
 Training was performed in the original HDemucs repository `here <https://github.com/facebookresearch/demucs/>`__.
@@ -98,8 +98,8 @@ HDEMUCS_HIGH_MUSDB = SourceSeparationBundle(
 _model_factory_func=partial(hdemucs_high, sources=["drums", "bass", "other", "vocals"]),
 _sample_rate=44100,
 )
-HDEMUCS_HIGH_MUSDB.__doc__ = """Pre-trained *Hybrid Demucs* [:footcite:`defossez2021hybrid`] pipeline for music
-source separation trained on MUSDB-HQ [:footcite:`MUSDB18HQ`].
+HDEMUCS_HIGH_MUSDB.__doc__ = """Pre-trained *Hybrid Demucs* :cite:`defossez2021hybrid` pipeline for music
+source separation trained on MUSDB-HQ :cite:`MUSDB18HQ`.
 The model is constructed by :py:func:`torchaudio.prototype.models.hdemucs_high`.
 Training was performed in the original HDemucs repository `here <https://github.com/facebookresearch/demucs/>`__.
...