Unverified Commit 0166a851 authored by moto's avatar moto Committed by GitHub

Update docs (#1550)

* Use `bibtex` for paper citations.
  * Add `override.css` to fix back references.
  * wav2vec2
  * wav2letter
  * convtasnet
  * deepspeech
  * rnnt-loss
  * griffinlim
* Fix broken references in `filtering`.
* Fix note in soundfile backends.
* Tweak wav2vec2 example.
* Remove unused `pytorch_theme.css`
parent a87b33db
sphinx==2.4.4
-e git+git://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinxcontrib.katex
sphinxcontrib.bibtex
matplotlib
/* Fix for bibtex reference */
dl.footnote.brackets > dt.label > span.brackets > a.fn-backref {
position: inherit
}
/* Fix for bibtex back reference */
dl.footnote.brackets > dt.label > span.fn-backref > a {
position: inherit
}
body {
font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
/* Default header fonts are ugly */
h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
/* Use white for docs background */
.wy-side-nav-search {
background-color: #fff;
}
.wy-nav-content-wrap, .wy-menu li.current > a {
background-color: #fff;
}
@media screen and (min-width: 1400px) {
.wy-nav-content-wrap {
background-color: rgba(0, 0, 0, 0.0470588);
}
.wy-nav-content {
background-color: #fff;
}
}
/* Fixes for mobile */
.wy-nav-top {
background-color: #fff;
background-image: url('../img/pytorch-logo-dark.svg');
background-repeat: no-repeat;
background-position: center;
padding: 0;
margin: 0.4045em 0.809em;
color: #333;
}
.wy-nav-top > a {
display: none;
}
@media screen and (max-width: 768px) {
.wy-side-nav-search>a img.logo {
height: 60px;
}
}
/* This is needed to ensure that logo above search scales properly */
.wy-side-nav-search a {
display: block;
}
/* This ensures that multiple constructors will remain in separate lines. */
.rst-content dl:not(.docutils) dt {
display: table;
}
/* Use our red for literals (it's very similar to the original color) */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
color: #F05732;
}
.rst-content tt.xref, a .rst-content tt, .rst-content tt.xref,
.rst-content code.xref, a .rst-content tt, a .rst-content code {
color: #404040;
}
/* Change link colors (except for the menu) */
a {
color: #F05732;
}
a:hover {
color: #F05732;
}
a:visited {
color: #D44D2C;
}
.wy-menu a {
color: #b3b3b3;
}
.wy-menu a:hover {
color: #b3b3b3;
}
/* Default footer text is quite big */
footer {
font-size: 80%;
}
footer .rst-footer-buttons {
font-size: 125%; /* revert footer settings - 1/80% = 125% */
}
footer p {
font-size: 100%;
}
/* For hidden headers that appear in TOC tree */
/* see http://stackoverflow.com/a/32363545/3343043 */
.rst-content .hidden-section {
display: none;
}
nav .hidden-section {
display: inherit;
}
.wy-side-nav-search>div.version {
color: #000;
}
......@@ -41,6 +41,7 @@ extensions = [
'sphinx.ext.napoleon',
'sphinx.ext.viewcode',
'sphinxcontrib.katex',
'sphinxcontrib.bibtex',
]
# katex options
......@@ -55,6 +56,8 @@ delimiters : [
]
'''
bibtex_bibfiles = ['refs.bib']
napoleon_use_ivar = True
napoleon_numpy_docstring = False
napoleon_google_docstring = True
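For orientation, a minimal sketch of how the new ``sphinxcontrib.bibtex`` setup is consumed; the citation key comes from the ``refs.bib`` entries added further down in this diff, and the function itself is purely illustrative:
def example_function():
    """Illustrative docstring citing a refs.bib entry via sphinxcontrib.bibtex.
    The model is described in :footcite:`collobert2016wav2letter`; a
    ``.. footbibliography::`` directive at the bottom of the hosting .rst page
    renders the collected footnote citations as a References section.
    """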
......@@ -133,22 +136,10 @@ html_logo = '_static/img/pytorch-logo-dark.svg'
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
def setup(app):
# NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value
# and can be moved outside of this function (and the setup(app) function
# can be deleted).
html_css_files = [
'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css'
]
# In Sphinx 1.8 it was renamed to `add_css_file`, 1.7 and prior it is
# `add_stylesheet` (deprecated in 1.8).
add_css = getattr(app, 'add_css_file', app.add_stylesheet)
for css_file in html_css_files:
add_css(css_file)
html_css_files = [
'css/override.css',
'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css'
]
# -- Options for HTMLHelp output ------------------------------------------
......
......@@ -235,3 +235,8 @@ vad
---------------------------
.. autofunction:: spectral_centroid
References
~~~~~~~~~~
.. footbibliography::
......@@ -2,31 +2,31 @@
:class: hidden-section
torchaudio.models
======================
=================
.. currentmodule:: torchaudio.models
The models subpackage contains definitions of models for addressing common audio tasks.
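As a quick, hedged illustration of how these models are instantiated and called (the shapes and ``num_classes`` value are illustrative, not taken from this diff):
>>> import torch
>>> import torchaudio
>>> model = torchaudio.models.Wav2Letter(num_classes=40)
>>> waveform = torch.randn(1, 1, 16000)  # (batch, channel, time)
>>> logits = model(waveform)             # (batch, num_classes, frames)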
:hidden:`ConvTasNet`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ConvTasNet
~~~~~~~~~~
.. autoclass:: ConvTasNet
.. automethod:: forward
:hidden:`DeepSpeech`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DeepSpeech
~~~~~~~~~~
.. autoclass:: DeepSpeech
.. automethod:: forward
:hidden:`Wav2Letter`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Wav2Letter
~~~~~~~~~~
.. autoclass:: Wav2Letter
......@@ -34,11 +34,11 @@ The models subpackage contains definitions of models for addressing common audio
:hidden:`Wav2Vec2.0`
~~~~~~~~~~~~~~~~~~~~
Wav2Vec2.0
~~~~~~~~~~
Model
-----
Wav2Vec2Model
-------------
.. autoclass:: Wav2Vec2Model
......@@ -49,10 +49,19 @@ Model
Factory Functions
-----------------
wav2vec2_base
-------------
.. autofunction:: wav2vec2_base
wav2vec2_large
--------------
.. autofunction:: wav2vec2_large
wav2vec2_large_lv60k
--------------------
.. autofunction:: wav2vec2_large_lv60k
.. currentmodule:: torchaudio.models.wav2vec2.utils
......@@ -60,15 +69,27 @@ Factory Functions
Utility Functions
-----------------
import_huggingface_model
------------------------
.. autofunction:: import_huggingface_model
import_fairseq_model
--------------------
.. autofunction:: import_fairseq_model
.. currentmodule:: torchaudio.models
:hidden:`WaveRNN`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
WaveRNN
~~~~~~~
.. autoclass:: WaveRNN
.. automethod:: forward
References
~~~~~~~~~~
.. footbibliography::
@misc{baevski2020wav2vec,
title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.11477},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{hannun2014deep,
title={Deep Speech: Scaling up end-to-end speech recognition},
author={Awni Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng},
year={2014},
eprint={1412.5567},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{graves2012sequence,
title={Sequence Transduction with Recurrent Neural Networks},
author={Alex Graves},
year={2012},
eprint={1211.3711},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@misc{collobert2016wav2letter,
title={Wav2Letter: an End-to-End ConvNet-based Speech Recognition System},
author={Ronan Collobert and Christian Puhrsch and Gabriel Synnaeve},
year={2016},
eprint={1609.03193},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{kalchbrenner2018efficient,
title={Efficient Neural Audio Synthesis},
author={Nal Kalchbrenner and Erich Elsen and Karen Simonyan and Seb Noury and Norman Casagrande and Edward Lockhart and Florian Stimberg and Aaron van den Oord and Sander Dieleman and Koray Kavukcuoglu},
year={2018},
eprint={1802.08435},
archivePrefix={arXiv},
primaryClass={cs.SD}
}
@article{Luo_2019,
title={Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation},
volume={27},
ISSN={2329-9304},
url={http://dx.doi.org/10.1109/TASLP.2019.2915167},
DOI={10.1109/taslp.2019.2915167},
number={8},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Luo, Yi and Mesgarani, Nima},
year={2019},
month={Aug},
pages={1256–1266}
}
@InProceedings{ brian_mcfee-proc-scipy-2015,
author = { {B}rian {M}c{F}ee and {C}olin {R}affel and {D}awen {L}iang and {D}aniel {P}.{W}. {E}llis and {M}att {M}c{V}icar and {E}ric {B}attenberg and {O}riol {N}ieto },
title = { librosa: {A}udio and {M}usic {S}ignal {A}nalysis in {P}ython },
booktitle = { {P}roceedings of the 14th {P}ython in {S}cience {C}onference },
pages = { 18 - 24 },
year = { 2015 },
editor = { {K}athryn {H}uff and {J}ames {B}ergstra },
doi = { 10.25080/Majora-7b98e3ed-003 }
}
@INPROCEEDINGS{6701851,
author={Perraudin, Nathanaël and Balazs, Peter and Søndergaard, Peter L.},
booktitle={2013 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
title={A fast Griffin-Lim algorithm},
year={2013},
volume={},
number={},
pages={1-4},
doi={10.1109/WASPAA.2013.6701851}}
@INPROCEEDINGS{1172092,
author={Griffin, D. and Jae Lim},
booktitle={ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing},
title={Signal estimation from modified short-time Fourier transform},
year={1983},
volume={8},
number={},
pages={804-807},
doi={10.1109/ICASSP.1983.1172092}}
@INPROCEEDINGS{6854049,
author={Ghahremani, Pegah and BabaAli, Bagher and Povey, Daniel and Riedhammer, Korbinian and Trmal, Jan and Khudanpur, Sanjeev},
booktitle={2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={A pitch extraction algorithm tuned for automatic speech recognition},
year={2014},
volume={},
number={},
pages={2494-2498},
doi={10.1109/ICASSP.2014.6854049}}
......@@ -2,7 +2,7 @@
:class: hidden-section
torchaudio.prototype.rnnt_loss
===============================
==============================
.. currentmodule:: torchaudio.prototype.rnnt_loss
......@@ -11,13 +11,18 @@ torchaudio.prototype.rnnt_loss
The RNN transducer loss is a prototype feature; see `here <https://pytorch.org/audio>`_ to learn more about the nomenclature. It is only available in the nightly builds and must be imported explicitly, using :code:`from torchaudio.prototype.rnnt_loss import rnnt_loss, RNNTLoss`.
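A hedged usage sketch of the prototype loss; the argument order and the ``int32`` dtypes are assumptions based on the conventional RNN-T formulation (logits of shape ``(batch, time, target_length + 1, num_classes)``), not something this diff specifies:
>>> import torch
>>> from torchaudio.prototype.rnnt_loss import RNNTLoss
>>> batch, time, target, classes = 2, 10, 5, 7
>>> logits = torch.randn(batch, time, target + 1, classes, requires_grad=True)
>>> targets = torch.randint(1, classes - 1, (batch, target), dtype=torch.int32)
>>> logit_lengths = torch.full((batch,), time, dtype=torch.int32)
>>> target_lengths = torch.full((batch,), target, dtype=torch.int32)
>>> loss = RNNTLoss()(logits, targets, logit_lengths, target_lengths)
>>> loss.sum().backward()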
rnnt_loss
---------
~~~~~~~~~
.. autofunction:: rnnt_loss
:hidden:`RNNTLoss`
~~~~~~~~~~~~~~~~~~
RNNTLoss
~~~~~~~~
.. autoclass:: RNNTLoss
.. automethod:: forward
References
~~~~~~~~~~
.. footbibliography::
......@@ -53,99 +53,105 @@ Transforms are common audio transforms. They can be chained together using :clas
.. automethod:: forward
:hidden:`MFCC`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~
.. autoclass:: MFCC
.. automethod:: forward
:hidden:`MuLawEncoding`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: MuLawEncoding
.. automethod:: forward
:hidden:`MuLawDecoding`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: MuLawDecoding
.. automethod:: forward
:hidden:`Resample`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~
.. autoclass:: Resample
.. automethod:: forward
:hidden:`ComplexNorm`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: ComplexNorm
.. automethod:: forward
:hidden:`ComputeDeltas`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: ComputeDeltas
.. automethod:: forward
:hidden:`TimeStretch`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: TimeStretch
.. automethod:: forward
:hidden:`Fade`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~
.. autoclass:: Fade
.. automethod:: forward
:hidden:`FrequencyMasking`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: FrequencyMasking
.. automethod:: forward
:hidden:`TimeMasking`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: TimeMasking
.. automethod:: forward
:hidden:`Vol`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~
.. autoclass:: Vol
.. automethod:: forward
:hidden:`SlidingWindowCmn`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: SlidingWindowCmn
.. automethod:: forward
:hidden:`SpectralCentroid`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: SpectralCentroid
.. automethod:: forward
:hidden:`Vad`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~
.. autoclass:: Vad
.. automethod:: forward
References
~~~~~~~~~~
.. footbibliography::
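The hunk context above notes that transforms can be chained together (e.g. with ``torch.nn.Sequential``); a minimal hedged sketch, with illustrative sample rates and waveform shape:
>>> import torch
>>> import torchaudio
>>> waveform = torch.randn(1, 16000)  # one second of mono audio at 16 kHz
>>> pipeline = torch.nn.Sequential(
...     torchaudio.transforms.Resample(orig_freq=16000, new_freq=8000),
...     torchaudio.transforms.MFCC(sample_rate=8000),
... )
>>> mfcc = pipeline(waveform)  # (channel, n_mfcc, frames)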
......@@ -85,18 +85,20 @@ def _get_encoding(format: str, subtype: str):
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
"""Get signal information of an audio file.
Note:
The ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
a ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatibility.
Args:
filepath (path-like object or file-like object):
Source of audio data.
Note:
* This argument is intentionally annotated as ``str`` only,
for consistency with the "sox_io" backend, which has a restriction
on type annotation due to TorchScript compiler compatibility.
format (str, optional):
Not used. PySoundFile does not accept a format hint.
Returns:
AudioMetaData: metadata of the given audio.
"""
sinfo = soundfile.info(filepath)
return AudioMetaData(
......@@ -159,13 +161,14 @@ def load(
For these formats, this function always returns a ``float32`` Tensor with values normalized to
``[-1.0, 1.0]``.
Note:
The ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
a ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatibility.
Args:
filepath (path-like object or file-like object):
Source of audio data.
Note:
* This argument is intentionally annotated as ``str`` only,
for consistency with the "sox_io" backend, which has a restriction
on type annotation due to TorchScript compiler compatibility.
frame_offset (int):
Number of frames to skip before start reading data.
num_frames (int):
......@@ -324,11 +327,13 @@ def save(
* OGG/VORBIS
* SPHERE
Note:
The ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
a ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatibility.
Args:
filepath (str or pathlib.Path): Path to audio file.
This function also handles ``pathlib.Path`` objects, but is annotated as ``str``
for consistency with the "sox_io" backend, which has a restriction on type annotation
due to TorchScript compiler compatibility.
src (torch.Tensor): Audio data to save. Must be a 2D tensor.
sample_rate (int): Sampling rate.
channels_first (bool): If ``True``, the given tensor is interpreted as ``[channel, time]``,
......
......@@ -80,9 +80,9 @@ def allpass_biquad(
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
Reference:
- http://sox.sourceforge.net/sox.html
- https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
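A hedged usage sketch for this effect; the waveform and parameter values are illustrative, and the ``central_freq`` keyword is assumed from the public ``torchaudio.functional`` signature rather than from this hunk:
>>> import torch
>>> import torchaudio
>>> waveform = torch.randn(1, 16000)  # (channel, time), illustrative
>>> filtered = torchaudio.functional.allpass_biquad(waveform, sample_rate=16000, central_freq=1000.0)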
"""
dtype = waveform.dtype
device = waveform.device
......@@ -123,9 +123,9 @@ def band_biquad(
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
Reference:
- http://sox.sourceforge.net/sox.html
- https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
"""
dtype = waveform.dtype
device = waveform.device
......@@ -171,9 +171,9 @@ def bandpass_biquad(
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
Reference:
- http://sox.sourceforge.net/sox.html
- https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
"""
dtype = waveform.dtype
device = waveform.device
......@@ -207,9 +207,9 @@ def bandreject_biquad(
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
Reference:
- http://sox.sourceforge.net/sox.html
- https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
"""
dtype = waveform.dtype
device = waveform.device
......@@ -247,9 +247,9 @@ def bass_biquad(
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
Reference:
- http://sox.sourceforge.net/sox.html
- https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
"""
dtype = waveform.dtype
device = waveform.device
......@@ -325,8 +325,8 @@ def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
Reference:
- http://sox.sourceforge.net/sox.html
"""
if not 0 <= enhancement_amount <= 100:
......@@ -358,8 +358,8 @@ def dcshift(
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
Reference:
- http://sox.sourceforge.net/sox.html
"""
output_waveform = waveform
limiter_threshold = 0.0
......@@ -405,9 +405,9 @@ def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
Reference:
- http://sox.sourceforge.net/sox.html
- https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
"""
if sample_rate == 44100:
......@@ -680,10 +680,12 @@ def flanger(
Returns:
Tensor: Waveform of dimension of `(..., channel, time)`
References:
http://sox.sourceforge.net/sox.html
Reference:
- http://sox.sourceforge.net/sox.html
Scott Lehman, Effects Explained,
- Scott Lehman, `Effects Explained`_,
.. _Effects Explained:
https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
"""
......@@ -1027,8 +1029,8 @@ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
Reference:
- http://sox.sourceforge.net/sox.html
"""
actual_shape = waveform.shape
device, dtype = waveform.device, waveform.dtype
......@@ -1096,9 +1098,11 @@ def phaser(
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
Scott Lehman, Effects Explained,
Reference:
- http://sox.sourceforge.net/sox.html
- Scott Lehman, `Effects Explained`_.
.. _Effects Explained:
https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
"""
actual_shape = waveform.shape
......@@ -1166,9 +1170,9 @@ def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
Reference:
- http://sox.sourceforge.net/sox.html
- https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
"""
if sample_rate == 44100:
......@@ -1234,9 +1238,9 @@ def treble_biquad(
Returns:
Tensor: Waveform of dimension of `(..., time)`
References:
http://sox.sourceforge.net/sox.html
https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
Reference:
- http://sox.sourceforge.net/sox.html
- https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
"""
dtype = waveform.dtype
device = waveform.device
......@@ -1420,8 +1424,8 @@ def vad(
Returns:
Tensor: Tensor of audio of dimension (..., time).
References:
http://sox.sourceforge.net/sox.html
Reference:
- http://sox.sourceforge.net/sox.html
"""
if waveform.ndim > 2:
......
......@@ -156,18 +156,9 @@ def griffinlim(
rand_init: bool
) -> Tensor:
r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
Implementation ported from `librosa`.
* [1] McFee, Brian, Colin Raffel, Dawen Liang, Daniel PW Ellis, Matt McVicar, Eric Battenberg, and Oriol Nieto.
"librosa: Audio and music signal analysis in python."
In Proceedings of the 14th python in science conference, pp. 18-25. 2015.
* [2] Perraudin, N., Balazs, P., & Søndergaard, P. L.
"A fast Griffin-Lim algorithm,"
IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
Oct. 2013.
* [3] D. W. Griffin and J. S. Lim,
"Signal estimation from modified short-time Fourier transform,"
IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
Implementation ported from
:footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
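A hedged usage sketch via the corresponding ``torchaudio.transforms`` wrappers (parameter values are illustrative and merely chosen to match between the two transforms; they are not taken from this diff):
>>> import torch
>>> import torchaudio
>>> specgram = torchaudio.transforms.Spectrogram(n_fft=400, power=2)(torch.randn(1, 16000))
>>> waveform = torchaudio.transforms.GriffinLim(n_fft=400, power=2)(specgram)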
Args:
specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames)
......@@ -1215,7 +1206,7 @@ def compute_kaldi_pitch(
recompute_frame: int = 500,
snip_edges: bool = True,
) -> torch.Tensor:
"""Extract pitch based on method described in [1].
"""Extract pitch based on method described in :footcite:`6854049`.
This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
......@@ -1274,15 +1265,6 @@ def compute_kaldi_pitch(
Returns:
Tensor: Pitch feature. Shape: ``(batch, frames, 2)`` where the last dimension
corresponds to pitch and NCCF.
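A hedged usage sketch (the waveform is illustrative; only the two leading arguments shown here are assumed, matching the public signature of this function):
>>> import torch
>>> import torchaudio
>>> waveform = torch.randn(1, 16000)  # (batch, time), illustrative
>>> pitch_feature = torchaudio.functional.compute_kaldi_pitch(waveform, sample_rate=16000)
>>> # the last dimension holds pitch and NCCF, as described in Returns above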
Reference:
- A pitch extraction algorithm tuned for automatic speech recognition
P. Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. Khudanpur
2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),
Florence, 2014, pp. 2494-2498, doi: 10.1109/ICASSP.2014.6854049.
"""
shape = waveform.shape
waveform = waveform.reshape(-1, shape[-1])
......
......@@ -21,13 +21,6 @@ class ConvBlock(torch.nn.Module):
Note:
This implementation corresponds to the "non-causal" setting in the paper.
Reference:
- Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
Luo, Yi and Mesgarani, Nima
https://arxiv.org/abs/1809.07454
"""
def __init__(
......@@ -98,11 +91,6 @@ class MaskGenerator(torch.nn.Module):
Note:
This implementation corresponds to the "non-causal" setting in the paper.
References:
- Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
Luo, Yi and Mesgarani, Nima
https://arxiv.org/abs/1809.07454
"""
def __init__(
......@@ -176,7 +164,7 @@ class MaskGenerator(torch.nn.Module):
class ConvTasNet(torch.nn.Module):
"""Conv-TasNet: a fully-convolutional time-domain audio separation network
"""Conv-TasNet: a fully-convolutional time-domain audio separation network :footcite:`Luo_2019`.
Args:
num_sources (int): The number of sources to split.
......@@ -190,13 +178,6 @@ class ConvTasNet(torch.nn.Module):
Note:
This implementation corresponds to the "non-causal" setting in the paper.
Reference:
- Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
Luo, Yi and Mesgarani, Nima
https://arxiv.org/abs/1809.07454
"""
def __init__(
......
......@@ -31,9 +31,7 @@ class FullyConnected(torch.nn.Module):
class DeepSpeech(torch.nn.Module):
"""
DeepSpeech model architecture from
`"Deep Speech: Scaling up end-to-end speech recognition"`
<https://arxiv.org/abs/1412.5567> paper.
DeepSpeech model architecture from :footcite:`hannun2014deep`.
Args:
n_feature: Number of input features
......
......@@ -7,9 +7,7 @@ __all__ = [
class Wav2Letter(nn.Module):
r"""Wav2Letter model architecture from the `Wav2Letter an End-to-End ConvNet-based Speech Recognition System`_.
.. _Wav2Letter an End-to-End ConvNet-based Speech Recognition System: https://arxiv.org/abs/1609.03193
r"""Wav2Letter model architecture from :footcite:`collobert2016wav2letter`.
:math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
......
......@@ -7,7 +7,7 @@ from . import components
class Wav2Vec2Model(Module):
"""Model used in wav2vec2.0 paper. [1]
"""Encoder model used in :footcite:`baevski2020wav2vec`.
Note:
To build the model, please use one of the factory functions.
......@@ -19,13 +19,6 @@ class Wav2Vec2Model(Module):
encoder (torch.nn.Module):
Encoder that converts the audio features into the sequence of probability
distribution (in negative log-likelihood) over labels.
Reference:
- wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
https://arxiv.org/abs/2006.11477
"""
def __init__(
self,
......@@ -129,7 +122,7 @@ def _get_model(
def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
"""Build wav2vec2.0 model with **Base** configuration. [1]
"""Build wav2vec2.0 model with "Base" configuration from :footcite:`baevski2020wav2vec`.
Args:
num_out: int
......@@ -138,12 +131,17 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
Returns:
Wav2Vec2Model: The resulting model.
Reference:
- wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
https://arxiv.org/abs/2006.11477
Example - Reload fine-tuned model from Hugging Face:
>>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters.
>>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
>>>
>>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = import_huggingface_model(original)
>>> torch.save(model.state_dict(), "wav2vec2-base-960h.pt")
>>>
>>> # Session 2 - Load model and the parameters
>>> model = wav2vec2_base(num_out=32)
>>> model.load_state_dict(torch.load("wav2vec2-base-960h.pt"))
"""
return _get_model(
extractor_mode="group_norm",
......@@ -166,7 +164,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
"""Build wav2vec2.0 model with **Large** configuration. [1]
"""Build wav2vec2.0 model with "Large" configuration from :footcite:`baevski2020wav2vec`.
Args:
num_out: int
......@@ -175,12 +173,17 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
Returns:
Wav2Vec2Model: The resulting model.
Reference:
- wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
https://arxiv.org/abs/2006.11477
Example - Reload fine-tuned model from Hugging Face:
>>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters.
>>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
>>>
>>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
>>> model = import_huggingface_model(original)
>>> torch.save(model.state_dict(), "wav2vec2-large-960h.pt")
>>>
>>> # Session 2 - Load model and the parameters
>>> model = wav2vec2_large(num_out=32)
>>> model.load_state_dict(torch.load("wav2vec2-large-960h.pt"))
"""
return _get_model(
extractor_mode="group_norm",
......@@ -203,7 +206,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
"""Build wav2vec2.0 model with **Large LV-60k** configuration. [1]
"""Build wav2vec2.0 model with "Large LV-60k" configuration from :footcite:`baevski2020wav2vec`.
Args:
num_out: int
......@@ -212,12 +215,17 @@ def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
Returns:
Wav2Vec2Model: The resulting model.
Reference:
- wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
https://arxiv.org/abs/2006.11477
Example - Reload fine-tuned model from Hugging Face:
>>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters.
>>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
>>>
>>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
>>> model = import_huggingface_model(original)
>>> torch.save(model.state_dict(), "wav2vec2-large-960h-lv60-self.pt")
>>>
>>> # Session 2 - Load model and the parameters
>>> model = wav2vec2_large_lv60k(num_out=32)
>>> model.load_state_dict(torch.load("wav2vec2-large-960h-lv60-self.pt"))
"""
return _get_model(
extractor_mode="layer_norm",
......
......@@ -141,6 +141,8 @@ def import_fairseq_model(
Wav2Vec2Model: Imported model.
Example - Loading pretrain-only model
>>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
>>>
>>> # Load model using fairseq
>>> model_file = 'wav2vec_small.pt'
>>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
......@@ -156,6 +158,8 @@ def import_fairseq_model(
>>> torch.testing.assert_allclose(features, reference)
Example - Fine-tuned model
>>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
>>>
>>> # Load model using fairseq
>>> model_file = 'wav2vec_small_960h.pt'
>>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
......
......@@ -50,6 +50,8 @@ def import_huggingface_model(original: Module) -> Wav2Vec2Model:
Wav2Vec2Model: Imported model.
Example
>>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
>>>
>>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = import_huggingface_model(original)
>>>
......
......@@ -14,9 +14,7 @@ __all__ = [
class ResBlock(nn.Module):
r"""ResNet block based on "Deep Residual Learning for Image Recognition"
The paper link is https://arxiv.org/pdf/1512.03385.pdf.
r"""ResNet block based on :footcite:`kalchbrenner2018efficient`.
Args:
n_freq: the number of bins in a spectrogram. (Default: ``128``)
......@@ -204,8 +202,7 @@ class UpsampleNetwork(nn.Module):
class WaveRNN(nn.Module):
r"""WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.
The original implementation was introduced in
`"Efficient Neural Audio Synthesis" <https://arxiv.org/pdf/1802.08435.pdf>`_.
The original implementation was introduced in :footcite:`kalchbrenner2018efficient`.
The input channels of waveform and spectrogram have to be 1. The product of
`upsample_scales` must equal `hop_length`.
......
......@@ -17,11 +17,10 @@ def rnnt_loss(
fused_log_softmax: bool = True,
reuse_logits_for_grads: bool = True,
):
"""
Compute the RNN Transducer Loss.
"""Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
The RNN Transducer loss (`Graves 2012 <https://arxiv.org/pdf/1211.3711.pdf>`__) extends the CTC loss by defining
a distribution over output sequences of all lengths, and by jointly modelling both input-output and output-output
The RNN Transducer loss extends the CTC loss by defining a distribution over output
sequences of all lengths, and by jointly modelling both input-output and output-output
dependencies.
Args:
......@@ -58,11 +57,10 @@ def rnnt_loss(
class RNNTLoss(torch.nn.Module):
"""
Compute the RNN Transducer Loss.
"""Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
The RNN Transducer loss (`Graves 2012 <https://arxiv.org/pdf/1211.3711.pdf>`__) extends the CTC loss by defining
a distribution over output sequences of all lengths, and by jointly modelling both input-output and output-output
The RNN Transducer loss extends the CTC loss by defining a distribution over output
sequences of all lengths, and by jointly modelling both input-output and output-output
dependencies.
Args:
......