Commit 05592dff authored by Caroline Chen, committed by Facebook GitHub Bot

Update CTC decoder docs and add citation (#2278)

Summary:
rendered:
- [tutorial](https://output.circle-artifacts.com/output/job/e7fb5a23-87cf-4dd5-b4a8-8b4f91e20eb4/artifacts/0/docs/tutorials/asr_inference_with_ctc_decoder_tutorial.html)
- [docs](https://output.circle-artifacts.com/output/job/e7fb5a23-87cf-4dd5-b4a8-8b4f91e20eb4/artifacts/0/docs/prototype.ctc_decoder.html)

Pull Request resolved: https://github.com/pytorch/audio/pull/2278

Reviewed By: mthrok

Differential Revision: D35097734

Pulled By: carolineechen

fbshipit-source-id: 1e5d5fff0b7740757cca358cf3ea44c6488fcd5c
parent 8844fbb7
@@ -28,3 +28,8 @@ lexicon_decoder
 ~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: lexicon_decoder
+
+References
+----------
+
+.. footbibliography::
@@ -261,3 +261,9 @@
   year={1969},
   publisher={IEEE}
 }
+@article{kahn2022flashlight,
+  title={Flashlight: Enabling Innovation in Tools for Machine Learning},
+  author={Kahn, Jacob and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and others},
+  journal={arXiv preprint arXiv:2201.12465},
+  year={2022}
+}
@@ -36,7 +36,10 @@ using CTC loss.
 # highest scores at each time step. A language model can be incorporated into
 # the scoring computation, and adding a lexicon constraint restricts the
 # next possible tokens for the hypotheses so that only words from the lexicon
-# can be generated. A mathematical formula for the decoder optimization can be
+# can be generated.
+#
+# The underlying implementation is ported from `Flashlight <https://arxiv.org/pdf/2201.12465.pdf>`__'s
+# beam search decoder. A mathematical formula for the decoder optimization can be
 # found in the `Wav2Letter paper <https://arxiv.org/pdf/1609.03193.pdf>`__, and
 # a more detailed algorithm can be found in this `blog
 # <https://towardsdatascience.com/boosting-your-sequence-generation-performance-with-beam-search-language-model-decoding-74ee64de435a>`__.
...
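To make the decoding flow described in the tutorial comment above concrete, here is a minimal sketch (not part of this PR): it assumes the prototype `lexicon_decoder` factory shown in the diff below, and the file paths, tensor shapes, and hyperparameter values are illustrative placeholders.

    # Minimal sketch of lexicon-constrained CTC beam search decoding.
    # Paths and hyperparameter values are illustrative assumptions.
    import torch
    from torchaudio.prototype.ctc_decoder import lexicon_decoder

    decoder = lexicon_decoder(
        lexicon="lexicon.txt",  # words and their token spellings
        tokens="tokens.txt",    # token set of the acoustic model
        lm="kenlm.bin",         # KenLM language model
        nbest=3,                # number of best decodings to return
    )

    # Emissions are (batch, frames, num_tokens) log-probabilities from an
    # acoustic model trained with CTC loss; random values stand in here.
    emissions = torch.randn(1, 100, 32)
    hypotheses = decoder(emissions)  # nbest Hypothesis objects per sample
    # best hypothesis; ``words`` holds the predicted word strings
    print(" ".join(hypotheses[0][0].words))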
@@ -39,10 +39,21 @@ class Hypothesis(NamedTuple):
 class LexiconDecoder:
     """torchaudio.prototype.ctc_decoder.LexiconDecoder()
 
+    Lexically constrained CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
+
     Note:
-        To build the decoder, please use factory function
-        :py:func:`lexicon_decoder`.
+        To build the decoder, please use the factory function :py:func:`lexicon_decoder`.
+
+    Args:
+        nbest (int): number of best decodings to return
+        lexicon (Dict): lexicon mapping of words to spellings
+        word_dict (_Dictionary): dictionary of words
+        tokens_dict (_Dictionary): dictionary of tokens
+        lm (_LM): language model
+        decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding
+        blank_token (str): token corresponding to blank
+        sil_token (str): token corresponding to silence
+        unk_word (str): word corresponding to unknown
     """
 
     def __init__(
@@ -57,24 +68,6 @@ class LexiconDecoder:
         sil_token: str,
         unk_word: str,
     ) -> None:
-        """
-        CTC Decoder with Lexicon constraint.
-
-        Note:
-            To build the decoder, please use the factory function lexicon_decoder.
-
-        Args:
-            nbest (int): number of best decodings to return
-            lexicon (Dict): lexicon mapping of words to spellings
-            word_dict (_Dictionary): dictionary of words
-            tokens_dict (_Dictionary): dictionary of tokens
-            lm (_LM): language model
-            decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding
-            blank_token (str): token corresopnding to blank
-            sil_token (str): token corresponding to silence
-            unk_word (str): word corresponding to unknown
-        """
         self.nbest = nbest
         self.word_dict = word_dict
         self.tokens_dict = tokens_dict
@@ -196,7 +189,8 @@ def lexicon_decoder(
     unk_word: str = "<unk>",
 ) -> LexiconDecoder:
     """
-    Builds Ken LM CTC Lexicon Decoder with given parameters
+    Builds lexically constrained CTC beam search decoder from
+    *Flashlight* [:footcite:`kahn2022flashlight`].
 
     Args:
         lexicon (str): lexicon file containing the possible words and corresponding spellings.
...
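The `lexicon` argument of the factory refers to a Flashlight-style lexicon file mapping each word to its token spelling, one word per line. A hypothetical excerpt follows; the trailing `|` word-boundary marker is an assumption matching the default `sil_token`, not something fixed by this diff.

    hello  h e l l o |
    world  w o r l d |
    decoder  d e c o d e r |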