"src/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "365ff8f76d9dfc2086625b1c265b920be2218863"
Commit 732c94a3 authored by Yuekai Zhang, committed by Facebook GitHub Bot

Add cuctc tutorial, change blank skip threshold into prob (#3297)

Summary:
Add a separate tutorial for cuctc.
Resolves https://github.com/pytorch/audio/issues/3096

Pull Request resolved: https://github.com/pytorch/audio/pull/3297

Reviewed By: huangruizhe

Differential Revision: D47928400

Pulled By: mthrok

fbshipit-source-id: 8c16492fb4d007b6ea7969ba77c866a51749c0ec
parent 144cfcfc
@@ -68,6 +68,7 @@ model implementations and application components.
    tutorials/speech_recognition_pipeline_tutorial
    tutorials/asr_inference_with_ctc_decoder_tutorial
+   tutorials/asr_inference_with_cuda_ctc_decoder_tutorial
    tutorials/online_asr_tutorial
    tutorials/device_asr
    tutorials/device_avsr
@@ -147,6 +148,13 @@ Tutorials
 .. customcardstart::

+   .. customcarditem::
+      :header: ASR inference with CUDA CTC Beam Search Decoder
+      :card_description: Learn how to perform ASR beam search decoding with GPU, using <code>torchaudio.models.decoder.cuda_ctc_decoder</code>.
+      :image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/asr_inference_with_ctc_decoder_tutorial.png
+      :link: tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.html
+      :tags: Pipelines,ASR,CTC-Decoder,CUDA-CTC-Decoder
+
    .. customcarditem::
       :header: On device audio-visual automatic speech recognition
       :card_description: Learn how to stream audio and video from laptop webcam and perform audio-visual automatic speech recognition using Emformer-RNNT model.
......
@@ -43,9 +43,9 @@ def run_inference(args):
         )
     else:
         assert vocabs[0] == "<blk>", "idx of blank token has to be zero"
-    blank_frame_skip_threshold = float(torch.log(torch.tensor(args.blank_skip_threshold)))
     cuda_decoder = cuda_ctc_decoder(
-        vocabs, nbest=args.nbest, beam_size=args.beam_size, blank_skip_threshold=blank_frame_skip_threshold
+        vocabs, nbest=args.nbest, beam_size=args.beam_size, blank_skip_threshold=args.blank_skip_threshold
     )
     dataset = torchaudio.datasets.LIBRISPEECH(args.librispeech_path, url=args.split, download=True)
......
"""
ASR Inference with CUDA CTC Decoder
====================================
**Author**: `Yuekai Zhang <yuekaiz@nvidia.com>`__
This tutorial shows how to perform speech recognition inference using a
CUDA-based CTC beam search decoder.
We demonstrate this on a pretrained
`Zipformer <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc>`__
model from the `Next-gen Kaldi <https://nadirapovey.com/next-gen-kaldi-what-is-it>`__ project.
"""
######################################################################
# Overview
# --------
#
# Beam search decoding works by iteratively expanding text hypotheses (beams)
# with next possible characters, and maintaining only the hypotheses with the
# highest scores at each time step.
#
# The underlying implementation uses CUDA to accelerate the whole decoding process.
# A mathematical formula for the decoder can be
# found in the `paper <https://arxiv.org/pdf/1408.2873.pdf>`__, and
# a more detailed algorithm can be found in this `blog
# <https://distill.pub/2017/ctc/>`__.
#
# Running ASR inference using a CUDA CTC Beam Search decoder
# requires the following components:
#
# - Acoustic Model: model predicting modeling units (BPE in this tutorial) from acoustic features
# - BPE Model: the byte-pair encoding (BPE) tokenizer file
#
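######################################################################
# As a toy illustration of this loop (a pure-Python sketch, not the CUDA
# implementation; it omits the CTC-specific blank handling and prefix
# merging), each hypothesis is extended with every candidate token, and only
# the ``beam_size`` highest-scoring hypotheses survive each step:
#


def toy_beam_search(frame_log_probs, beam_size):
    # maps a hypothesis (tuple of token ids) to its accumulated log-probability
    beams = {(): 0.0}
    for frame in frame_log_probs:
        candidates = {}
        for hyp, score in beams.items():
            for token, log_p in enumerate(frame):
                candidates[hyp + (token,)] = score + log_p
        # prune: keep only the ``beam_size`` highest-scoring hypotheses
        beams = dict(sorted(candidates.items(), key=lambda kv: -kv[1])[:beam_size])
    return beams


# two frames over a two-token toy vocabulary
print(toy_beam_search([[-0.1, -2.3], [-1.6, -0.2]], beam_size=2))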
######################################################################
# Acoustic Model and Set Up
# -------------------------
#
# First, we import the necessary utilities and fetch the data that we are
# working with.
#
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
import time
from pathlib import Path
import IPython
import sentencepiece as spm
from torchaudio.models.decoder import cuda_ctc_decoder
from torchaudio.utils import download_asset
######################################################################
#
# We use the pretrained
# `Zipformer <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01>`__
# model that is trained on the `LibriSpeech
# dataset <http://www.openslr.org/12>`__. The model is jointly trained with CTC and Transducer loss functions.
# In this tutorial, we only use the CTC head of the model.
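# The helper below downloads the checkpoint and caches it under the
# ``torch.hub`` directory, skipping the download if the file already exists.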
def download_asset_external(url, key):
path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key)
if not path.exists():
path.parent.mkdir(parents=True, exist_ok=True)
torch.hub.download_url_to_file(url, path)
return str(path)
url_prefix = "https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01"
model_link = f"{url_prefix}/resolve/main/exp/cpu_jit.pt"
model_path = download_asset_external(model_link, "cuda_ctc_decoder/cpu_jit.pt")
######################################################################
# We will load a sample from the LibriSpeech test-other dataset.
#
speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
waveform, sample_rate = torchaudio.load(speech_file)
assert sample_rate == 16000
IPython.display.Audio(speech_file)
######################################################################
# The transcript corresponding to this audio file is
#
# .. code-block::
#
# i really was very much afraid of showing him how much shocked i was at some parts of what he said
#
######################################################################
# Files and Data for Decoder
# --------------------------
#
# Next, we load in the tokens from the BPE model, which is the tokenizer used for decoding.
#
######################################################################
# Tokens
# ~~~~~~
#
# The tokens are the possible symbols that the acoustic model can predict,
# including the blank symbol in CTC. In this tutorial, it includes 500 BPE tokens.
# It can either be passed in as a
# file, where each line contains the token corresponding to that line's
# index, or as a list of tokens, each mapping to a unique index.
#
# .. code-block::
#
# # tokens
# <blk>
# <sos/eos>
# <unk>
# S
# _THE
# _A
# T
# _AND
# ...
#
bpe_link = f"{url_prefix}/resolve/main/data/lang_bpe_500/bpe.model"
bpe_path = download_asset_external(bpe_link, "cuda_ctc_decoder/bpe.model")
bpe_model = spm.SentencePieceProcessor()
bpe_model.load(bpe_path)
tokens = [bpe_model.id_to_piece(id) for id in range(bpe_model.get_piece_size())]
print(tokens)
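######################################################################
# Alternatively, if you already have a token file (one token per line, where
# the line number is the token's index), the same list could be built with a
# sketch like the following (``tokens.txt`` here is a hypothetical path, not
# an asset downloaded by this tutorial):
#
# .. code-block:: python
#
#    with open("tokens.txt") as f:
#        tokens = [line.strip() for line in f]
#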
######################################################################
# Construct CUDA Decoder
# ----------------------
# In this tutorial, we will construct a CUDA beam search decoder.
# The decoder can be constructed using the factory function
# :py:func:`~torchaudio.models.decoder.cuda_ctc_decoder`.
#
cuda_decoder = cuda_ctc_decoder(tokens, nbest=10, beam_size=10, blank_skip_threshold=0.95)
######################################################################
# Run Inference
# -------------
#
# Now that we have the data, acoustic model, and decoder, we can perform
# inference. The output of the beam search decoder is of type
# :py:class:`~torchaudio.models.decoder.CUCTCHypothesis`, consisting of the
# predicted token IDs, words (symbols corresponding to the token IDs), and hypothesis scores.
# Recall the transcript corresponding to the
# waveform is
#
# .. code-block::
#
# i really was very much afraid of showing him how much shocked i was at some parts of what he said
#
actual_transcript = "i really was very much afraid of showing him how much shocked i was at some parts of what he said"
actual_transcript = actual_transcript.split()
device = torch.device("cuda", 0)
acoustic_model = torch.jit.load(model_path)
acoustic_model.to(device)
acoustic_model.eval()
waveform = waveform.to(device)
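# Compute 80-dimensional Kaldi-compatible log Mel filterbank features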
feat = torchaudio.compliance.kaldi.fbank(waveform, num_mel_bins=80, snip_edges=False)
feat = feat.unsqueeze(0)
feat_lens = torch.tensor(feat.size(1), device=device).unsqueeze(0)
encoder_out, encoder_out_lens = acoustic_model.encoder(feat, feat_lens)
nnet_output = acoustic_model.ctc_output(encoder_out)
log_prob = torch.nn.functional.log_softmax(nnet_output, -1)
print(f"The shape of log_prob: {log_prob.shape}, the shape of encoder_out_lens: {encoder_out_lens.shape}")
######################################################################
# The CUDA CTC decoder gives the following result.
#
results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
beam_search_transcript = bpe_model.decode(results[0][0].tokens).lower()
beam_search_wer = torchaudio.functional.edit_distance(actual_transcript, beam_search_transcript.split()) / len(
actual_transcript
)
print(f"Transcript: {beam_search_transcript}")
print(f"WER: {beam_search_wer}")
######################################################################
# Beam Search Decoder Parameters
# ------------------------------
#
# In this section, we go a little more in depth on some of the decoder
# parameters and their tradeoffs. For the full list of customizable parameters,
# please refer to the
# :py:func:`documentation <torchaudio.models.decoder.cuda_ctc_decoder>`.
#
######################################################################
# Helper Function
# ~~~~~~~~~~~~~~~
#
def print_decoded(cuda_decoder, bpe_model, log_prob, encoder_out_lens, param, param_value):
start_time = time.monotonic()
results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
decode_time = time.monotonic() - start_time
transcript = bpe_model.decode(results[0][0].tokens).lower()
score = results[0][0].score
print(f"{param} {param_value:<3}: {transcript} (score: {score:.2f}; {decode_time:.4f} secs)")
######################################################################
# nbest
# ~~~~~
#
# This parameter indicates the number of best hypotheses to return. For
# instance, by setting ``nbest=10`` when constructing the beam search
# decoder earlier, we can now access the hypotheses with the top 10 scores.
#
for i in range(10):
transcript = bpe_model.decode(results[0][i].tokens).lower()
score = results[0][i].score
print(f"{transcript} (score: {score})")
######################################################################
# beam size
# ~~~~~~~~~
#
# The ``beam_size`` parameter determines the maximum number of best
# hypotheses to hold after each decoding step. Using larger beam sizes
# allows exploring a larger range of possible hypotheses, which can
# produce hypotheses with higher scores, but gains taper off beyond a certain point.
# We recommend setting ``beam_size=10`` for the CUDA beam search decoder.
#
# In the example below, we see improvement in decoding quality as we
# increase beam size from 1 to 3, but notice how using a beam size
# of 3 provides the same output as beam size 10.
#
beam_sizes = [1, 2, 3, 10]
for beam_size in beam_sizes:
beam_search_decoder = cuda_ctc_decoder(
tokens,
nbest=1,
beam_size=beam_size,
blank_skip_threshold=0.95,
)
print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "beam size", beam_size)
######################################################################
# blank skip threshold
# ~~~~~~~~~~~~~~~~~~~~
#
# The ``blank_skip_threshold`` parameter is used to prune frames that have a high blank probability.
# Pruning these frames with a well-chosen ``blank_skip_threshold`` speeds up the decoding
# process significantly with no accuracy drop.
# Per the rule of CTC, we keep at least one blank frame between two non-blank frames
# to avoid mistakenly merging two consecutive identical symbols.
# We recommend setting ``blank_skip_threshold=0.95`` for the CUDA beam search decoder.
#
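# As a rough illustration on our sample utterance (assuming the blank token
# sits at index 0, consistent with the token list above), we can measure the
# fraction of frames whose blank probability exceeds a threshold of 0.95,
# i.e. the frames the decoder may skip:
skippable_ratio = (log_prob[0, :, 0] > torch.tensor(0.95).log()).float().mean().item()
print(f"fraction of frames with blank probability above 0.95: {skippable_ratio:.2%}")
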
blank_skip_probs = [0.25, 0.95, 1.0]
for blank_skip_prob in blank_skip_probs:
beam_search_decoder = cuda_ctc_decoder(
tokens,
nbest=10,
beam_size=10,
blank_skip_threshold=blank_skip_prob,
)
print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "blank_skip_threshold", blank_skip_prob)
del cuda_decoder
######################################################################
# Benchmark with flashlight CPU decoder
# -------------------------------------
# We benchmark the throughput and accuracy of the CUDA decoder against the flashlight CPU decoder using the LibriSpeech test_other set.
# To reproduce the benchmark results below, you may refer to the example `here <https://github.com/pytorch/audio/tree/main/examples/asr/librispeech_cuda_ctc_decoder>`__.
#
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | Decoder | Setting | WER (%) | N-Best Oracle WER (%) | Decoder Cost Time (seconds) |
# +==============+==========================================+=========+=======================+=============================+
# | CUDA decoder | blank_skip_threshold 0.95 | 5.81 | 4.11 | 2.57 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CUDA decoder | blank_skip_threshold 1.0 (no frame-skip) | 5.81 | 4.09 | 6.24 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder | beam_size_token 10 | 5.86 | 4.30 | 28.61 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder | beam_size_token 500 | 5.86 | 4.30 | 791.80 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
#
# From the above table, the CUDA decoder gives a slight improvement in WER and a significant increase in throughput.
@@ -36,7 +36,7 @@ class CUCTCHypothesis(NamedTuple):
     """Score corresponding to hypothesis"""

-_DEFAULT_SKIP_THREASHOLD = math.log(0.95)
+_DEFAULT_BLANK_SKIP_THREASHOLD = 0.95


 class CUCTCDecoder:
@@ -54,17 +54,18 @@ class CUCTCDecoder:
         blank_id: int = 0,
         beam_size: int = 10,
         nbest: int = 1,
-        blank_skip_threshold: float = _DEFAULT_SKIP_THREASHOLD,
+        blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THREASHOLD,
         cuda_stream: torch.cuda.streams.Stream = None,
     ):
         """
         Args:
-            blank_id (int): token id corresopnding to blank (Default: 0)
+            blank_id (int): token id corresopnding to blank, only support 0 for now. (Default: 0)
             vocab_list (List[str]): list of vocabulary tokens
             beam_size (int, optional): max number of hypos to hold after each decode step (Default: 10)
             nbest (int): number of best decodings to return
-            blank_skip_threshold (float): skip frames if log_prob(blank) > blank_skip_threshold, to speed up decoding.
-                (Default: log(0.95)).
+            blank_skip_threshold (float):
+                skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding.
+                (Default: 0.95).
             cuda_stream (torch.cuda.streams.Stream): using assigned cuda stream (Default: using default stream)
         """
@@ -74,11 +75,15 @@ class CUCTCDecoder:
         cuda_stream_ = cuda_stream.cuda_stream if cuda_stream else torch.cuda.current_stream().cuda_stream
         self.internal_data = cuctc.prefixCTC_alloc(cuda_stream_)
         self.memory = torch.empty(0, dtype=torch.int8, device=torch.device("cuda"))
-        self.blank_id = 0  # blank id has to be zero
+        if blank_id != 0:
+            raise AssertionError("blank_id must be 0")
+        self.blank_id = blank_id
         self.vocab_list = vocab_list
         self.space_id = 0
         self.nbest = nbest
-        self.blank_skip_threshold = blank_skip_threshold
+        if not (blank_skip_threshold >= 0 and blank_skip_threshold <= 1):
+            raise AssertionError("blank_skip_threshold must be between 0 and 1")
+        self.blank_skip_threshold = math.log(blank_skip_threshold)
         self.beam_size = min(beam_size, len(vocab_list))  # beam size must be smaller than vocab size

     def __del__(self):
@@ -153,7 +158,7 @@ def cuda_ctc_decoder(
     tokens: Union[str, List[str]],
     nbest: int = 1,
     beam_size: int = 10,
-    blank_skip_threshold: float = _DEFAULT_SKIP_THREASHOLD,
+    blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THREASHOLD,
 ) -> CUCTCDecoder:
     """Builds an instance of :class:`CUCTCDecoder`.
@@ -163,8 +168,8 @@ def cuda_ctc_decoder(
         beam_size (int, optional): The maximum number of hypos to hold after each decode step (Default: 10)
         nbest (int): The number of best decodings to return
         blank_id (int): The token ID corresopnding to the blank symbol.
-        blank_skip_threshold (float): skip frames if log_prob(blank) > blank_skip_threshold, to speed up decoding
-            (Default: log(0.95)).
+        blank_skip_threshold (float): skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding
+            (Default: 0.95).

     Returns:
         CUCTCDecoder: decoder
@@ -172,7 +177,7 @@ def cuda_ctc_decoder(
     Example
         >>> decoder = cuda_ctc_decoder(
         >>>     vocab_file="tokens.txt",
-        >>>     blank_skip_threshold=math.log(0.95),
+        >>>     blank_skip_threshold=0.95,
         >>> )
         >>> results = decoder(log_probs, encoder_out_lens)  # List of shape (B, nbest) of Hypotheses
     """
......