"src/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "365ff8f76d9dfc2086625b1c265b920be2218863"
Commit 732c94a3 authored by Yuekai Zhang, committed by Facebook GitHub Bot

Add cuctc tutorial, change blank skip threshold into prob (#3297)

Summary:
Add a separate tutorial for cuctc.
Resolves https://github.com/pytorch/audio/issues/3096

Pull Request resolved: https://github.com/pytorch/audio/pull/3297

Reviewed By: huangruizhe

Differential Revision: D47928400

Pulled By: mthrok

fbshipit-source-id: 8c16492fb4d007b6ea7969ba77c866a51749c0ec
parent 144cfcfc
@@ -68,6 +68,7 @@ model implementations and application components.
    tutorials/speech_recognition_pipeline_tutorial
    tutorials/asr_inference_with_ctc_decoder_tutorial
+   tutorials/asr_inference_with_cuda_ctc_decoder_tutorial
    tutorials/online_asr_tutorial
    tutorials/device_asr
    tutorials/device_avsr
@@ -147,6 +148,13 @@ Tutorials
 .. customcardstart::

+   .. customcarditem::
+      :header: ASR inference with CUDA CTC Beam Search Decoder
+      :card_description: Learn how to perform ASR beam search decoding with GPU, using <code>torchaudio.models.decoder.cuda_ctc_decoder</code>.
+      :image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/asr_inference_with_ctc_decoder_tutorial.png
+      :link: tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.html
+      :tags: Pipelines,ASR,CTC-Decoder,CUDA-CTC-Decoder
+
    .. customcarditem::
       :header: On device audio-visual automatic speech recognition
       :card_description: Learn how to stream audio and video from laptop webcam and perform audio-visual automatic speech recognition using Emformer-RNNT model.
......
@@ -43,9 +43,9 @@ def run_inference(args):
         )
     else:
         assert vocabs[0] == "<blk>", "idx of blank token has to be zero"
-    blank_frame_skip_threshold = float(torch.log(torch.tensor(args.blank_skip_threshold)))
     cuda_decoder = cuda_ctc_decoder(
-        vocabs, nbest=args.nbest, beam_size=args.beam_size, blank_skip_threshold=blank_frame_skip_threshold
+        vocabs, nbest=args.nbest, beam_size=args.beam_size, blank_skip_threshold=args.blank_skip_threshold
     )
     dataset = torchaudio.datasets.LIBRISPEECH(args.librispeech_path, url=args.split, download=True)
......
"""
ASR Inference with CUDA CTC Decoder
====================================
**Author**: `Yuekai Zhang <yuekaiz@nvidia.com>`__
This tutorial shows how to perform speech recognition inference using a
CUDA-based CTC beam search decoder.
We demonstrate this on a pretrained
`Zipformer <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc>`__
model from the `Next-gen Kaldi <https://nadirapovey.com/next-gen-kaldi-what-is-it>`__ project.
"""
######################################################################
# Overview
# --------
#
# Beam search decoding works by iteratively expanding text hypotheses (beams)
# with next possible characters, and maintaining only the hypotheses with the
# highest scores at each time step.
#
# The underlying implementation uses CUDA to accelerate the whole decoding process.
# A mathematical formula for the decoder can be
# found in the `paper <https://arxiv.org/pdf/1408.2873.pdf>`__, and
# a more detailed algorithm can be found in this `blog
# <https://distill.pub/2017/ctc/>`__.
#
# Running ASR inference using a CUDA CTC Beam Search decoder
# requires the following components:
#
# - Acoustic Model: model predicting modeling units (BPE in this tutorial) from acoustic features
# - BPE Model: the byte-pair encoding (BPE) tokenizer file
#
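######################################################################
# As a toy illustration of this loop (a pure-Python sketch, not the CUDA
# implementation; it omits the CTC-specific blank handling and prefix
# merging), each hypothesis is extended with every candidate token, and only
# the ``beam_size`` highest-scoring hypotheses survive each step:
#


def toy_beam_search(frame_log_probs, beam_size):
    # maps a hypothesis (tuple of token ids) to its accumulated log-probability
    beams = {(): 0.0}
    for frame in frame_log_probs:
        candidates = {}
        for hyp, score in beams.items():
            for token, log_p in enumerate(frame):
                candidates[hyp + (token,)] = score + log_p
        # prune: keep only the ``beam_size`` highest-scoring hypotheses
        beams = dict(sorted(candidates.items(), key=lambda kv: -kv[1])[:beam_size])
    return beams


# two frames over a two-token toy vocabulary
print(toy_beam_search([[-0.1, -2.3], [-1.6, -0.2]], beam_size=2))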
######################################################################
# Acoustic Model and Set Up
# -------------------------
#
# First, we import the necessary utilities and fetch the data that we are
# working with.
#
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
import time
from pathlib import Path
import IPython
import sentencepiece as spm
from torchaudio.models.decoder import cuda_ctc_decoder
from torchaudio.utils import download_asset
######################################################################
#
# We use the pretrained
# `Zipformer <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01>`__
# model that is trained on the `LibriSpeech
# dataset <http://www.openslr.org/12>`__. The model is jointly trained with CTC and Transducer loss functions.
# In this tutorial, we only use the CTC head of the model.
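# The helper below downloads the checkpoint and caches it under the
# ``torch.hub`` directory, skipping the download if the file already exists.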
def download_asset_external(url, key):
path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key)
if not path.exists():
path.parent.mkdir(parents=True, exist_ok=True)
torch.hub.download_url_to_file(url, path)
return str(path)
url_prefix = "https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01"
model_link = f"{url_prefix}/resolve/main/exp/cpu_jit.pt"
model_path = download_asset_external(model_link, "cuda_ctc_decoder/cpu_jit.pt")
######################################################################
# We will load a sample from the LibriSpeech test-other dataset.
#
speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
waveform, sample_rate = torchaudio.load(speech_file)
assert sample_rate == 16000
IPython.display.Audio(speech_file)
######################################################################
# The transcript corresponding to this audio file is
#
# .. code-block::
#
# i really was very much afraid of showing him how much shocked i was at some parts of what he said
#
######################################################################
# Files and Data for Decoder
# --------------------------
#
# Next, we load in the tokens from the BPE model, which is the tokenizer used for decoding.
#
######################################################################
# Tokens
# ~~~~~~
#
# The tokens are the possible symbols that the acoustic model can predict,
# including the blank symbol in CTC. In this tutorial, it includes 500 BPE tokens.
# It can either be passed in as a
# file, where each line contains the token corresponding to that line's
# index, or as a list of tokens, each mapping to a unique index.
#
# .. code-block::
#
# # tokens
# <blk>
# <sos/eos>
# <unk>
# S
# _THE
# _A
# T
# _AND
# ...
#
bpe_link = f"{url_prefix}/resolve/main/data/lang_bpe_500/bpe.model"
bpe_path = download_asset_external(bpe_link, "cuda_ctc_decoder/bpe.model")
bpe_model = spm.SentencePieceProcessor()
bpe_model.load(bpe_path)
tokens = [bpe_model.id_to_piece(id) for id in range(bpe_model.get_piece_size())]
print(tokens)
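######################################################################
# Alternatively, if you already have a token file (one token per line, where
# the line number is the token's index), the same list could be built with a
# sketch like the following (``tokens.txt`` here is a hypothetical path, not
# an asset downloaded by this tutorial):
#
# .. code-block:: python
#
#    with open("tokens.txt") as f:
#        tokens = [line.strip() for line in f]
#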
######################################################################
# Construct CUDA Decoder
# ----------------------
# In this tutorial, we will construct a CUDA beam search decoder.
# The decoder can be constructed using the factory function
# :py:func:`~torchaudio.models.decoder.cuda_ctc_decoder`.
#
cuda_decoder = cuda_ctc_decoder(tokens, nbest=10, beam_size=10, blank_skip_threshold=0.95)
######################################################################
# Run Inference
# -------------
#
# Now that we have the data, acoustic model, and decoder, we can perform
# inference. The output of the beam search decoder is of type
# :py:class:`~torchaudio.models.decoder.CUCTCHypothesis`, consisting of the
# predicted token IDs, words (symbols corresponding to the token IDs), and hypothesis scores.
# Recall the transcript corresponding to the
# waveform is
#
# .. code-block::
#
# i really was very much afraid of showing him how much shocked i was at some parts of what he said
#
actual_transcript = "i really was very much afraid of showing him how much shocked i was at some parts of what he said"
actual_transcript = actual_transcript.split()
device = torch.device("cuda", 0)
acoustic_model = torch.jit.load(model_path)
acoustic_model.to(device)
acoustic_model.eval()
waveform = waveform.to(device)
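# Compute 80-dimensional Kaldi-compatible log Mel filterbank features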
feat = torchaudio.compliance.kaldi.fbank(waveform, num_mel_bins=80, snip_edges=False)
feat = feat.unsqueeze(0)
feat_lens = torch.tensor(feat.size(1), device=device).unsqueeze(0)
encoder_out, encoder_out_lens = acoustic_model.encoder(feat, feat_lens)
nnet_output = acoustic_model.ctc_output(encoder_out)
log_prob = torch.nn.functional.log_softmax(nnet_output, -1)
print(f"The shape of log_prob: {log_prob.shape}, the shape of encoder_out_lens: {encoder_out_lens.shape}")
######################################################################
# The CUDA CTC decoder gives the following result.
#
results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
beam_search_transcript = bpe_model.decode(results[0][0].tokens).lower()
beam_search_wer = torchaudio.functional.edit_distance(actual_transcript, beam_search_transcript.split()) / len(
actual_transcript
)
print(f"Transcript: {beam_search_transcript}")
print(f"WER: {beam_search_wer}")
######################################################################
# Beam Search Decoder Parameters
# ------------------------------
#
# In this section, we go a little more in depth on some of the decoder
# parameters and their tradeoffs. For the full list of customizable parameters,
# please refer to the
# :py:func:`documentation <torchaudio.models.decoder.cuda_ctc_decoder>`.
#
######################################################################
# Helper Function
# ~~~~~~~~~~~~~~~
#
def print_decoded(cuda_decoder, bpe_model, log_prob, encoder_out_lens, param, param_value):
start_time = time.monotonic()
results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
decode_time = time.monotonic() - start_time
transcript = bpe_model.decode(results[0][0].tokens).lower()
score = results[0][0].score
print(f"{param} {param_value:<3}: {transcript} (score: {score:.2f}; {decode_time:.4f} secs)")
######################################################################
# nbest
# ~~~~~
#
# This parameter indicates the number of best hypotheses to return. For
# instance, by setting ``nbest=10`` when constructing the beam search
# decoder earlier, we can now access the hypotheses with the top 10 scores.
#
for i in range(10):
transcript = bpe_model.decode(results[0][i].tokens).lower()
score = results[0][i].score
print(f"{transcript} (score: {score})")
######################################################################
# beam size
# ~~~~~~~~~
#
# The ``beam_size`` parameter determines the maximum number of best
# hypotheses to hold after each decoding step. Using larger beam sizes
# allows exploring a larger range of possible hypotheses, which can
# produce hypotheses with higher scores, but gains taper off beyond a certain point.
# We recommend setting ``beam_size=10`` for the CUDA beam search decoder.
#
# In the example below, we see improvement in decoding quality as we
# increase beam size from 1 to 3, but notice how using a beam size
# of 3 provides the same output as beam size 10.
#
beam_sizes = [1, 2, 3, 10]
for beam_size in beam_sizes:
beam_search_decoder = cuda_ctc_decoder(
tokens,
nbest=1,
beam_size=beam_size,
blank_skip_threshold=0.95,
)
print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "beam size", beam_size)
######################################################################
# blank skip threshold
# ~~~~~~~~~~~~~~~~~~~~
#
# The ``blank_skip_threshold`` parameter is used to prune frames that have a high blank probability.
# Pruning these frames with a well-chosen ``blank_skip_threshold`` speeds up the decoding
# process significantly with no accuracy drop.
# Per the rule of CTC, we keep at least one blank frame between two non-blank frames
# to avoid mistakenly merging two consecutive identical symbols.
# We recommend setting ``blank_skip_threshold=0.95`` for the CUDA beam search decoder.
#
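# As a rough illustration on our sample utterance (assuming the blank token
# sits at index 0, consistent with the token list above), we can measure the
# fraction of frames whose blank probability exceeds a threshold of 0.95,
# i.e. the frames the decoder may skip:
skippable_ratio = (log_prob[0, :, 0] > torch.tensor(0.95).log()).float().mean().item()
print(f"fraction of frames with blank probability above 0.95: {skippable_ratio:.2%}")
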
blank_skip_probs = [0.25, 0.95, 1.0]
for blank_skip_prob in blank_skip_probs:
beam_search_decoder = cuda_ctc_decoder(
tokens,
nbest=10,
beam_size=10,
blank_skip_threshold=blank_skip_prob,
)
print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "blank_skip_threshold", blank_skip_prob)
del cuda_decoder
######################################################################
# Benchmark with flashlight CPU decoder
# -------------------------------------
# We benchmark the throughput and accuracy of the CUDA decoder against the flashlight CPU decoder using the LibriSpeech test_other set.
# To reproduce the benchmark results below, you may refer to the example `here <https://github.com/pytorch/audio/tree/main/examples/asr/librispeech_cuda_ctc_decoder>`__.
#
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | Decoder | Setting | WER (%) | N-Best Oracle WER (%) | Decoder Cost Time (seconds) |
# +==============+==========================================+=========+=======================+=============================+
# | CUDA decoder | blank_skip_threshold 0.95 | 5.81 | 4.11 | 2.57 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CUDA decoder | blank_skip_threshold 1.0 (no frame-skip) | 5.81 | 4.09 | 6.24 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder | beam_size_token 10 | 5.86 | 4.30 | 28.61 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder | beam_size_token 500 | 5.86 | 4.30 | 791.80 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
#
# From the above table, the CUDA decoder gives a slight improvement in WER and a significant increase in throughput.
@@ -36,7 +36,7 @@ class CUCTCHypothesis(NamedTuple):
     """Score corresponding to hypothesis"""

-_DEFAULT_SKIP_THREASHOLD = math.log(0.95)
+_DEFAULT_BLANK_SKIP_THREASHOLD = 0.95


 class CUCTCDecoder:
@@ -54,17 +54,18 @@ class CUCTCDecoder:
         blank_id: int = 0,
         beam_size: int = 10,
         nbest: int = 1,
-        blank_skip_threshold: float = _DEFAULT_SKIP_THREASHOLD,
+        blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THREASHOLD,
         cuda_stream: torch.cuda.streams.Stream = None,
     ):
         """
         Args:
-            blank_id (int): token id corresopnding to blank (Default: 0)
+            blank_id (int): token id corresopnding to blank, only support 0 for now. (Default: 0)
             vocab_list (List[str]): list of vocabulary tokens
             beam_size (int, optional): max number of hypos to hold after each decode step (Default: 10)
             nbest (int): number of best decodings to return
-            blank_skip_threshold (float): skip frames if log_prob(blank) > blank_skip_threshold, to speed up decoding.
-                (Default: log(0.95)).
+            blank_skip_threshold (float):
+                skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding.
+                (Default: 0.95).
             cuda_stream (torch.cuda.streams.Stream): using assigned cuda stream (Default: using default stream)
         """
@@ -74,11 +75,15 @@ class CUCTCDecoder:
         cuda_stream_ = cuda_stream.cuda_stream if cuda_stream else torch.cuda.current_stream().cuda_stream
         self.internal_data = cuctc.prefixCTC_alloc(cuda_stream_)
         self.memory = torch.empty(0, dtype=torch.int8, device=torch.device("cuda"))
-        self.blank_id = 0  # blank id has to be zero
+        if blank_id != 0:
+            raise AssertionError("blank_id must be 0")
+        self.blank_id = blank_id
         self.vocab_list = vocab_list
         self.space_id = 0
         self.nbest = nbest
-        self.blank_skip_threshold = blank_skip_threshold
+        if not (blank_skip_threshold >= 0 and blank_skip_threshold <= 1):
+            raise AssertionError("blank_skip_threshold must be between 0 and 1")
+        self.blank_skip_threshold = math.log(blank_skip_threshold)
         self.beam_size = min(beam_size, len(vocab_list))  # beam size must be smaller than vocab size

     def __del__(self):
@@ -153,7 +158,7 @@ def cuda_ctc_decoder(
     tokens: Union[str, List[str]],
     nbest: int = 1,
     beam_size: int = 10,
-    blank_skip_threshold: float = _DEFAULT_SKIP_THREASHOLD,
+    blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THREASHOLD,
 ) -> CUCTCDecoder:
     """Builds an instance of :class:`CUCTCDecoder`.
@@ -163,8 +168,8 @@ def cuda_ctc_decoder(
         beam_size (int, optional): The maximum number of hypos to hold after each decode step (Default: 10)
         nbest (int): The number of best decodings to return
         blank_id (int): The token ID corresopnding to the blank symbol.
-        blank_skip_threshold (float): skip frames if log_prob(blank) > blank_skip_threshold, to speed up decoding
-            (Default: log(0.95)).
+        blank_skip_threshold (float): skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding
+            (Default: 0.95).

     Returns:
         CUCTCDecoder: decoder
@@ -172,7 +177,7 @@ def cuda_ctc_decoder(
     Example
         >>> decoder = cuda_ctc_decoder(
         >>>     vocab_file="tokens.txt",
-        >>>     blank_skip_threshold=math.log(0.95),
+        >>>     blank_skip_threshold=0.95,
         >>> )
         >>> results = decoder(log_probs, encoder_out_lens)  # List of shape (B, nbest) of Hypotheses
     """
......