OpenDAS / Torchaudio / Commits / ffeba11a

Commit ffeba11a authored Sep 02, 2024 by mayp777

UPDATE

Parent: 29deb085
Changes: 337 files
Showing 20 changed files with 2825 additions and 471 deletions (+2825, -471)
Changed files shown in this commit (additions / deletions):

examples/self_supervised_learning/losses/__init__.py                    +6    -0
examples/self_supervised_learning/losses/_hubert_loss.py                +47   -0
examples/self_supervised_learning/losses/_wav2vec2_loss.py              +80   -0
examples/self_supervised_learning/lr_schedulers/__init__.py             +5    -0
examples/self_supervised_learning/lr_schedulers/_linear_decay.py        +27   -0
examples/self_supervised_learning/train_hubert.py                       +316  -0
examples/source_separation/eval.py                                      +2    -2
examples/source_separation/lightning_train.py                           +3    -2
examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py           +75   -18
examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py      +311  -0
examples/tutorials/audio_data_augmentation_tutorial.py                  +125  -163
examples/tutorials/audio_datasets_tutorial.py                           +26   -46
examples/tutorials/audio_feature_augmentation_tutorial.py               +66   -47
examples/tutorials/audio_feature_extractions_tutorial.py                +151  -122
examples/tutorials/audio_io_tutorial.py                                 +33   -14
examples/tutorials/audio_resampling_tutorial.py                         +54   -39
examples/tutorials/ctc_forced_alignment_api_tutorial.py                 +517  -0
examples/tutorials/device_asr.py                                        +15   -18
examples/tutorials/effector_tutorial.py                                 +366  -0
examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py   +600  -0
examples/self_supervised_learning/losses/__init__.py (new file, mode 100644)

from ._hubert_loss import hubert_loss

__all__ = [
    "hubert_loss",
    "wav2vec2_loss",
]
examples/self_supervised_learning/losses/_hubert_loss.py (new file, mode 100644)

from typing import Optional, Tuple

import torch
import torch.nn.functional as F
from torch import Tensor


def hubert_loss(
    logit_m: Optional[Tensor],
    logit_u: Optional[Tensor],
    feature_penalty: Tensor,
    label: Optional[Tensor] = None,
    masked_weight: float = 1.0,
    unmasked_weight: float = 0.0,
    feature_weight: float = 10.0,
    reduction: str = "sum",
) -> Tuple[Tensor, float]:
    """Compute the cross-entropy loss on HuBERT masked and non-masked logits.

    Args:
        logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`.
        logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`.
        feature_penalty (Tensor): The feature mean value for additional penalty loss.
        masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``).
        unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``).
        feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``).
        reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``).

    Returns:
        (Tensor, float)
        Tensor: The desired loss Tensor.
        float: Number of frames used in loss computation.
    """
    num_frame = 0.0
    loss = 0.0
    if logit_m is not None:
        target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device)
        loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction)
        loss += loss_m * masked_weight
        num_frame += logit_m.shape[0]
    if logit_u is not None:
        target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_u.device)
        loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction)
        loss += loss_u * unmasked_weight
        num_frame += logit_u.shape[0]
    loss += feature_penalty * feature_weight * num_frame
    return loss, num_frame
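A minimal usage sketch (not part of the commit): exercising hubert_loss from the file above with random logits. The shapes and the dummy feature penalty below are illustrative assumptions only.

    # Illustrative only: dummy masked/unmasked logits and a dummy feature penalty.
    import torch

    logit_m = torch.randn(120, 504)       # (masked_frames, final_dim)
    logit_u = torch.randn(380, 504)       # (unmasked_frames, final_dim)
    feature_penalty = torch.tensor(0.01)  # mean feature value used as penalty

    loss, num_frame = hubert_loss(logit_m, logit_u, feature_penalty)
    print(loss.item(), num_frame)         # scalar loss, 500 frames in this example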
examples/self_supervised_learning/losses/_wav2vec2_loss.py (new file, mode 100644)

from typing import Tuple

import torch
import torch.nn.functional as F
from torch import Tensor


def compute_contrastive_loss(
    x: Tensor,
    mask_indices: Tensor,
    targets: Tensor,
    neg_is_pos: Tensor,
    reduction: str = "none",
    logit_temp: float = 0.1,
):
    """Computes the contrastive loss used in the Wav2Vec2 loss function.

    Args:
        x (Tensor): Input embeddings of shape `(batch_size, sequence_length, hidden_size)`.
        mask_indices (Tensor): Indices to mask negative samples of shape `(batch_size, sequence_length)`.
        targets (Tensor): Labels indicating positive samples.
            Tensor of shape `(num_negative + 1, batch, sequence_length, hidden_size)`.
        neg_is_pos (Tensor): Boolean tensor indicating whether negative samples should be treated as positives.
            Tensor of shape `(batch, sequence_length)`.
        reduction (str): Reduction type ("sum" or "none").
        logit_temp (float, optional): Temperature scaling factor for logits, defaults to 0.1.

    Returns:
        The computed contrastive loss and sample size.
    """
    x = x[mask_indices].view(x.size(0), -1, x.size(-1)).unsqueeze(0).expand(targets.shape)
    logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).float()
    logits /= logit_temp
    if neg_is_pos.any():
        logits[1:][neg_is_pos] = float("-inf")
    target = logits.new_zeros(logits.size(1) * logits.size(2), dtype=torch.long, device=logits.device)
    logits = logits.transpose(0, 2)
    logits = logits.reshape(-1, logits.size(-1))
    loss = F.cross_entropy(
        logits,
        target,
        reduction=reduction,
    )
    sample_size = target.numel()
    return loss, sample_size


def wav2vec2_loss(
    x: Tensor,
    mask_indices: Tensor,
    positives: Tensor,
    negatives: Tensor,
    reduction: str = "none",
) -> Tuple[Tensor, float]:
    """Compute Wav2Vec2 loss.

    Args:
        x (Tensor): The masked sequences of the Wav2Vec 2.0 model.
            Tensor of shape `(batch_size, sequence_length, hidden_size)`.
        mask_indices (Tensor): The mask indices. Tensor of shape `(batch_size, sequence_length)`.
        positives (Tensor): The positives, prior to negative sampling.
            Tensor of shape `(batch_size, masked_sequence_length, hidden_size)`.
        negatives (Tensor): The negative samples.
            Tensor of shape `(num_negative, batch_size, masked_sequence_length, hidden_size)`.
        reduction (str): Use "sum" as reduction for cross-entropy loss (Default: ``"none"``).

    Returns:
        (Tensor, float)
        Tensor: The desired loss Tensor.
        float: Sample size according to mask_indices.
    """
    assert positives is not None
    assert mask_indices is not None
    assert mask_indices.sum() == positives.shape[0] * positives.shape[1]

    neg_is_pos = (positives == negatives).all(-1)
    positives = positives.unsqueeze(0)
    targets = torch.cat([positives, negatives], dim=0)
    loss, sample_size = compute_contrastive_loss(x, mask_indices, targets, neg_is_pos, reduction)
    return loss, sample_size
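A minimal usage sketch (not part of the commit) for wav2vec2_loss, with random embeddings and the same number of masked frames per sequence, as the assertion in the function requires. All shapes below are illustrative assumptions.

    # Illustrative only: random embeddings with 4 masked frames per sequence.
    import torch

    batch, seq, hidden, num_neg, masked = 2, 10, 16, 5, 4
    x = torch.randn(batch, seq, hidden)
    mask_indices = torch.zeros(batch, seq, dtype=torch.bool)
    mask_indices[:, :masked] = True                 # same number of masked frames per sequence
    positives = torch.randn(batch, masked, hidden)
    negatives = torch.randn(num_neg, batch, masked, hidden)

    loss, sample_size = wav2vec2_loss(x, mask_indices, positives, negatives, reduction="sum")
    print(loss.item(), sample_size)                 # sample_size == batch * masked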
examples/self_supervised_learning/lr_schedulers/__init__.py (new file, mode 100644)

from ._linear_decay import LinearDecayLRScheduler

__all__ = [
    "LinearDecayLRScheduler",
]
examples/self_supervised_learning/lr_schedulers/_linear_decay.py (new file, mode 100644)

import torch
from torch.optim.optimizer import Optimizer


class LinearDecayLRScheduler(torch.optim.lr_scheduler._LRScheduler):
    """Linear learning rate scheduler with warm up."""

    def __init__(
        self,
        optimizer: Optimizer,
        warmup_updates: int,
        max_updates: int,
        last_epoch: int = -1,
        verbose: bool = False,
    ):
        self.warmup_updates = warmup_updates
        self.max_updates = max_updates
        super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose)

    def get_lr(self):
        if self._step_count <= self.warmup_updates:
            return [self._step_count / self.warmup_updates * base_lr for base_lr in self.base_lrs]
        elif self._step_count >= self.max_updates:
            return [0.0 for _ in self.base_lrs]
        else:
            pct_remaining = (self.max_updates - self._step_count) / (self.max_updates - self.warmup_updates)
            return [base_lr * pct_remaining for base_lr in self.base_lrs]
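A minimal usage sketch (not part of the commit) showing the warm-up and decay behavior of LinearDecayLRScheduler on a throwaway optimizer; the tiny step counts are illustrative assumptions.

    # Illustrative only: warm up for 3 steps, decay to zero by step 10.
    import torch

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=1.0)
    scheduler = LinearDecayLRScheduler(optimizer, warmup_updates=3, max_updates=10)

    for step in range(12):
        optimizer.step()
        scheduler.step()
        print(step, scheduler.get_last_lr())   # ramps to 1.0, then decays linearly to 0.0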
examples/self_supervised_learning/train_hubert.py (new file, mode 100644)

import logging
import pathlib
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, RawDescriptionHelpFormatter
from functools import partial
from typing import Dict, Tuple

import torch
import torchaudio.models
from lightning.pytorch import seed_everything, Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

from .data_modules import HuBERTDataModule
from .lightning_modules import SSLPretrainModule
from .losses import hubert_loss
from .lr_schedulers import LinearDecayLRScheduler


class _Formatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
    # To use ArgumentDefaultsHelpFormatter as the formatter_class and
    # RawDescriptionHelpFormatter to add custom formatting to description or epilog.
    # Check: https://stackoverflow.com/a/18462760
    pass


def _compute_accuracy(logits: torch.Tensor):
    with torch.no_grad():
        max = logits.argmax(-1) == 0
        min = logits.argmin(-1) == 0
        both = max & min
        corr = max.long().sum().item() - both.long().sum().item()
        count = max.numel()
    return corr / count


class HuBERTModule(SSLPretrainModule):
    def configure_optimizers(self):
        return (
            [self.optimizer],
            [
                {
                    "scheduler": self.lr_scheduler,
                    "interval": "step",
                },
            ],
        )

    def log_metric(self, batch: Dict, output: Tuple, loss: torch.Tensor, step_type: str):
        logit_m, logit_u, _ = output
        self.log(
            f"{step_type}_loss",
            loss.item(),
            on_step=True,
            on_epoch=True,
        )
        acc_m = _compute_accuracy(logit_m)
        acc_u = _compute_accuracy(logit_u)
        self.log(
            f"{step_type}_acc_m",
            acc_m,
            on_step=True,
            on_epoch=True,
            sync_dist=True,
            prog_bar=step_type == "train",
        )
        self.log(
            f"{step_type}_acc_u",
            acc_u,
            on_step=True,
            on_epoch=True,
            sync_dist=True,
            prog_bar=step_type == "train",
        )


def run_train(args):
    seed_everything(1337)
    checkpoint_dir = args.exp_dir / f"checkpoints_{args.dataset}_{args.model_name}"
    checkpoint = ModelCheckpoint(
        checkpoint_dir,
        monitor="val_loss",
        mode="min",
        save_top_k=5,
        save_weights_only=False,
        verbose=True,
    )
    train_checkpoint = ModelCheckpoint(
        checkpoint_dir,
        monitor="train_loss",
        mode="min",
        save_top_k=5,
        save_weights_only=False,
        verbose=True,
    )
    callbacks = [
        checkpoint,
        train_checkpoint,
    ]
    trainer = Trainer(
        default_root_dir=args.exp_dir,
        max_steps=args.max_updates,
        num_nodes=args.num_nodes,
        devices=args.gpus,
        accelerator="gpu",
        strategy="ddp_find_unused_parameters_true",
        precision=args.precision,
        accumulate_grad_batches=args.accumulate_grad_batches,
        gradient_clip_val=args.clip_norm,
        use_distributed_sampler=False,
        callbacks=callbacks,
        reload_dataloaders_every_n_epochs=1,
    )
    if args.model_name not in ["hubert_pretrain_base", "hubert_pretrain_large", "hubert_pretrain_xlarge"]:
        raise ValueError(
            "Expect model_name to be one of 'hubert_pretrain_base', 'hubert_pretrain_large', 'hubert_pretrain_xlarge'."
            f" Found {args.model_name}."
        )
    model = getattr(torchaudio.models, args.model_name)()
    loss_fn = partial(
        hubert_loss,
        masked_weight=args.masked_weight,
        unmasked_weight=args.unmasked_weight,
        feature_weight=args.feature_weight,
    )
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
        betas=args.betas,
        eps=args.eps,
        weight_decay=args.weight_decay,
    )
    lr_scheduler = LinearDecayLRScheduler(optimizer, args.warmup_updates, args.max_updates)
    lightning_module = HuBERTModule(
        model,
        loss_fn,
        optimizer,
        lr_scheduler,
    )
    data_module = HuBERTDataModule(
        dataset_path=args.dataset_path,
        dataset="librispeech",
        feature_type="mfcc",
        seconds_per_batch=args.seconds_per_batch,
        train_shuffle=True,
        num_workers=10,
    )
    trainer.fit(lightning_module, datamodule=data_module)


def _parse_args():
    parser = ArgumentParser(
        description=__doc__,
        formatter_class=_Formatter,
    )
    parser.add_argument(
        "--dataset-path",
        type=pathlib.Path,
        required=True,
        help="Path to the feature and label directories.",
    )
    parser.add_argument(
        "--resume-checkpoint",
        type=pathlib.Path,
        default=None,
        help="Path to the checkpoint to resume training from. (Default: None)",
    )
    parser.add_argument(
        "--feature-type",
        choices=["mfcc", "hubert"],
        type=str,
        required=True,
    )
    parser.add_argument(
        "--feature-grad-mult",
        default=0.1,
        type=float,
        help="The scaling factor to multiply the feature extractor gradient. (Default: 0.1)",
    )
    parser.add_argument(
        "--num-classes",
        choices=[100, 500],
        type=int,
        required=True,
        help="The ``num_class`` when building the hubert_pretrain_base model.",
    )
    parser.add_argument(
        "--model-name",
        default="hubert_pretrain_base",
        choices=[
            "hubert_pretrain_base",
            "hubert_pretrain_large",
            "hubert_pretrain_xlarge",
        ],
        type=str,
        help="The HuBERT model to train. (Default: 'hubert_pretrain_base')",
    )
    parser.add_argument(
        "--exp-dir",
        default=pathlib.Path("./exp"),
        type=pathlib.Path,
        help="Directory to save checkpoints and logs to. (Default: './exp')",
    )
    parser.add_argument(
        "--dataset",
        default="librispeech",
        choices=["librispeech", "librilight"],
        type=str,
        help="The dataset for training. (Default: 'librispeech')",
    )
    parser.add_argument(
        "--learning-rate",
        default=0.0005,
        type=float,
        help="The peak learning rate. (Default: 0.0005)",
    )
    parser.add_argument(
        "--betas",
        default=(0.9, 0.98),
        type=Tuple,
        help="The coefficients for computing running averages of gradient and its square. (Default: (0.9, 0.98))",
    )
    parser.add_argument(
        "--eps",
        default=1e-6,
        type=float,
        help="Epsilon value in Adam optimizer. (Default: 1e-6)",
    )
    parser.add_argument(
        "--weight-decay",
        default=0.01,
        type=float,
        help="Weight decay (L2 penalty). (Default: 0.01)",
    )
    parser.add_argument(
        "--precision",
        default=16,
        choices=[16, 32, 64, "bf16"],
        help="Precision of model training. (Default: 16)",
    )
    parser.add_argument(
        "--accumulate-grad-batches",
        default=1,
        type=int,
        help="Number of steps for accumulating gradients. (Default: 1)",
    )
    parser.add_argument(
        "--clip-norm",
        default=10.0,
        type=float,
        help="The gradient norm value to clip. (Default: 10.0)",
    )
    parser.add_argument(
        "--num-nodes",
        default=4,
        type=int,
        help="Number of nodes to use for training. (Default: 4)",
    )
    parser.add_argument(
        "--gpus",
        default=8,
        type=int,
        help="Number of GPUs per node to use for training. (Default: 8)",
    )
    parser.add_argument(
        "--warmup-updates",
        default=32000,
        type=int,
        help="Number of steps for warming up the learning rate. (Default: 32000)",
    )
    parser.add_argument(
        "--max-updates",
        default=250000,
        type=int,
        help="Total number of training steps. (Default: 250000)",
    )
    parser.add_argument(
        "--seconds-per-batch",
        default=87.5,
        type=float,
        help="Number of seconds of audio in a mini-batch. (Default: 87.5)",
    )
    parser.add_argument(
        "--masked-weight",
        default=1.0,
        type=float,
        help="The weight for cross-entropy loss of masked frames. (Default: 1.0)",
    )
    parser.add_argument(
        "--unmasked-weight",
        default=0.0,
        type=float,
        help="The weight for cross-entropy loss of unmasked frames. (Default: 0.0)",
    )
    parser.add_argument(
        "--feature-weight",
        default=10.0,
        type=float,
        help="The weight for feature penalty loss. (Default: 10.0)",
    )
    parser.add_argument("--debug", action="store_true", help="Whether to use debug level for logging.")
    return parser.parse_args()


def _init_logger(debug):
    fmt = "%(asctime)s %(message)s" if debug else "%(message)s"
    level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(format=fmt, level=level, datefmt="%Y-%m-%d %H:%M:%S")


def cli_main():
    args = _parse_args()
    _init_logger(args.debug)
    run_train(args)


if __name__ == "__main__":
    cli_main()
examples/source_separation/eval.py

@@ -31,7 +31,7 @@ def _eval(model, data_loader, device):
 def cli_main():
     parser = ArgumentParser()
-    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0-mix", "librimix"])
+    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0mix", "librimix"])
     parser.add_argument(
         "--root-dir",
         type=Path,
...
@@ -79,7 +79,7 @@ def cli_main():
     _, _, eval_loader = _get_dataloader(
         args.dataset,
-        args.data_dir,
+        args.root_dir,
         args.num_speakers,
         args.sample_rate,
         1,  # batch size is set to 1 to avoid masking
...
examples/source_separation/lightning_train.py

@@ -308,7 +308,7 @@ def _get_dataloader(
 def cli_main():
     parser = ArgumentParser()
     parser.add_argument("--batch-size", default=6, type=int)
-    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0-mix", "librimix"])
+    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0mix", "librimix"])
     parser.add_argument(
         "--root-dir",
         type=Path,
...
@@ -412,9 +412,10 @@ def cli_main():
     trainer = Trainer(
         default_root_dir=args.exp_dir,
         max_epochs=args.epochs,
-        gpus=args.num_gpu,
         num_nodes=args.num_node,
+        accelerator="gpu",
         strategy="ddp_find_unused_parameters_false",
+        devices=args.num_gpu,
         limit_train_batches=1.0,  # Useful for fast experiment
         gradient_clip_val=5.0,
         callbacks=callbacks,
...
examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py

@@ -207,6 +207,7 @@ from torchaudio.models.decoder import CTCDecoderLM, CTCDecoderLMState
 class CustomLM(CTCDecoderLM):
     """Create a Python wrapper around `language_model` to feed to the decoder."""
+
     def __init__(self, language_model: torch.nn.Module):
         CTCDecoderLM.__init__(self)
         self.language_model = language_model
...
@@ -386,6 +387,47 @@ print(f"WER: {beam_search_wer}")
 # and “shoktd”.
 #
+
+######################################################################
+# Incremental decoding
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# If the input speech is long, one can decode the emission in
+# an incremental manner.
+#
+# You need to first initialize the internal state of the decoder with
+# :py:meth:`~torchaudio.models.decoder.CTCDecoder.decode_begin`.
+
+beam_search_decoder.decode_begin()
+
+######################################################################
+# Then, you can pass emissions to
+# :py:meth:`~torchaudio.models.decoder.CTCDecoder.decode_step`.
+# Here we use the same emission but pass it to the decoder one frame
+# at a time.
+
+for t in range(emission.size(1)):
+    beam_search_decoder.decode_step(emission[0, t : t + 1, :])
+
+######################################################################
+# Finally, finalize the internal state of the decoder, and retrieve the
+# result.
+
+beam_search_decoder.decode_end()
+beam_search_result_inc = beam_search_decoder.get_final_hypothesis()
+
+######################################################################
+# The result of incremental decoding is identical to batch decoding.
+#
+
+beam_search_transcript_inc = " ".join(beam_search_result_inc[0].words).strip()
+beam_search_wer_inc = torchaudio.functional.edit_distance(
+    actual_transcript, beam_search_result_inc[0].words
+) / len(actual_transcript)
+print(f"Transcript: {beam_search_transcript_inc}")
+print(f"WER: {beam_search_wer_inc}")
+
+assert beam_search_result[0][0].words == beam_search_result_inc[0].words
+assert beam_search_result[0][0].score == beam_search_result_inc[0].score
+torch.testing.assert_close(beam_search_result[0][0].timesteps, beam_search_result_inc[0].timesteps)

 ######################################################################
 # Timestep Alignments
...
@@ -406,30 +448,45 @@ print(timesteps, timesteps.shape[0])
 #
-def plot_alignments(waveform, emission, tokens, timesteps):
-    fig, ax = plt.subplots(figsize=(32, 10))
-
-    ax.plot(waveform)
-
-    ratio = waveform.shape[0] / emission.shape[1]
-    word_start = 0
-
-    for i in range(len(tokens)):
-        if i != 0 and tokens[i - 1] == "|":
-            word_start = timesteps[i]
-        if tokens[i] != "|":
-            plt.annotate(tokens[i].upper(), (timesteps[i] * ratio, waveform.max() * 1.02), size=14)
-        elif i != 0:
-            word_end = timesteps[i]
-            ax.axvspan(word_start * ratio, word_end * ratio, alpha=0.1, color="red")
-
-    xticks = ax.get_xticks()
-    plt.xticks(xticks, xticks / bundle.sample_rate)
-    ax.set_xlabel("time (sec)")
-    ax.set_xlim(0, waveform.shape[0])
-
-
-plot_alignments(waveform[0], emission, predicted_tokens, timesteps)
+def plot_alignments(waveform, emission, tokens, timesteps, sample_rate):
+    t = torch.arange(waveform.size(0)) / sample_rate
+    ratio = waveform.size(0) / emission.size(1) / sample_rate
+
+    chars = []
+    words = []
+    word_start = None
+    for token, timestep in zip(tokens, timesteps * ratio):
+        if token == "|":
+            if word_start is not None:
+                words.append((word_start, timestep))
+            word_start = None
+        else:
+            chars.append((token, timestep))
+            if word_start is None:
+                word_start = timestep
+
+    fig, axes = plt.subplots(3, 1)
+
+    def _plot(ax, xlim):
+        ax.plot(t, waveform)
+        for token, timestep in chars:
+            ax.annotate(token.upper(), (timestep, 0.5))
+        for word_start, word_end in words:
+            ax.axvspan(word_start, word_end, alpha=0.1, color="red")
+        ax.set_ylim(-0.6, 0.7)
+        ax.set_yticks([0])
+        ax.grid(True, axis="y")
+        ax.set_xlim(xlim)
+
+    _plot(axes[0], (0.3, 2.5))
+    _plot(axes[1], (2.5, 4.7))
+    _plot(axes[2], (4.7, 6.9))
+    axes[2].set_xlabel("time (sec)")
+    fig.tight_layout()
+
+
+plot_alignments(waveform[0], emission, predicted_tokens, timesteps, bundle.sample_rate)

 ######################################################################
...
examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py (new file, mode 100644)

"""
ASR Inference with CUDA CTC Decoder
====================================

**Author**: `Yuekai Zhang <yuekaiz@nvidia.com>`__

This tutorial shows how to perform speech recognition inference using a
CUDA-based CTC beam search decoder.
We demonstrate this on a pretrained
`Zipformer <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc>`__
model from `Next-gen Kaldi <https://nadirapovey.com/next-gen-kaldi-what-is-it>`__ project.
"""

######################################################################
# Overview
# --------
#
# Beam search decoding works by iteratively expanding text hypotheses (beams)
# with next possible characters, and maintaining only the hypotheses with the
# highest scores at each time step.
#
# The underlying implementation uses CUDA to accelerate the whole decoding process.
# A mathematical formula for the decoder can be
# found in the `paper <https://arxiv.org/pdf/1408.2873.pdf>`__, and
# a more detailed algorithm can be found in this `blog
# <https://distill.pub/2017/ctc/>`__.
#
# Running ASR inference using a CUDA CTC Beam Search decoder
# requires the following components
#
# -  Acoustic Model: model predicting modeling units (BPE in this tutorial) from acoustic features
# -  BPE Model: the byte-pair encoding (BPE) tokenizer file
#

######################################################################
# Acoustic Model and Set Up
# -------------------------
#
# First we import the necessary utilities and fetch the data that we are
# working with
#

import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#

import time
from pathlib import Path

import IPython
import sentencepiece as spm
from torchaudio.models.decoder import cuda_ctc_decoder
from torchaudio.utils import download_asset

######################################################################
#
# We use the pretrained
# `Zipformer <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01>`__
# model that is trained on the `LibriSpeech
# dataset <http://www.openslr.org/12>`__. The model is jointly trained with CTC and Transducer loss functions.
# In this tutorial, we only use the CTC head of the model.


def download_asset_external(url, key):
    path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key)
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        torch.hub.download_url_to_file(url, path)
    return str(path)


url_prefix = "https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01"
model_link = f"{url_prefix}/resolve/main/exp/cpu_jit.pt"
model_path = download_asset_external(model_link, "cuda_ctc_decoder/cpu_jit.pt")

######################################################################
# We will load a sample from the LibriSpeech test-other dataset.
#

speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
waveform, sample_rate = torchaudio.load(speech_file)
assert sample_rate == 16000
IPython.display.Audio(speech_file)

######################################################################
# The transcript corresponding to this audio file is
#
# .. code-block::
#
#    i really was very much afraid of showing him how much shocked i was at some parts of what he said
#

######################################################################
# Files and Data for Decoder
# --------------------------
#
# Next, we load in our tokens from the BPE model, which is the tokenizer for decoding.
#

######################################################################
# Tokens
# ~~~~~~
#
# The tokens are the possible symbols that the acoustic model can predict,
# including the blank symbol in CTC. In this tutorial, it includes 500 BPE tokens.
# It can either be passed in as a
# file, where each line consists of the tokens corresponding to the same
# index, or as a list of tokens, each mapping to a unique index.
#
# .. code-block::
#
#    # tokens
#    <blk>
#    <sos/eos>
#    <unk>
#    S
#    _THE
#    _A
#    T
#    _AND
#    ...
#

bpe_link = f"{url_prefix}/resolve/main/data/lang_bpe_500/bpe.model"
bpe_path = download_asset_external(bpe_link, "cuda_ctc_decoder/bpe.model")

bpe_model = spm.SentencePieceProcessor()
bpe_model.load(bpe_path)
tokens = [bpe_model.id_to_piece(id) for id in range(bpe_model.get_piece_size())]
print(tokens)

######################################################################
# Construct CUDA Decoder
# ----------------------
# In this tutorial, we will construct a CUDA beam search decoder.
# The decoder can be constructed using the factory function
# :py:func:`~torchaudio.models.decoder.cuda_ctc_decoder`.
#

cuda_decoder = cuda_ctc_decoder(tokens, nbest=10, beam_size=10, blank_skip_threshold=0.95)

######################################################################
# Run Inference
# -------------
#
# Now that we have the data, acoustic model, and decoder, we can perform
# inference. The output of the beam search decoder is of type
# :py:class:`~torchaudio.models.decoder.CUCTCHypothesis`, consisting of the
# predicted token IDs, words (symbols corresponding to the token IDs), and hypothesis scores.
# Recall the transcript corresponding to the
# waveform is
#
# .. code-block::
#
#    i really was very much afraid of showing him how much shocked i was at some parts of what he said
#

actual_transcript = "i really was very much afraid of showing him how much shocked i was at some parts of what he said"
actual_transcript = actual_transcript.split()

device = torch.device("cuda", 0)
acoustic_model = torch.jit.load(model_path)
acoustic_model.to(device)
acoustic_model.eval()

waveform = waveform.to(device)

feat = torchaudio.compliance.kaldi.fbank(waveform, num_mel_bins=80, snip_edges=False)
feat = feat.unsqueeze(0)
feat_lens = torch.tensor(feat.size(1), device=device).unsqueeze(0)

encoder_out, encoder_out_lens = acoustic_model.encoder(feat, feat_lens)
nnet_output = acoustic_model.ctc_output(encoder_out)
log_prob = torch.nn.functional.log_softmax(nnet_output, -1)

print(f"The shape of log_prob: {log_prob.shape}, the shape of encoder_out_lens: {encoder_out_lens.shape}")

######################################################################
# The CUDA CTC decoder gives the following result.
#

results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
beam_search_transcript = bpe_model.decode(results[0][0].tokens).lower()
beam_search_wer = torchaudio.functional.edit_distance(actual_transcript, beam_search_transcript.split()) / len(
    actual_transcript
)

print(f"Transcript: {beam_search_transcript}")
print(f"WER: {beam_search_wer}")

######################################################################
# Beam Search Decoder Parameters
# ------------------------------
#
# In this section, we go a little bit more in depth about some different
# parameters and tradeoffs. For the full list of customizable parameters,
# please refer to the
# :py:func:`documentation <torchaudio.models.decoder.cuda_ctc_decoder>`.
#

######################################################################
# Helper Function
# ~~~~~~~~~~~~~~~
#


def print_decoded(cuda_decoder, bpe_model, log_prob, encoder_out_lens, param, param_value):
    start_time = time.monotonic()
    results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
    decode_time = time.monotonic() - start_time
    transcript = bpe_model.decode(results[0][0].tokens).lower()
    score = results[0][0].score
    print(f"{param} {param_value:<3}: {transcript} (score: {score:.2f}; {decode_time:.4f} secs)")


######################################################################
# nbest
# ~~~~~
#
# This parameter indicates the number of best hypotheses to return. For
# instance, by setting ``nbest=10`` when constructing the beam search
# decoder earlier, we can now access the hypotheses with the top 10 scores.
#

for i in range(10):
    transcript = bpe_model.decode(results[0][i].tokens).lower()
    score = results[0][i].score
    print(f"{transcript} (score: {score})")

######################################################################
# beam size
# ~~~~~~~~~
#
# The ``beam_size`` parameter determines the maximum number of best
# hypotheses to hold after each decoding step. Using larger beam sizes
# allows for exploring a larger range of possible hypotheses which can
# produce hypotheses with higher scores, but it does not provide additional gains beyond a certain point.
# We recommend setting ``beam_size=10`` for the CUDA beam search decoder.
#
# In the example below, we see improvement in decoding quality as we
# increase the beam size from 1 to 3, but notice how using a beam size
# of 3 provides the same output as beam size 10.
#

beam_sizes = [1, 2, 3, 10]

for beam_size in beam_sizes:
    beam_search_decoder = cuda_ctc_decoder(
        tokens,
        nbest=1,
        beam_size=beam_size,
        blank_skip_threshold=0.95,
    )
    print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "beam size", beam_size)

######################################################################
# blank skip threshold
# ~~~~~~~~~~~~~~~~~~~~
#
# The ``blank_skip_threshold`` parameter is used to prune frames that have a large blank probability.
# Pruning these frames with a good ``blank_skip_threshold`` can speed up the decoding
# process considerably with no accuracy drop.
# Per the rules of CTC, at least one blank frame is kept between two non-blank frames
# to avoid mistakenly merging two consecutive identical symbols.
# We recommend setting ``blank_skip_threshold=0.95`` for the CUDA beam search decoder.
#

blank_skip_probs = [0.25, 0.95, 1.0]

for blank_skip_prob in blank_skip_probs:
    beam_search_decoder = cuda_ctc_decoder(
        tokens,
        nbest=10,
        beam_size=10,
        blank_skip_threshold=blank_skip_prob,
    )
    print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "blank_skip_threshold", blank_skip_prob)

del cuda_decoder

######################################################################
# Benchmark with flashlight CPU decoder
# -------------------------------------
# We benchmark the throughput and accuracy of the CUDA decoder against the CPU decoder using the LibriSpeech test_other set.
# To reproduce the benchmark results below, you may refer `here <https://github.com/pytorch/audio/tree/main/examples/asr/librispeech_cuda_ctc_decoder>`__.
#
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | Decoder      | Setting                                  | WER (%) | N-Best Oracle WER (%) | Decoder Cost Time (seconds) |
# +==============+==========================================+=========+=======================+=============================+
# | CUDA decoder | blank_skip_threshold 0.95                | 5.81    | 4.11                  | 2.57                        |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CUDA decoder | blank_skip_threshold 1.0 (no frame-skip) | 5.81    | 4.09                  | 6.24                        |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder  | beam_size_token 10                       | 5.86    | 4.30                  | 28.61                       |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder  | beam_size_token 500                      | 5.86    | 4.30                  | 791.80                      |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
#
# From the above table, the CUDA decoder gives a slight improvement in WER and a significant increase in throughput.
examples/tutorials/audio_data_augmentation_tutorial.py
View file @
ffeba11a
...
@@ -20,6 +20,8 @@ import torchaudio.functional as F
...
@@ -20,6 +20,8 @@ import torchaudio.functional as F
print
(
torch
.
__version__
)
print
(
torch
.
__version__
)
print
(
torchaudio
.
__version__
)
print
(
torchaudio
.
__version__
)
import
matplotlib.pyplot
as
plt
######################################################################
######################################################################
# Preparation
# Preparation
# -----------
# -----------
...
@@ -27,10 +29,7 @@ print(torchaudio.__version__)
...
@@ -27,10 +29,7 @@ print(torchaudio.__version__)
# First, we import the modules and download the audio assets we use in this tutorial.
# First, we import the modules and download the audio assets we use in this tutorial.
#
#
import
math
from
IPython.display
import
Audio
from
IPython.display
import
Audio
import
matplotlib.pyplot
as
plt
from
torchaudio.utils
import
download_asset
from
torchaudio.utils
import
download_asset
...
@@ -44,56 +43,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st
...
@@ -44,56 +43,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st
# Applying effects and filtering
# Applying effects and filtering
# ------------------------------
# ------------------------------
#
#
# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to
# :py:class:`torchaudio.io.AudioEffector` allows for directly applying
# those available in ``sox`` to Tensor objects and file object audio sources.
# filters and codecs to Tensor objects, in a similar way as ``ffmpeg``
#
# command
# There are two functions for this:
#
# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects
# to Tensor.
# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to
# other audio sources.
#
#
# Both functions accept effect definitions in the form
# `AudioEffector Usages <./effector_tutorial.html>` explains how to use
# ``List[List[str]]``.
# this class, so for the detail, please refer to the tutorial.
# This is mostly consistent with how ``sox`` command works, but one caveat is
# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s
# implementation does not.
#
# For the list of available effects, please refer to `the sox
# documentation <http://sox.sourceforge.net/sox.html>`__.
#
# **Tip** If you need to load and resample your audio data on the fly,
# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file`
# with effect ``"rate"``.
#
# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a
# file-like object or path-like object.
# Similar to :py:func:`torchaudio.load`, when the audio format cannot be
# inferred from either the file extension or header, you can provide
# argument ``format`` to specify the format of the audio source.
#
# **Note** This process is not differentiable.
#
#
# Load the data
# Load the data
waveform1
,
sample_rate
1
=
torchaudio
.
load
(
SAMPLE_WAV
)
waveform1
,
sample_rate
=
torchaudio
.
load
(
SAMPLE_WAV
,
channels_first
=
False
)
# Define effects
# Define effects
effects
=
[
effect
=
","
.
join
(
[
"lowpass"
,
"-1"
,
"300"
],
# apply single-pole lowpass filter
[
[
"speed"
,
"0.8"
],
# reduce the speed
"lowpass=frequency=300:poles=1"
,
# apply single-pole lowpass filter
# This only changes sample rate, so it is necessary to
"atempo=0.8"
,
# reduce the speed
# add `rate` effect with original sample rate after this.
"aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3"
[
"rate"
,
f
"
{
sample_rate1
}
"
],
# Applying echo gives some dramatic feeling
[
"reverb"
,
"-w"
],
# Reverbration gives some dramatic feeling
],
]
)
# Apply effects
# Apply effects
waveform2
,
sample_rate2
=
torchaudio
.
sox_effects
.
apply_effects_tensor
(
waveform1
,
sample_rate1
,
effects
)
def
apply_effect
(
waveform
,
sample_rate
,
effect
):
effector
=
torchaudio
.
io
.
AudioEffector
(
effect
=
effect
)
return
effector
.
apply
(
waveform
,
sample_rate
)
print
(
waveform1
.
shape
,
sample_rate1
)
waveform2
=
apply_effect
(
waveform1
,
sample_rate
,
effect
)
print
(
waveform2
.
shape
,
sample_rate2
)
print
(
waveform1
.
shape
,
sample_rate
)
print
(
waveform2
.
shape
,
sample_rate
)
######################################################################
######################################################################
# Note that the number of frames and number of channels are different from
# Note that the number of frames and number of channels are different from
...
@@ -101,6 +82,7 @@ print(waveform2.shape, sample_rate2)
...
@@ -101,6 +82,7 @@ print(waveform2.shape, sample_rate2)
# audio.
# audio.
#
#
def
plot_waveform
(
waveform
,
sample_rate
,
title
=
"Waveform"
,
xlim
=
None
):
def
plot_waveform
(
waveform
,
sample_rate
,
title
=
"Waveform"
,
xlim
=
None
):
waveform
=
waveform
.
numpy
()
waveform
=
waveform
.
numpy
()
...
@@ -118,11 +100,12 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
...
@@ -118,11 +100,12 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
if
xlim
:
if
xlim
:
axes
[
c
].
set_xlim
(
xlim
)
axes
[
c
].
set_xlim
(
xlim
)
figure
.
suptitle
(
title
)
figure
.
suptitle
(
title
)
plt
.
show
(
block
=
False
)
######################################################################
######################################################################
#
#
def
plot_specgram
(
waveform
,
sample_rate
,
title
=
"Spectrogram"
,
xlim
=
None
):
def
plot_specgram
(
waveform
,
sample_rate
,
title
=
"Spectrogram"
,
xlim
=
None
):
waveform
=
waveform
.
numpy
()
waveform
=
waveform
.
numpy
()
...
@@ -138,29 +121,26 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
...
@@ -138,29 +121,26 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
if
xlim
:
if
xlim
:
axes
[
c
].
set_xlim
(
xlim
)
axes
[
c
].
set_xlim
(
xlim
)
figure
.
suptitle
(
title
)
figure
.
suptitle
(
title
)
plt
.
show
(
block
=
False
)
######################################################################
######################################################################
# Original
:
# Original
# ~~~~~~~~
~
# ~~~~~~~~
#
#
plot_waveform
(
waveform1
,
sample_rate
1
,
title
=
"Original"
,
xlim
=
(
-
0.1
,
3.2
))
plot_waveform
(
waveform1
.
T
,
sample_rate
,
title
=
"Original"
,
xlim
=
(
-
0.1
,
3.2
))
plot_specgram
(
waveform1
,
sample_rate
1
,
title
=
"Original"
,
xlim
=
(
0
,
3.04
))
plot_specgram
(
waveform1
.
T
,
sample_rate
,
title
=
"Original"
,
xlim
=
(
0
,
3.04
))
Audio
(
waveform1
,
rate
=
sample_rate
1
)
Audio
(
waveform1
.
T
,
rate
=
sample_rate
)
######################################################################
######################################################################
# Effects applied
:
# Effects applied
# ~~~~~~~~~~~~~~~
~
# ~~~~~~~~~~~~~~~
#
#
plot_waveform
(
waveform2
,
sample_rate
2
,
title
=
"Effects Applied"
,
xlim
=
(
-
0.1
,
3.2
))
plot_waveform
(
waveform2
.
T
,
sample_rate
,
title
=
"Effects Applied"
,
xlim
=
(
-
0.1
,
3.2
))
plot_specgram
(
waveform2
,
sample_rate
2
,
title
=
"Effects Applied"
,
xlim
=
(
0
,
3.04
))
plot_specgram
(
waveform2
.
T
,
sample_rate
,
title
=
"Effects Applied"
,
xlim
=
(
0
,
3.04
))
Audio
(
waveform2
,
rate
=
sample_rate
2
)
Audio
(
waveform2
.
T
,
rate
=
sample_rate
)
######################################################################
# Doesn’t it sound more dramatic?
#
######################################################################
######################################################################
# Simulating room reverberation
# Simulating room reverberation
...
@@ -185,28 +165,26 @@ plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
...
@@ -185,28 +165,26 @@ plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
Audio
(
rir_raw
,
rate
=
sample_rate
)
Audio
(
rir_raw
,
rate
=
sample_rate
)
######################################################################
######################################################################
# First, we need to clean up the RIR. We extract the main impulse
,
normalize
# First, we need to clean up the RIR. We extract the main impulse
and
normalize
#
the signal power, then flip along the time axis
.
#
it by its power
.
#
#
rir
=
rir_raw
[:,
int
(
sample_rate
*
1.01
)
:
int
(
sample_rate
*
1.3
)]
rir
=
rir_raw
[:,
int
(
sample_rate
*
1.01
)
:
int
(
sample_rate
*
1.3
)]
rir
=
rir
/
torch
.
norm
(
rir
,
p
=
2
)
rir
=
rir
/
torch
.
linalg
.
vector_norm
(
rir
,
ord
=
2
)
RIR
=
torch
.
flip
(
rir
,
[
1
])
plot_waveform
(
rir
,
sample_rate
,
title
=
"Room Impulse Response"
)
plot_waveform
(
rir
,
sample_rate
,
title
=
"Room Impulse Response"
)
######################################################################
######################################################################
# Then, we convolve the speech signal with the RIR filter.
# Then, using :py:func:`torchaudio.functional.fftconvolve`,
# we convolve the speech signal with the RIR.
#
#
speech
,
_
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
speech
,
_
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
augmented
=
F
.
fftconvolve
(
speech
,
rir
)
speech_
=
torch
.
nn
.
functional
.
pad
(
speech
,
(
RIR
.
shape
[
1
]
-
1
,
0
))
augmented
=
torch
.
nn
.
functional
.
conv1d
(
speech_
[
None
,
...],
RIR
[
None
,
...])[
0
]
######################################################################
######################################################################
# Original
:
# Original
# ~~~~~~~~
~
# ~~~~~~~~
#
#
plot_waveform
(
speech
,
sample_rate
,
title
=
"Original"
)
plot_waveform
(
speech
,
sample_rate
,
title
=
"Original"
)
...
@@ -214,8 +192,8 @@ plot_specgram(speech, sample_rate, title="Original")
...
@@ -214,8 +192,8 @@ plot_specgram(speech, sample_rate, title="Original")
Audio
(
speech
,
rate
=
sample_rate
)
Audio
(
speech
,
rate
=
sample_rate
)
######################################################################
######################################################################
# RIR applied
:
# RIR applied
# ~~~~~~~~~~~
~
# ~~~~~~~~~~~
#
#
plot_waveform
(
augmented
,
sample_rate
,
title
=
"RIR Applied"
)
plot_waveform
(
augmented
,
sample_rate
,
title
=
"RIR Applied"
)
...
@@ -227,33 +205,31 @@ Audio(augmented, rate=sample_rate)
...
@@ -227,33 +205,31 @@ Audio(augmented, rate=sample_rate)
# Adding background noise
# Adding background noise
# -----------------------
# -----------------------
#
#
# To add background noise to audio data, you can simply add a noise Tensor to
# To introduce background noise to audio data, we can add a noise Tensor to
# the Tensor representing the audio data. A common method to adjust the
# the Tensor representing the audio data according to some desired
# intensity of noise is changing the Signal-to-Noise Ratio (SNR).
# signal-to-noise ratio (SNR)
# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__]
# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__],
# which determines the intensity of the audio data relative to that of the noise
# in the output.
#
#
# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
#
#
# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
#
#
# To add noise to audio data per SNRs, we
# use :py:func:`torchaudio.functional.add_noise`.
speech
,
_
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
speech
,
_
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
noise
,
_
=
torchaudio
.
load
(
SAMPLE_NOISE
)
noise
,
_
=
torchaudio
.
load
(
SAMPLE_NOISE
)
noise
=
noise
[:,
:
speech
.
shape
[
1
]]
noise
=
noise
[:,
:
speech
.
shape
[
1
]]
s
peech_rms
=
speech
.
norm
(
p
=
2
)
s
nr_dbs
=
torch
.
tensor
([
20
,
10
,
3
]
)
nois
e_rms
=
noise
.
norm
(
p
=
2
)
nois
y_speeches
=
F
.
add_noise
(
speech
,
noise
,
snr_dbs
)
snr_dbs
=
[
20
,
10
,
3
]
noisy_speeches
=
[]
for
snr_db
in
snr_dbs
:
snr
=
10
**
(
snr_db
/
20
)
scale
=
snr
*
noise_rms
/
speech_rms
noisy_speeches
.
append
((
scale
*
speech
+
noise
)
/
2
)
######################################################################
######################################################################
# Background noise
:
# Background noise
# ~~~~~~~~~~~~~~~~
~
# ~~~~~~~~~~~~~~~~
#
#
plot_waveform
(
noise
,
sample_rate
,
title
=
"Background noise"
)
plot_waveform
(
noise
,
sample_rate
,
title
=
"Background noise"
)
...
@@ -261,31 +237,31 @@ plot_specgram(noise, sample_rate, title="Background noise")
...
@@ -261,31 +237,31 @@ plot_specgram(noise, sample_rate, title="Background noise")
Audio
(
noise
,
rate
=
sample_rate
)
Audio
(
noise
,
rate
=
sample_rate
)
######################################################################
######################################################################
# SNR 20 dB
:
# SNR 20 dB
# ~~~~~~~~~
~
# ~~~~~~~~~
#
#
snr_db
,
noisy_speech
=
snr_dbs
[
0
],
noisy_speeches
[
0
]
snr_db
,
noisy_speech
=
snr_dbs
[
0
],
noisy_speeches
[
0
:
1
]
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
######################################################################
######################################################################
# SNR 10 dB
:
# SNR 10 dB
# ~~~~~~~~~
~
# ~~~~~~~~~
#
#
snr_db
,
noisy_speech
=
snr_dbs
[
1
],
noisy_speeches
[
1
]
snr_db
,
noisy_speech
=
snr_dbs
[
1
],
noisy_speeches
[
1
:
2
]
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
######################################################################
######################################################################
# SNR 3 dB
:
# SNR 3 dB
# ~~~~~~~~
~
# ~~~~~~~~
#
#
snr_db
,
noisy_speech
=
snr_dbs
[
2
],
noisy_speeches
[
2
]
snr_db
,
noisy_speech
=
snr_dbs
[
2
],
noisy_speeches
[
2
:
3
]
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
...
@@ -295,60 +271,56 @@ Audio(noisy_speech, rate=sample_rate)
...
@@ -295,60 +271,56 @@ Audio(noisy_speech, rate=sample_rate)
# Applying codec to Tensor object
# Applying codec to Tensor object
# -------------------------------
# -------------------------------
#
#
# :py:
func
:`torchaudio.
functional.apply_codec` can
apply codecs to
# :py:
class
:`torchaudio.
io.AudioEffector` can also
apply codecs to
# a Tensor object.
# a Tensor object.
#
#
# **Note** This process is not differentiable.
#
waveform
,
sample_rate
=
torchaudio
.
load
(
SAMPLE_SPEECH
,
channels_first
=
False
)
waveform
,
sample_rate
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
def
apply_codec
(
waveform
,
sample_rate
,
format
,
encoder
=
None
):
encoder
=
torchaudio
.
io
.
AudioEffector
(
format
=
format
,
encoder
=
encoder
)
return
encoder
.
apply
(
waveform
,
sample_rate
)
configs
=
[
{
"format"
:
"wav"
,
"encoding"
:
"ULAW"
,
"bits_per_sample"
:
8
},
{
"format"
:
"gsm"
},
{
"format"
:
"vorbis"
,
"compression"
:
-
1
},
]
waveforms
=
[]
for
param
in
configs
:
augmented
=
F
.
apply_codec
(
waveform
,
sample_rate
,
**
param
)
waveforms
.
append
(
augmented
)
######################################################################
######################################################################
# Original
:
# Original
# ~~~~~~~~
~
# ~~~~~~~~
#
#
plot_waveform
(
waveform
,
sample_rate
,
title
=
"Original"
)
plot_waveform
(
waveform
.
T
,
sample_rate
,
title
=
"Original"
)
plot_specgram
(
waveform
,
sample_rate
,
title
=
"Original"
)
plot_specgram
(
waveform
.
T
,
sample_rate
,
title
=
"Original"
)
Audio
(
waveform
,
rate
=
sample_rate
)
Audio
(
waveform
.
T
,
rate
=
sample_rate
)
######################################################################
######################################################################
# 8 bit mu-law
:
# 8 bit mu-law
# ~~~~~~~~~~~~
~
# ~~~~~~~~~~~~
#
#
plot_waveform
(
waveforms
[
0
],
sample_rate
,
title
=
"8 bit mu-law"
)
mulaw
=
apply_codec
(
waveform
,
sample_rate
,
"wav"
,
encoder
=
"pcm_mulaw"
)
plot_specgram
(
waveforms
[
0
],
sample_rate
,
title
=
"8 bit mu-law"
)
plot_waveform
(
mulaw
.
T
,
sample_rate
,
title
=
"8 bit mu-law"
)
Audio
(
waveforms
[
0
],
rate
=
sample_rate
)
plot_specgram
(
mulaw
.
T
,
sample_rate
,
title
=
"8 bit mu-law"
)
Audio
(
mulaw
.
T
,
rate
=
sample_rate
)
######################################################################
######################################################################
# G
SM-FR:
# G
.722
# ~~~~~
~~
# ~~~~~
#
#
plot_waveform
(
waveforms
[
1
],
sample_rate
,
title
=
"GSM-FR"
)
g722
=
apply_codec
(
waveform
,
sample_rate
,
"g722"
)
plot_specgram
(
waveforms
[
1
],
sample_rate
,
title
=
"GSM-FR"
)
plot_waveform
(
g722
.
T
,
sample_rate
,
title
=
"G.722"
)
Audio
(
waveforms
[
1
],
rate
=
sample_rate
)
plot_specgram
(
g722
.
T
,
sample_rate
,
title
=
"G.722"
)
Audio
(
g722
.
T
,
rate
=
sample_rate
)
######################################################################
######################################################################
# Vorbis
:
# Vorbis
# ~~~~~~
~
# ~~~~~~
#
#
plot_waveform
(
waveforms
[
2
],
sample_rate
,
title
=
"Vorbis"
)
vorbis
=
apply_codec
(
waveform
,
sample_rate
,
"ogg"
,
encoder
=
"vorbis"
)
plot_specgram
(
waveforms
[
2
],
sample_rate
,
title
=
"Vorbis"
)
plot_waveform
(
vorbis
.
T
,
sample_rate
,
title
=
"Vorbis"
)
Audio
(
waveforms
[
2
],
rate
=
sample_rate
)
plot_specgram
(
vorbis
.
T
,
sample_rate
,
title
=
"Vorbis"
)
Audio
(
vorbis
.
T
,
rate
=
sample_rate
)
 ######################################################################
 # Simulating a phone recording
...
@@ -365,8 +337,7 @@ original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
 plot_specgram(original_speech, sample_rate, title="Original")

 # Apply RIR
-speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0))
-rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
+rir_applied = F.fftconvolve(speech, rir)

 plot_specgram(rir_applied, sample_rate, title="RIR Applied")
...
@@ -377,69 +348,60 @@ plot_specgram(rir_applied, sample_rate, title="RIR Applied")
 noise, _ = torchaudio.load(SAMPLE_NOISE)
 noise = noise[:, : rir_applied.shape[1]]

-snr_db = 8
-scale = (10 ** (snr_db / 20)) * noise.norm(p=2) / rir_applied.norm(p=2)
-bg_added = (scale * rir_applied + noise) / 2
+snr_db = torch.tensor([8])
+bg_added = F.add_noise(rir_applied, noise, snr_db)

 plot_specgram(bg_added, sample_rate, title="BG noise added")
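
The removed arithmetic and the new `F.add_noise` call express the same signal-to-noise relationship: an SNR given in dB corresponds to an amplitude ratio of 10^(snr_db / 20). A small sketch of that scaling, for illustration only (the exact internals of `torchaudio.functional.add_noise` may differ):

import torch

def mix_at_snr(speech: torch.Tensor, noise: torch.Tensor, snr_db: float) -> torch.Tensor:
    # Scale the noise so the speech-to-noise energy ratio matches snr_db.
    speech_rms = speech.norm(p=2)
    noise_rms = noise.norm(p=2)
    snr = 10 ** (snr_db / 20)  # dB -> amplitude ratio
    scaled_noise = noise * (speech_rms / (snr * noise_rms))
    return speech + scaled_noise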
 # Apply filtering and change sample rate
-filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(
-    bg_added,
-    sample_rate,
-    effects=[
-        ["lowpass", "4000"],
-        [
-            "compand",
-            "0.02,0.05",
-            "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8",
-            "-8",
-            "-7",
-            "0.05",
-        ],
-        ["rate", "8000"],
-    ],
-)
-plot_specgram(filtered, sample_rate2, title="Filtered")
+effect = ",".join(
+    [
+        "lowpass=frequency=4000:poles=1",
+        "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05",
+    ]
+)
+filtered = apply_effect(bg_added.T, sample_rate, effect)
+sample_rate2 = 8000
+plot_specgram(filtered.T, sample_rate2, title="Filtered")

 # Apply telephony codec
-codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm")
-plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied")
+codec_applied = apply_codec(filtered, sample_rate2, "g722")
+plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied")

 ######################################################################
-# Original speech:
-# ~~~~~~~~~~~~~~~~
+# Original speech
+# ~~~~~~~~~~~~~~~
 #
 Audio(original_speech, rate=sample_rate)

 ######################################################################
-# RIR applied:
-# ~~~~~~~~~~~~
+# RIR applied
+# ~~~~~~~~~~~
 #
 Audio(rir_applied, rate=sample_rate)

 ######################################################################
-# Background noise added:
-# ~~~~~~~~~~~~~~~~~~~~~~~
+# Background noise added
+# ~~~~~~~~~~~~~~~~~~~~~~
 #
 Audio(bg_added, rate=sample_rate)

 ######################################################################
-# Filtered:
-# ~~~~~~~~~
+# Filtered
+# ~~~~~~~~
 #
-Audio(filtered, rate=sample_rate2)
+Audio(filtered.T, rate=sample_rate2)

 ######################################################################
-# Codec applied:
-# ~~~~~~~~~~~~~~
+# Codec applied
+# ~~~~~~~~~~~~~
 #
-Audio(codec_applied, rate=sample_rate2)
+Audio(codec_applied.T, rate=sample_rate2)
examples/tutorials/audio_datasets_tutorial.py  View file @ ffeba11a

-# -*- coding: utf-8 -*-
 """
 Audio Datasets
 ==============
...
@@ -10,10 +9,6 @@ datasets. Please refer to the official documentation for the list of
 available datasets.
 """

-# When running this tutorial in Google Colab, install the required packages
-# with the following.
-# !pip install torchaudio

 import torch
 import torchaudio
...
@@ -21,22 +16,13 @@ print(torch.__version__)
 print(torchaudio.__version__)

 ######################################################################
-# Preparing data and utility functions (skip this section)
-# --------------------------------------------------------
 #
-# @title Prepare data and utility functions. {display-mode: "form"}
-# @markdown
-# @markdown You do not need to look into this cell.
-# @markdown Just execute once and you are good to go.
-# -------------------------------------------------------------------------------
-# Preparation of data and helper functions.
-# -------------------------------------------------------------------------------

 import os
+import IPython
 import matplotlib.pyplot as plt
-from IPython.display import Audio, display

 _SAMPLE_DIR = "_assets"
...
@@ -44,34 +30,13 @@ YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no")
 os.makedirs(YESNO_DATASET_PATH, exist_ok=True)

-def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
+def plot_specgram(waveform, sample_rate, title="Spectrogram"):
     waveform = waveform.numpy()

-    num_channels, _ = waveform.shape
-    figure, axes = plt.subplots(num_channels, 1)
-    if num_channels == 1:
-        axes = [axes]
-    for c in range(num_channels):
-        axes[c].specgram(waveform[c], Fs=sample_rate)
-        if num_channels > 1:
-            axes[c].set_ylabel(f"Channel {c+1}")
-        if xlim:
-            axes[c].set_xlim(xlim)
+    figure, ax = plt.subplots()
+    ax.specgram(waveform[0], Fs=sample_rate)
     figure.suptitle(title)
-    plt.show(block=False)
+    figure.tight_layout()

-def play_audio(waveform, sample_rate):
-    waveform = waveform.numpy()
-    num_channels, _ = waveform.shape
-    if num_channels == 1:
-        display(Audio(waveform[0], rate=sample_rate))
-    elif num_channels == 2:
-        display(Audio((waveform[0], waveform[1]), rate=sample_rate))
-    else:
-        raise ValueError("Waveform with more than 2 channels are not supported.")

 ######################################################################
...
@@ -79,10 +44,25 @@ def play_audio(waveform, sample_rate):
 # :py:class:`torchaudio.datasets.YESNO` dataset.
 #
 dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True)

-for i in [1, 3, 5]:
-    waveform, sample_rate, label = dataset[i]
-    plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
-    play_audio(waveform, sample_rate)
+######################################################################
+#
+i = 1
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
+
+######################################################################
+#
+i = 3
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
+
+######################################################################
+#
+i = 5
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
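
The cells above pull individual samples out of the YESNO dataset; batching them for training is a common next step. A minimal sketch, assuming a `torch.utils.data.DataLoader` with a padding collate function (the collate function and batch size are illustrative and not part of this commit):

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Each YESNO item is (waveform, sample_rate, labels); pad waveforms to a common length.
    waveforms = [waveform.t() for waveform, _, _ in batch]               # (time, channel)
    padded = pad_sequence(waveforms, batch_first=True).transpose(1, 2)   # (batch, channel, time)
    labels = [labels for _, _, labels in batch]
    return padded, labels

loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)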
examples/tutorials/audio_feature_augmentation_tutorial.py  View file @ ffeba11a

...
@@ -19,25 +19,20 @@ print(torch.__version__)
 print(torchaudio.__version__)

 ######################################################################
-# Preparing data and utility functions (skip this section)
-# --------------------------------------------------------
+# Preparation
+# -----------
 #
-# @title Prepare data and utility functions. {display-mode: "form"}
-# @markdown
-# @markdown You do not need to look into this cell.
-# @markdown Just execute once and you are good to go.
-# @markdown
-# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/),
-# @markdown which is licensed under Creative Commons BY 4.0.
-# -------------------------------------------------------------------------------
-# Preparation of data and helper functions.
-# -------------------------------------------------------------------------------

 import librosa
 import matplotlib.pyplot as plt
+from IPython.display import Audio
 from torchaudio.utils import download_asset

+######################################################################
+# In this tutorial, we will use a speech data from
+# `VOiCES dataset <https://iqtlabs.github.io/voices/>`__,
+# which is licensed under Creative Commons BY 4.0.

 SAMPLE_WAV_SPEECH_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
...
@@ -75,18 +70,6 @@ def get_spectrogram(
     return spectrogram(waveform)

-def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None):
-    fig, axs = plt.subplots(1, 1)
-    axs.set_title(title or "Spectrogram (db)")
-    axs.set_ylabel(ylabel)
-    axs.set_xlabel("frame")
-    im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect)
-    if xmax:
-        axs.set_xlim((0, xmax))
-    fig.colorbar(im, ax=axs)
-    plt.show(block=False)

 ######################################################################
 # SpecAugment
 # -----------
...
@@ -108,43 +91,79 @@ def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=No
 spec = get_spectrogram(power=None)
 stretch = T.TimeStretch()

-rate = 1.2
-spec_ = stretch(spec, rate)
-plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304)
-
-######################################################################
-#
-plot_spectrogram(torch.abs(spec[0]), title="Original", aspect="equal", xmax=304)
-
-######################################################################
-#
-rate = 0.9
-spec_ = stretch(spec, rate)
-plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304)
+spec_12 = stretch(spec, overriding_rate=1.2)
+spec_09 = stretch(spec, overriding_rate=0.9)
+
+######################################################################
+# Visualization
+# ~~~~~~~~~~~~~
+def plot():
+    def plot_spec(ax, spec, title):
+        ax.set_title(title)
+        ax.imshow(librosa.amplitude_to_db(spec), origin="lower", aspect="auto")
+
+    fig, axes = plt.subplots(3, 1, sharex=True, sharey=True)
+    plot_spec(axes[0], torch.abs(spec_12[0]), title="Stretched x1.2")
+    plot_spec(axes[1], torch.abs(spec[0]), title="Original")
+    plot_spec(axes[2], torch.abs(spec_09[0]), title="Stretched x0.9")
+    fig.tight_layout()
+
+
+plot()
+
+######################################################################
+# Audio Samples
+# ~~~~~~~~~~~~~
+def preview(spec, rate=16000):
+    ispec = T.InverseSpectrogram()
+    waveform = ispec(spec)
+    return Audio(waveform[0].numpy().T, rate=rate)
+
+
+preview(spec)
+
+######################################################################
+#
+preview(spec_12)
+
+######################################################################
+#
+preview(spec_09)

 ######################################################################
-# TimeMasking
-# -----------
+# Time and Frequency Masking
+# --------------------------
 #
 torch.random.manual_seed(4)

-spec = get_spectrogram()
-plot_spectrogram(spec[0], title="Original")
-
-masking = T.TimeMasking(time_mask_param=80)
-spec = masking(spec)
-
-plot_spectrogram(spec[0], title="Masked along time axis")
-
-######################################################################
-# FrequencyMasking
-# ----------------
-#
-masking = T.FrequencyMasking(freq_mask_param=80)
-spec = masking(spec)
-
-plot_spectrogram(spec[0], title="Masked along frequency axis")
+time_masking = T.TimeMasking(time_mask_param=80)
+freq_masking = T.FrequencyMasking(freq_mask_param=80)
+
+spec = get_spectrogram()
+time_masked = time_masking(spec)
+freq_masked = freq_masking(spec)
+
+
+def plot():
+    def plot_spec(ax, spec, title):
+        ax.set_title(title)
+        ax.imshow(librosa.power_to_db(spec), origin="lower", aspect="auto")
+
+    fig, axes = plt.subplots(3, 1, sharex=True, sharey=True)
+    plot_spec(axes[0], spec[0], title="Original")
+    plot_spec(axes[1], time_masked[0], title="Masked along time axis")
+    plot_spec(axes[2], freq_masked[0], title="Masked along frequency axis")
+    fig.tight_layout()
+
+
+plot()
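
The masking transforms above are typically applied on the fly during training. A minimal sketch of how they could be composed into a single augmentation module, assuming spectrogram inputs of shape (batch, freq, time); this composition is illustrative and not part of the commit:

import torch
import torchaudio.transforms as T

# SpecAugment-style augmentation: mask a band of frequency bins and a span of time steps.
augment = torch.nn.Sequential(
    T.FrequencyMasking(freq_mask_param=30),
    T.TimeMasking(time_mask_param=80),
)

specgram = torch.randn(8, 128, 400)   # dummy batch of spectrograms
augmented = augment(specgram)         # same shape, with random regions zeroed out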
examples/tutorials/audio_feature_extractions_tutorial.py  View file @ ffeba11a

...
@@ -25,6 +25,23 @@ import torchaudio.transforms as T
 print(torch.__version__)
 print(torchaudio.__version__)

+import librosa
+import matplotlib.pyplot as plt
+
+######################################################################
+# Overview of audio features
+# --------------------------
+#
+# The following diagram shows the relationship between common audio features
+# and torchaudio APIs to generate them.
+#
+# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png
+#
+# For the complete list of available features, please refer to the
+# documentation.
+#

 ######################################################################
 # Preparation
 # -----------
...
@@ -38,8 +55,7 @@ print(torchaudio.__version__)
 # !pip install librosa
 #
 from IPython.display import Audio
-import librosa
-import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
 from torchaudio.utils import download_asset

 torch.random.manual_seed(0)
...
@@ -47,27 +63,27 @@ torch.random.manual_seed(0)
 SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")

-def plot_waveform(waveform, sr, title="Waveform"):
+def plot_waveform(waveform, sr, title="Waveform", ax=None):
     waveform = waveform.numpy()

     num_channels, num_frames = waveform.shape
     time_axis = torch.arange(0, num_frames) / sr

-    figure, axes = plt.subplots(num_channels, 1)
-    axes.plot(time_axis, waveform[0], linewidth=1)
-    axes.grid(True)
-    figure.suptitle(title)
-    plt.show(block=False)
+    if ax is None:
+        _, ax = plt.subplots(num_channels, 1)
+    ax.plot(time_axis, waveform[0], linewidth=1)
+    ax.grid(True)
+    ax.set_xlim([0, time_axis[-1]])
+    ax.set_title(title)

-def plot_spectrogram(specgram, title=None, ylabel="freq_bin"):
-    fig, axs = plt.subplots(1, 1)
-    axs.set_title(title or "Spectrogram (db)")
-    axs.set_ylabel(ylabel)
-    axs.set_xlabel("frame")
-    im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto")
-    fig.colorbar(im, ax=axs)
-    plt.show(block=False)
+def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
+    if ax is None:
+        _, ax = plt.subplots(1, 1)
+    if title is not None:
+        ax.set_title(title)
+    ax.set_ylabel(ylabel)
+    ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest")

 def plot_fbank(fbank, title=None):
...
@@ -76,21 +92,6 @@ def plot_fbank(fbank, title=None):
     axs.imshow(fbank, aspect="auto")
     axs.set_ylabel("frequency bin")
     axs.set_xlabel("mel bin")
-    plt.show(block=False)

-######################################################################
-# Overview of audio features
-# --------------------------
-#
-# The following diagram shows the relationship between common audio features
-# and torchaudio APIs to generate them.
-#
-# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png
-#
-# For the complete list of available features, please refer to the
-# documentation.
-#

 ######################################################################
...
@@ -101,77 +102,157 @@
 # you can use :py:func:`torchaudio.transforms.Spectrogram`.
 #
+# Load audio
 SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH)

-plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform")
-Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)
-
-######################################################################
-#
-n_fft = 1024
-win_length = None
-hop_length = 512
-
-# Define transform
-spectrogram = T.Spectrogram(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-    center=True,
-    pad_mode="reflect",
-    power=2.0,
-)
-
-######################################################################
-#
-# Perform transform
-spec = spectrogram(SPEECH_WAVEFORM)
-
-######################################################################
-#
-plot_spectrogram(spec[0], title="torchaudio")
+# Define transform
+spectrogram = T.Spectrogram(n_fft=512)
+
+# Perform transform
+spec = spectrogram(SPEECH_WAVEFORM)
+
+######################################################################
+#
+fig, axs = plt.subplots(2, 1)
+plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform", ax=axs[0])
+plot_spectrogram(spec[0], title="spectrogram", ax=axs[1])
+fig.tight_layout()
+
+######################################################################
+#
+Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)
+
+######################################################################
+# The effect of ``n_fft`` parameter
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The core of spectrogram computation is the (short-term) Fourier transform,
+# and the ``n_fft`` parameter corresponds to the :math:`N` in the following
+# definition of the discrete Fourier transform.
+#
+# $$ X_k = \\sum_{n=0}^{N-1} x_n e^{-\\frac{2\\pi i}{N} nk} $$
+#
+# (For the detail of the Fourier transform, please refer to
+# `Wikipedia <https://en.wikipedia.org/wiki/Fast_Fourier_transform>`__.)
+#
+# The value of ``n_fft`` determines the resolution of the frequency axis.
+# However, with a higher ``n_fft`` value, the energy will be distributed
+# among more bins, so when you visualize it, it might look more blurry,
+# even though it has higher resolution.
+#
+# The following illustrates this.
+#
+# .. note::
+#
+#    ``hop_length`` determines the time axis resolution.
+#    By default (i.e. ``hop_length=None`` and ``win_length=None``),
+#    the value of ``n_fft // 4`` is used.
+#    Here we use the same ``hop_length`` value across different ``n_fft``
+#    so that they have the same number of elements in the time axis.
+#
+n_ffts = [32, 128, 512, 2048]
+hop_length = 64
+
+specs = []
+for n_fft in n_ffts:
+    spectrogram = T.Spectrogram(n_fft=n_fft, hop_length=hop_length)
+    spec = spectrogram(SPEECH_WAVEFORM)
+    specs.append(spec)
+
+######################################################################
+#
+fig, axs = plt.subplots(len(specs), 1, sharex=True)
+for i, (spec, n_fft) in enumerate(zip(specs, n_ffts)):
+    plot_spectrogram(spec[0], ylabel=f"n_fft={n_fft}", ax=axs[i])
+    axs[i].set_xlabel(None)
+fig.tight_layout()
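
Since ``n_fft`` fixes the number of frequency bins but not the range they cover, it can help to compute the physical frequency of each bin explicitly. A small illustrative check (not part of the tutorial), assuming a one-sided spectrogram:

import torch

sample_rate = 16000
for n_fft in (32, 512):
    # A one-sided spectrogram has n_fft // 2 + 1 bins covering 0 .. sample_rate / 2.
    freqs = torch.arange(n_fft // 2 + 1) * sample_rate / n_fft
    print(f"n_fft={n_fft}: {len(freqs)} bins, spacing {sample_rate / n_fft:.1f} Hz, max {freqs[-1]:.0f} Hz")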
 ######################################################################
-# GriffinLim
-# ----------
 #
-# To recover a waveform from a spectrogram, you can use ``GriffinLim``.
+# When comparing signals, it is desirable to use the same sampling rate,
+# however if you must use a different sampling rate, care must be
+# taken when interpreting the meaning of ``n_fft``.
+# Recall that ``n_fft`` determines the resolution of the frequency
+# axis for a given sampling rate. In other words, what each bin on
+# the frequency axis represents is subject to the sampling rate.
 #
+# As we have seen above, changing the value of ``n_fft`` does not change
+# the coverage of frequency range for the same input signal.

-torch.random.manual_seed(0)
+######################################################################
+#
+# Let's downsample the audio and apply spectrogram with the same ``n_fft``
+# value.

-n_fft = 1024
-win_length = None
-hop_length = 512
+# Downsample to half of the original sample rate
+speech2 = torchaudio.functional.resample(SPEECH_WAVEFORM, SAMPLE_RATE, SAMPLE_RATE // 2)
+# Upsample to the original sample rate
+speech3 = torchaudio.functional.resample(speech2, SAMPLE_RATE // 2, SAMPLE_RATE)

-spec = T.Spectrogram(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-)(SPEECH_WAVEFORM)
+######################################################################
+#
+# Apply the same spectrogram
+spectrogram = T.Spectrogram(n_fft=512)
+
+spec0 = spectrogram(SPEECH_WAVEFORM)
+spec2 = spectrogram(speech2)
+spec3 = spectrogram(speech3)

 ######################################################################
 #
-griffin_lim = T.GriffinLim(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-)
+# Visualize it
+fig, axs = plt.subplots(3, 1)
+plot_spectrogram(spec0[0], ylabel="Original", ax=axs[0])
+axs[0].add_patch(Rectangle((0, 3), 212, 128, edgecolor="r", facecolor="none"))
+plot_spectrogram(spec2[0], ylabel="Downsampled", ax=axs[1])
+plot_spectrogram(spec3[0], ylabel="Upsampled", ax=axs[2])
+fig.tight_layout()

 ######################################################################
 #
+# In the above visualization, the second plot ("Downsampled") might
+# give the impression that the spectrogram is stretched.
+# This is because the meaning of the frequency bins is different from
+# the original one.
+# Even though they have the same number of bins, in the second plot
+# the frequency axis only covers half of the original sampling rate.
+# This becomes more clear if we resample the downsampled signal again
+# so that it has the same sample rate as the original.

+######################################################################
+# GriffinLim
+# ----------
+#
+# To recover a waveform from a spectrogram, you can use
+# :py:class:`torchaudio.transforms.GriffinLim`.
+#
+# The same set of parameters used for spectrogram must be used.
+
+# Define transforms
+n_fft = 1024
+spectrogram = T.Spectrogram(n_fft=n_fft)
+griffin_lim = T.GriffinLim(n_fft=n_fft)
+
+# Apply the transforms
+spec = spectrogram(SPEECH_WAVEFORM)
 reconstructed_waveform = griffin_lim(spec)

 ######################################################################
 #
-plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed")
+_, axes = plt.subplots(2, 1, sharex=True, sharey=True)
+plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original", ax=axes[0])
+plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed", ax=axes[1])
 Audio(reconstructed_waveform, rate=SAMPLE_RATE)

 ######################################################################
...
@@ -253,7 +334,6 @@ mel_spectrogram = T.MelSpectrogram(
     pad_mode="reflect",
     power=2.0,
     norm="slaney",
-    onesided=True,
     n_mels=n_mels,
     mel_scale="htk",
 )
...
@@ -322,7 +402,7 @@ mfcc = mfcc_transform(SPEECH_WAVEFORM)

 ######################################################################
 #
-plot_spectrogram(mfcc[0])
+plot_spectrogram(mfcc[0], title="MFCC")

 ######################################################################
 # Comparison against librosa
...
@@ -350,7 +430,7 @@ mfcc_librosa = librosa.feature.mfcc(

 ######################################################################
 #
-plot_spectrogram(mfcc_librosa)
+plot_spectrogram(mfcc_librosa, title="MFCC (librosa)")

 mse = torch.square(mfcc - mfcc_librosa).mean().item()
 print("Mean Square Difference: ", mse)
...
@@ -376,7 +456,7 @@ lfcc_transform = T.LFCC(
 )

 lfcc = lfcc_transform(SPEECH_WAVEFORM)
-plot_spectrogram(lfcc[0])
+plot_spectrogram(lfcc[0], title="LFCC")

 ######################################################################
 # Pitch
...
@@ -388,6 +468,7 @@ pitch = F.detect_pitch_frequency(SPEECH_WAVEFORM, SAMPLE_RATE)

 ######################################################################
 #
 def plot_pitch(waveform, sr, pitch):
     figure, axis = plt.subplots(1, 1)
     axis.set_title("Pitch Feature")
...
@@ -402,58 +483,6 @@ def plot_pitch(waveform, sr, pitch):
     axis2.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
     axis2.legend(loc=0)
-    plt.show(block=False)

 plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch)

-######################################################################
-# Kaldi Pitch (beta)
-# ------------------
-#
-# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic
-# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``,
-# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`.
-#
-# 1. A pitch extraction algorithm tuned for automatic speech recognition
-#
-#    Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. Khudanpur
-#
-#    2014 IEEE International Conference on Acoustics, Speech and Signal
-#    Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
-#    10.1109/ICASSP.2014.6854049.
-#    [`abstract <https://ieeexplore.ieee.org/document/6854049>`__],
-#    [`paper <https://danielpovey.com/files/2014_icassp_pitch.pdf>`__]
-#
-pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE)
-pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1]
-
-######################################################################
-#
-def plot_kaldi_pitch(waveform, sr, pitch, nfcc):
-    _, axis = plt.subplots(1, 1)
-    axis.set_title("Kaldi Pitch Feature")
-    axis.grid(True)
-
-    end_time = waveform.shape[1] / sr
-    time_axis = torch.linspace(0, end_time, waveform.shape[1])
-    axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3)
-
-    time_axis = torch.linspace(0, end_time, pitch.shape[1])
-    ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
-    axis.set_ylim((-1.3, 1.3))
-
-    axis2 = axis.twinx()
-    time_axis = torch.linspace(0, end_time, nfcc.shape[1])
-    ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--")
-
-    lns = ln1 + ln2
-    labels = [l.get_label() for l in lns]
-    axis.legend(lns, labels, loc=0)
-
-    plt.show(block=False)
-
-plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc)
examples/tutorials/audio_io_tutorial.py  View file @ ffeba11a

...
@@ -5,8 +5,15 @@ Audio I/O
 **Author**: `Moto Hira <moto@meta.com>`__

-This tutorial shows how to use TorchAudio's basic I/O API to load audio files
-into PyTorch's Tensor object, and save Tensor objects to audio files.
+This tutorial shows how to use TorchAudio's basic I/O API to inspect audio data,
+load them into PyTorch Tensors and save PyTorch Tensors.
+
+.. warning::
+
+   There are multiple changes planned/made to audio I/O in recent releases.
+   For the detail of these changes please refer to
+   :ref:`Introduction of Dispatcher <dispatcher_migration>`.
 """

 import torch
...
@@ -47,6 +54,16 @@ SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch12753
 SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav")

+def _hide_seek(obj):
+    class _wrapper:
+        def __init__(self, obj):
+            self.obj = obj
+
+        def read(self, n):
+            return self.obj.read(n)
+
+    return _wrapper(obj)
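
The `_hide_seek` wrapper added above exposes only `read`, so the HTTP/S3 response body is presented to torchaudio as a purely sequential stream (the loader otherwise probes `seek` and may consume the stream differently). A quick illustrative check with an in-memory buffer, not part of the commit:

import io

buffer = io.BytesIO(b"RIFF....WAVEfmt ")   # placeholder bytes, not a real WAV file
wrapped = _hide_seek(buffer)

print(hasattr(buffer, "seek"))   # True:  a BytesIO object is seekable
print(hasattr(wrapped, "seek"))  # False: the wrapper only forwards read()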
 ######################################################################
 # Querying audio metadata
...
@@ -113,7 +130,7 @@ print(metadata)
 url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav"
 with requests.get(url, stream=True) as response:
-    metadata = torchaudio.info(response.raw)
+    metadata = torchaudio.info(_hide_seek(response.raw))
 print(metadata)

 ######################################################################
...
@@ -164,7 +181,6 @@ def plot_waveform(waveform, sample_rate):
         if num_channels > 1:
             axes[c].set_ylabel(f"Channel {c+1}")
     figure.suptitle("waveform")
-    plt.show(block=False)

 ######################################################################
...
@@ -187,7 +203,6 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
         if num_channels > 1:
             axes[c].set_ylabel(f"Channel {c+1}")
     figure.suptitle(title)
-    plt.show(block=False)

 ######################################################################
...
@@ -215,7 +230,7 @@ Audio(waveform.numpy()[0], rate=sample_rate)
 # Load audio data as HTTP request
 url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
 with requests.get(url, stream=True) as response:
-    waveform, sample_rate = torchaudio.load(response.raw)
+    waveform, sample_rate = torchaudio.load(_hide_seek(response.raw))
 plot_specgram(waveform, sample_rate, title="HTTP datasource")

 ######################################################################
...
@@ -237,7 +252,7 @@ bucket = "pytorch-tutorial-assets"
 key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
 client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
 response = client.get_object(Bucket=bucket, Key=key)
-waveform, sample_rate = torchaudio.load(response["Body"])
+waveform, sample_rate = torchaudio.load(_hide_seek(response["Body"]))
 plot_specgram(waveform, sample_rate, title="From S3")
...
@@ -271,13 +286,15 @@ frame_offset, num_frames = 16000, 16000  # Fetch and decode the 1 - 2 seconds
 url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"

 print("Fetching all the data...")
 with requests.get(url, stream=True) as response:
-    waveform1, sample_rate1 = torchaudio.load(response.raw)
+    waveform1, sample_rate1 = torchaudio.load(_hide_seek(response.raw))
     waveform1 = waveform1[:, frame_offset : frame_offset + num_frames]
     print(f" - Fetched {response.raw.tell()} bytes")

 print("Fetching until the requested frames are available...")
 with requests.get(url, stream=True) as response:
-    waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames)
+    waveform2, sample_rate2 = torchaudio.load(
+        _hide_seek(response.raw), frame_offset=frame_offset, num_frames=num_frames
+    )
     print(f" - Fetched {response.raw.tell()} bytes")

 print("Checking the resulting waveform ... ", end="")
...
@@ -316,6 +333,7 @@ waveform, sample_rate = torchaudio.load(SAMPLE_WAV)

 ######################################################################
 #
 def inspect_file(path):
     print("-" * 10)
     print("Source:", path)
...
@@ -324,6 +342,7 @@ def inspect_file(path):
     print(f" - {torchaudio.info(path)}")
     print()

 ######################################################################
 #
 # Save without any encoding option.
...
@@ -351,11 +370,11 @@ with tempfile.TemporaryDirectory() as tempdir:
 formats = [
     "flac",
-    "vorbis",
-    "sph",
-    "amb",
-    "amr-nb",
-    "gsm",
+    # "vorbis",
+    # "sph",
+    # "amb",
+    # "amr-nb",
+    # "gsm",
 ]

 ######################################################################
...
examples/tutorials/audio_resampling_tutorial.py  View file @ ffeba11a

...
@@ -27,14 +27,14 @@ import math
 import timeit

 import librosa
-import resampy
-import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
 import pandas as pd
-from IPython.display import Audio, display
+import resampy
+from IPython.display import Audio

-pd.set_option('display.max_rows', None)
-pd.set_option('display.max_columns', None)
+pd.set_option("display.max_rows", None)
+pd.set_option("display.max_columns", None)

 DEFAULT_OFFSET = 201
...
@@ -105,7 +105,6 @@ def plot_sweep(
     axis.yaxis.grid(True, alpha=0.67)
     figure.suptitle(f"{title} (sample rate: {sample_rate} Hz)")
     plt.colorbar(cax)
-    plt.show(block=True)

 ######################################################################
...
@@ -240,13 +239,13 @@ plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.8")
 sample_rate = 48000
 resample_rate = 32000

-resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interpolation")
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interp_hann")
 plot_sweep(resampled_waveform, resample_rate, title="Hann Window Default")

 ######################################################################
 #
-resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="kaiser_window")
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interp_kaiser")
 plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Default")
...
@@ -271,7 +270,7 @@ resampled_waveform = F.resample(
     resample_rate,
     lowpass_filter_width=64,
     rolloff=0.9475937167399596,
-    resampling_method="kaiser_window",
+    resampling_method="sinc_interp_kaiser",
     beta=14.769656459379492,
 )
 plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Best (torchaudio)")
...
@@ -300,7 +299,7 @@ resampled_waveform = F.resample(
     resample_rate,
     lowpass_filter_width=16,
     rolloff=0.85,
-    resampling_method="kaiser_window",
+    resampling_method="sinc_interp_kaiser",
     beta=8.555504641634386,
 )
 plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)")
...
@@ -325,7 +324,7 @@ print("torchaudio and librosa kaiser fast MSE:", mse)
 #
 # Below are benchmarks for downsampling and upsampling waveforms between
 # two pairs of sampling rates. We demonstrate the performance implications
-# that the ``lowpass_filter_wdith``, window type, and sample rates can
+# that the ``lowpass_filter_width``, window type, and sample rates can
 # have. Additionally, we provide a comparison against ``librosa``\ ’s
 # ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters
 # in ``torchaudio``.
...
@@ -338,18 +337,20 @@ print(f"resampy: {resampy.__version__}")

 ######################################################################
 #
 def benchmark_resample_functional(
     waveform,
     sample_rate,
     resample_rate,
     lowpass_filter_width=6,
     rolloff=0.99,
-    resampling_method="sinc_interpolation",
+    resampling_method="sinc_interp_hann",
     beta=None,
     iters=5,
 ):
-    return timeit.timeit(
-        stmt='''
+    return (
+        timeit.timeit(
+            stmt="""
 torchaudio.functional.resample(
     waveform,
     sample_rate,
...
@@ -359,29 +360,34 @@ torchaudio.functional.resample(
     resampling_method=resampling_method,
     beta=beta,
 )
-''',
-        setup='import torchaudio',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            setup="import torchaudio",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

 ######################################################################
 #
 def benchmark_resample_transforms(
     waveform,
     sample_rate,
     resample_rate,
     lowpass_filter_width=6,
     rolloff=0.99,
-    resampling_method="sinc_interpolation",
+    resampling_method="sinc_interp_hann",
     beta=None,
     iters=5,
 ):
-    return timeit.timeit(
-        stmt='resampler(waveform)',
-        setup='''
+    return (
+        timeit.timeit(
+            stmt="resampler(waveform)",
+            setup="""
 import torchaudio

 resampler = torchaudio.transforms.Resample(
...
@@ -394,15 +400,19 @@ resampler = torchaudio.transforms.Resample(
     beta=beta,
 )
 resampler.to(waveform.device)
-''',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

 ######################################################################
 #
 def benchmark_resample_librosa(
     waveform,
     sample_rate,
...
@@ -411,24 +421,29 @@ def benchmark_resample_librosa(
     iters=5,
 ):
     waveform_np = waveform.squeeze().numpy()
-    return timeit.timeit(
-        stmt='''
+    return (
+        timeit.timeit(
+            stmt="""
 librosa.resample(
     waveform_np,
     orig_sr=sample_rate,
     target_sr=resample_rate,
     res_type=res_type,
 )
-''',
-        setup='import librosa',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            setup="import librosa",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

 ######################################################################
 #
 def benchmark(sample_rate, resample_rate):
     times, rows = [], []
     waveform = get_sine_sweep(sample_rate).to(torch.float32)
...
@@ -451,7 +466,7 @@ def benchmark(sample_rate, resample_rate):
     kwargs = {
         "lowpass_filter_width": 64,
         "rolloff": 0.9475937167399596,
-        "resampling_method": "kaiser_window",
+        "resampling_method": "sinc_interp_kaiser",
         "beta": 14.769656459379492,
     }
     lib_time = benchmark_resample_librosa(*args, res_type="kaiser_best")
...
@@ -464,7 +479,7 @@ def benchmark(sample_rate, resample_rate):
     kwargs = {
         "lowpass_filter_width": 16,
         "rolloff": 0.85,
-        "resampling_method": "kaiser_window",
+        "resampling_method": "sinc_interp_kaiser",
         "beta": 8.555504641634386,
     }
     lib_time = benchmark_resample_librosa(*args, res_type="kaiser_fast")
...
@@ -483,7 +498,7 @@ def plot(df):
     print(df.round(2))
     ax = df.plot(kind="bar")
     plt.ylabel("Time Elapsed [ms]")
     plt.xticks(rotation=0, fontsize=10)
     for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS):
         label = ["N/A" if v != v else str(v) for v in df[col].round(2)]
         ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small")
...
@@ -531,8 +546,8 @@ plot(df)
 # - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
 #   and therefore increases computation time for both the kernel computation
 #   and convolution
-# - using ``kaiser_window`` results in longer computation times than the default
-#   ``sinc_interpolation`` because it is more complex to compute the intermediate
+# - using ``sinc_interp_kaiser`` results in longer computation times than the default
+#   ``sinc_interp_hann`` because it is more complex to compute the intermediate
 #   window values
 # - a large GCD between the sample and resample rate will result
 #   in a simplification that allows for a smaller kernel and faster kernel computation.
...
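
This file renames the resampling methods from `sinc_interpolation` / `kaiser_window` to `sinc_interp_hann` / `sinc_interp_kaiser` throughout. A small usage sketch with the new names via `torchaudio.transforms.Resample`; the rates and waveform below are illustrative:

import torch
import torchaudio.transforms as T

waveform = torch.randn(1, 48000)  # one second of dummy audio at 48 kHz

# Hann-windowed sinc interpolation (the default method, under its new name).
resample_hann = T.Resample(48000, 32000, resampling_method="sinc_interp_hann")

# Kaiser-windowed sinc interpolation, using the "kaiser best" parameters from the benchmark above.
resample_kaiser = T.Resample(
    48000,
    32000,
    lowpass_filter_width=64,
    rolloff=0.9475937167399596,
    resampling_method="sinc_interp_kaiser",
    beta=14.769656459379492,
)

print(resample_hann(waveform).shape, resample_kaiser(waveform).shape)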
examples/tutorials/ctc_forced_alignment_api_tutorial.py  0 → 100644  View file @ ffeba11a

"""
CTC forced alignment API tutorial
=================================

**Author**: `Xiaohui Zhang <xiaohuizhang@meta.com>`__, `Moto Hira <moto@meta.com>`__

The forced alignment is a process to align transcript with speech.
This tutorial shows how to align transcripts to speech using
:py:func:`torchaudio.functional.forced_align` which was developed along the work of
`Scaling Speech Technology to 1,000+ Languages
<https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/>`__.

:py:func:`~torchaudio.functional.forced_align` has custom CPU and CUDA
implementations which are more performant than the vanilla Python
implementation above, and are more accurate.
It can also handle missing transcript with special ``<star>`` token.

There is also a high-level API, :py:class:`torchaudio.pipelines.Wav2Vec2FABundle`,
which wraps the pre/post-processing explained in this tutorial and makes it easy
to run forced-alignments.
`Forced alignment for multilingual data
<./forced_alignment_for_multilingual_data_tutorial.html>`__ uses this API to
illustrate how to align non-English transcripts.
"""

######################################################################
# Preparation
# -----------

import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

######################################################################
#
import IPython
import matplotlib.pyplot as plt

import torchaudio.functional as F

######################################################################
# First we prepare the speech data and the transcript we are going
# to use.
#
SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
waveform, _ = torchaudio.load(SPEECH_FILE)
TRANSCRIPT = "i had that curiosity beside me at this moment".split()

######################################################################
# Generating emissions
# ~~~~~~~~~~~~~~~~~~~~
#
# :py:func:`~torchaudio.functional.forced_align` takes emission and
# token sequences and outputs timestamps of the tokens and their scores.
#
# Emission represents the frame-wise probability distribution over
# tokens, and it can be obtained by passing the waveform to an acoustic
# model.
#
# Tokens are numerical expressions of transcripts. There are many ways to
# tokenize transcripts, but here, we simply map alphabets into integers,
# which is how labels were constructed when the acoustic model we are
# going to use was trained.
#
# We will use a pre-trained Wav2Vec2 model,
# :py:data:`torchaudio.pipelines.MMS_FA`, to obtain emission and tokenize
# the transcript.
#
bundle = torchaudio.pipelines.MMS_FA

model = bundle.get_model(with_star=False).to(device)
with torch.inference_mode():
    emission, _ = model(waveform.to(device))
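
As a quick sanity check on the shapes involved (added here for illustration; exact sizes depend on the audio length and the model):

# The acoustic model maps (batch, time) samples to (batch, frames, num_labels) emissions,
# i.e. one score per label for roughly every 20 ms analysis frame.
print(waveform.shape)
print(emission.shape)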
######################################################################
#
def plot_emission(emission):
    fig, ax = plt.subplots()
    ax.imshow(emission.cpu().T)
    ax.set_title("Frame-wise class probabilities")
    ax.set_xlabel("Time")
    ax.set_ylabel("Labels")
    fig.tight_layout()


plot_emission(emission[0])

######################################################################
# Tokenize the transcript
# ~~~~~~~~~~~~~~~~~~~~~~~
#
# We create a dictionary, which maps each label into a token.
LABELS = bundle.get_labels(star=None)
DICTIONARY = bundle.get_dict(star=None)
for k, v in DICTIONARY.items():
    print(f"{k}: {v}")

######################################################################
# Converting the transcript to tokens is as simple as
tokenized_transcript = [DICTIONARY[c] for word in TRANSCRIPT for c in word]

for t in tokenized_transcript:
    print(t, end=" ")
print()

######################################################################
# Computing alignments
# --------------------
#
# Frame-level alignments
# ~~~~~~~~~~~~~~~~~~~~~~
#
# Now we call TorchAudio’s forced alignment API to compute the
# frame-level alignment. For the detail of the function signature, please
# refer to :py:func:`~torchaudio.functional.forced_align`.
#
def align(emission, tokens):
    targets = torch.tensor([tokens], dtype=torch.int32, device=device)
    alignments, scores = F.forced_align(emission, targets, blank=0)

    alignments, scores = alignments[0], scores[0]  # remove batch dimension for simplicity
    scores = scores.exp()  # convert back to probability
    return alignments, scores


aligned_tokens, alignment_scores = align(emission, tokenized_transcript)

######################################################################
# Now let's look at the output.
for i, (ali, score) in enumerate(zip(aligned_tokens, alignment_scores)):
    print(f"{i:3d}:\t{ali:2d} [{LABELS[ali]}], {score:.2f}")

######################################################################
#
# .. note::
#
#    The alignment is expressed in the frame coordinate of the emission,
#    which is different from the original waveform.
#
# It contains blank tokens and repeated tokens. The following is the
# interpretation of the non-blank tokens.
#
# .. code-block::
#
#    31:     0 [-], 1.00
#    32:     2 [i], 1.00 "i" starts and ends
#    33:     0 [-], 1.00
#    34:     0 [-], 1.00
#    35:    15 [h], 1.00 "h" starts
#    36:    15 [h], 0.93 "h" ends
#    37:     1 [a], 1.00 "a" starts and ends
#    38:     0 [-], 0.96
#    39:     0 [-], 1.00
#    40:     0 [-], 1.00
#    41:    13 [d], 1.00 "d" starts and ends
#    42:     0 [-], 1.00
#
# .. note::
#
#    When the same token occurs after blank tokens, it is not treated as
#    a repeat, but as a new occurrence.
#
#    .. code-block::
#
#       a a a b -> a b
#       a - - b -> a b
#       a a - b -> a b
#       a - a b -> a a b
#         ^^^       ^^^
#
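The collapse rule described in the note above (merge consecutive repeats, drop blanks, but keep repeats that are separated by a blank) can be written in a few lines. A plain-Python sketch for illustration; torchaudio's own merging is done by `torchaudio.functional.merge_tokens`, used in the next step:

def collapse_ctc(tokens, blank=0):
    """Collapse a frame-level CTC alignment into the output token sequence."""
    out = []
    prev = blank
    for t in tokens:
        # Skip blanks, and skip a token that merely repeats the previous frame.
        if t != blank and t != prev:
            out.append(t)
        prev = t
    return out

print(collapse_ctc([1, 1, 1, 2]))  # a a a b -> [1, 2]
print(collapse_ctc([1, 0, 0, 2]))  # a - - b -> [1, 2]
print(collapse_ctc([1, 0, 1, 2]))  # a - a b -> [1, 1, 2]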
######################################################################
# Token-level alignments
# ~~~~~~~~~~~~~~~~~~~~~~
#
# Next step is to resolve the repetation, so that each alignment does
# not depend on previous alignments.
# :py:func:`torchaudio.functional.merge_tokens` computes the
# :py:class:`~torchaudio.functional.TokenSpan` object, which represents
# which token from the transcript is present at what time span.
######################################################################
#
token_spans
=
F
.
merge_tokens
(
aligned_tokens
,
alignment_scores
)
print
(
"Token
\t
Time
\t
Score"
)
for
s
in
token_spans
:
print
(
f
"
{
LABELS
[
s
.
token
]
}
\t
[
{
s
.
start
:
3
d
}
,
{
s
.
end
:
3
d
}
)
\t
{
s
.
score
:.
2
f
}
"
)
######################################################################
# Word-level alignments
# ~~~~~~~~~~~~~~~~~~~~~
#
# Now let’s group the token-level alignments into word-level alignments.
def unflatten(list_, lengths):
    assert len(list_) == sum(lengths)
    i = 0
    ret = []
    for l in lengths:
        ret.append(list_[i : i + l])
        i += l
    return ret


word_spans = unflatten(token_spans, [len(word) for word in TRANSCRIPT])
######################################################################
# Audio previews
# ~~~~~~~~~~~~~~
#
# Compute average score weighted by the span length
def _score(spans):
    return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)


def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / num_frames
    x0 = int(ratio * spans[0].start)
    x1 = int(ratio * spans[-1].end)
    print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
    segment = waveform[:, x0:x1]
    return IPython.display.Audio(segment.numpy(), rate=sample_rate)


num_frames = emission.size(1)
######################################################################
# Generate the audio for each segment
print(TRANSCRIPT)
IPython.display.Audio(SPEECH_FILE)

######################################################################
#
preview_word(waveform, word_spans[0], num_frames, TRANSCRIPT[0])

######################################################################
#
preview_word(waveform, word_spans[1], num_frames, TRANSCRIPT[1])

######################################################################
#
preview_word(waveform, word_spans[2], num_frames, TRANSCRIPT[2])

######################################################################
#
preview_word(waveform, word_spans[3], num_frames, TRANSCRIPT[3])

######################################################################
#
preview_word(waveform, word_spans[4], num_frames, TRANSCRIPT[4])

######################################################################
#
preview_word(waveform, word_spans[5], num_frames, TRANSCRIPT[5])

######################################################################
#
preview_word(waveform, word_spans[6], num_frames, TRANSCRIPT[6])

######################################################################
#
preview_word(waveform, word_spans[7], num_frames, TRANSCRIPT[7])

######################################################################
#
preview_word(waveform, word_spans[8], num_frames, TRANSCRIPT[8])
######################################################################
# Visualization
# ~~~~~~~~~~~~~
#
# Now let's look at the alignment result and segment the original
# speech into words.
def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / emission.size(1) / sample_rate

    fig, axes = plt.subplots(2, 1)
    axes[0].imshow(emission[0].detach().cpu().T, aspect="auto")
    axes[0].set_title("Emission")
    axes[0].set_xticks([])

    axes[1].specgram(waveform[0], Fs=sample_rate)
    for t_spans, chars in zip(token_spans, transcript):
        t0, t1 = t_spans[0].start + 0.1, t_spans[-1].end - 0.1
        axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white")
        axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white")
        axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False)

        for span, char in zip(t_spans, chars):
            t0 = span.start * ratio
            axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False)

    axes[1].set_xlabel("time [second]")
    axes[1].set_xlim([0, None])
    fig.tight_layout()
######################################################################
#
plot_alignments(waveform, word_spans, emission, TRANSCRIPT)
######################################################################
#
# Inconsistent treatment of ``blank`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# When splitting the token-level alignments into words, you will
# notice that some blank tokens are treated differently, and this makes
# the interpretation of the result somewhat ambiguous.
#
# This is easy to see when we plot the scores. The following figure
# shows word regions and non-word regions, with the frame-level scores
# of non-blank tokens.
def plot_scores(word_spans, scores):
    fig, ax = plt.subplots()
    span_xs, span_hs = [], []
    ax.axvspan(word_spans[0][0].start - 0.05, word_spans[-1][-1].end + 0.05, facecolor="paleturquoise", edgecolor="none", zorder=-1)
    for t_span in word_spans:
        for span in t_span:
            for t in range(span.start, span.end):
                span_xs.append(t + 0.5)
                span_hs.append(scores[t].item())
            ax.annotate(LABELS[span.token], (span.start, -0.07))
        ax.axvspan(t_span[0].start - 0.05, t_span[-1].end + 0.05, facecolor="mistyrose", edgecolor="none", zorder=-1)
    ax.bar(span_xs, span_hs, color="lightsalmon", edgecolor="coral")
    ax.set_title("Frame-level scores and word segments")
    ax.set_ylim(-0.1, None)
    ax.grid(True, axis="y")
    ax.axhline(0, color="black")
    fig.tight_layout()


plot_scores(word_spans, alignment_scores)
######################################################################
# In this plot, the blank tokens are those highlighted area without
# vertical bar.
# You can see that there are blank tokens which are interpreted as
# part of a word (highlighted red), while the others (highlighted blue)
# are not.
#
# One reason for this is that the model was trained without a
# label for the word boundary. The blank tokens are treated not just
# as repetition but also as silence between words.
#
# But then, a question arises. Should frames immediately after or
# near the end of a word be silent or repeat?
#
# In the above example, if you go back to the previous plot of
# spectrogram and word regions, you see that after "y" in "curiosity",
# there is still some activities in multiple frequency buckets.
#
# Would it be more accurate if that frame was included in the word?
#
# Unfortunately, CTC does not provide a comprehensive solution to this.
# Models trained with CTC are known to exhibit "peaky" response,
# that is, they tend to spike for an occurrence of a label, but the
# spike does not last for the duration of the label.
# (Note: Pre-trained Wav2Vec2 models tend to spike at the beginning of
# label occurrences, but this is not always the case.)
#
# :cite:`zeyer2021does` has an in-depth analysis of the peaky behavior of
# CTC.
# We encourage those who are interested in understanding more to refer
# to the paper.
# The following is a quote from the paper, which is the exact issue we
# are facing here.
#
# *Peaky behavior can be problematic in certain cases,*
# *e.g. when an application requires to not use the blank label,*
# *e.g. to get meaningful time accurate alignments of phonemes*
# *to a transcription.*
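######################################################################
# As a rough way of quantifying this ambiguity on our example, the
# following sketch (not part of the alignment pipeline; it only assumes
# the ``word_spans`` and ``aligned_tokens`` computed above) counts how
# many frames inside the detected word regions carry the blank token:
#
# .. code-block:: python
#
#    def blank_ratio(word_spans, aligned_tokens, blank=0):
#        total, blanks = 0, 0
#        for spans in word_spans:
#            for frame in range(spans[0].start, spans[-1].end):
#                total += 1
#                if aligned_tokens[frame] == blank:
#                    blanks += 1
#        return blanks / total
#
#    print(f"{blank_ratio(word_spans, aligned_tokens):.2%} of in-word frames are blank")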
######################################################################
# Advanced: Handling transcripts with ``<star>`` token
# ----------------------------------------------------
#
# Now let’s look at how we can improve alignment quality when the transcript
# is partially missing, using the ``<star>`` token, which is capable of
# modeling any token.
#
# Here we use the same English example as used above. But we remove the
# beginning text ``“i had that curiosity beside me at”`` from the transcript.
# Aligning audio with such transcript results in wrong alignments of the
# existing word “this”. However, this issue can be mitigated by using the
# ``<star>`` token to model the missing text.
#
######################################################################
# First, we extend the dictionary to include the ``<star>`` token.
DICTIONARY["*"] = len(DICTIONARY)
######################################################################
# Next, we extend the emission tensor with the extra dimension
# corresponding to the ``<star>`` token.
#
star_dim = torch.zeros((1, emission.size(1), 1), device=emission.device, dtype=emission.dtype)
emission = torch.cat((emission, star_dim), 2)

assert len(DICTIONARY) == emission.shape[2]

plot_emission(emission[0])
######################################################################
# The following function combines all the processes, and computes
# word segments from the emission in one go.
def compute_alignments(emission, transcript, dictionary):
    tokens = [dictionary[char] for word in transcript for char in word]
    alignment, scores = align(emission, tokens)
    token_spans = F.merge_tokens(alignment, scores)
    word_spans = unflatten(token_spans, [len(word) for word in transcript])
    return word_spans
######################################################################
# Full Transcript
# ~~~~~~~~~~~~~~~
word_spans = compute_alignments(emission, TRANSCRIPT, DICTIONARY)
plot_alignments(waveform, word_spans, emission, TRANSCRIPT)
######################################################################
# Partial Transcript with ``<star>`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Now we replace the first part of the transcript with the ``<star>`` token.
transcript = "* this moment".split()
word_spans = compute_alignments(emission, transcript, DICTIONARY)
plot_alignments(waveform, word_spans, emission, transcript)
######################################################################
#
preview_word(waveform, word_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, word_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, word_spans[2], num_frames, transcript[2])
######################################################################
# Partial Transcript without ``<star>`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# As a comparison, the following aligns the partial transcript
# without using ``<star>`` token.
# It demonstrates the effect of ``<star>`` token for dealing with deletion errors.
transcript = "this moment".split()
word_spans = compute_alignments(emission, transcript, DICTIONARY)
plot_alignments(waveform, word_spans, emission, transcript)
######################################################################
# Conclusion
# ----------
#
# In this tutorial, we looked at how to use torchaudio’s forced alignment
# API to align and segment speech files, and demonstrated one advanced usage:
# How introducing a ``<star>`` token could improve alignment accuracy when
# transcription errors exist.
#
######################################################################
# Acknowledgement
# ---------------
#
# Thanks to `Vineel Pratap <vineelkpratap@meta.com>`__ and `Zhaoheng
# Ni <zni@meta.com>`__ for developing and open-sourcing the
# forced aligner API.
examples/tutorials/device_asr.py
View file @
ffeba11a
...
@@ -7,26 +7,23 @@ Device ASR with Emformer RNN-T
 This tutorial shows how to use Emformer RNN-T and streaming API
 to perform speech recognition on a streaming device input, i.e. microphone
 on laptop.
-.. note::
-   This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece.
-   There are multiple ways to install FFmpeg libraries.
-   If you are using Anaconda Python distribution,
-   ``conda install 'ffmpeg<4.4'`` will install
-   the required FFmpeg libraries.
-   You can install SentencePiece by running ``pip install sentencepiece``.
-.. note::
-   This tutorial was tested on MacBook Pro and Dynabook with Windows 10.
-   This tutorial does NOT work on Google Colab because the server running
-   this tutorial does not have a microphone that you can talk to.
 """
+######################################################################
+#
+# .. note::
+#
+#    This tutorial requires FFmpeg libraries.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
+#
+# .. note::
+#
+#    This tutorial was tested on MacBook Pro and Dynabook with Windows 10.
+#
+#    This tutorial does NOT work on Google Colab because the server running
+#    this tutorial does not have a microphone that you can talk to.
 ######################################################################
 # 1. Overview
 # -----------
...
examples/tutorials/effector_tutorial.py
0 → 100644
View file @
ffeba11a
"""
AudioEffector Usages
====================
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use :py:class:`torchaudio.io.AudioEffector` to
apply various effects and codecs to waveform tensor.
"""
######################################################################
#
# .. note::
#
# This tutorial requires FFmpeg libraries.
# Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
# the detail.
#
######################################################################
# Overview
# --------
#
# :py:class:`~torchaudio.io.AudioEffector` combines in-memory encoding,
# decoding and filtering that are provided by
# :py:class:`~torchaudio.io.StreamWriter` and
# :py:class:`~torchaudio.io.StreamReader`.
#
# The following figure illustrates the process.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/AudioEffector.png
#
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#
from torchaudio.io import AudioEffector, CodecConfig

import matplotlib.pyplot as plt
from IPython.display import Audio

######################################################################
#
for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items():
    print(k, v)
######################################################################
# Usage
# -----
#
# To use ``AudioEffector``, instantiate it with ``effect`` and
# ``format``, then either pass the waveform to
# :py:meth:`~torchaudio.io.AudioEffector.apply` or
# :py:meth:`~torchaudio.io.AudioEffector.stream` method.
#
# .. code:: python
#
# effector = AudioEffector(effect=..., format=...,)
#
# # Apply at once
# applied = effector.apply(waveform, sample_rate)
#
# The ``apply`` method applies the effect and codec to the entire waveform at
# once. So if the input waveform is long and memory consumption is an
# issue, one can use the ``stream`` method to process it chunk by chunk.
#
# .. code:: python
#
#    # Apply chunk by chunk
#    for applied_chunk in effector.stream(waveform, sample_rate):
#        ...
#
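######################################################################
# As a rough sketch (assuming the applied effect and codec do not change
# the number of frames), the streamed chunks can be re-assembled into a
# result equivalent to ``apply`` by concatenating along the time axis:
#
# .. code:: python
#
#    # chunks are (time, channel) Tensors when the input is channels-last
#    chunks = list(effector.stream(waveform, sample_rate))
#    applied = torch.cat(chunks, dim=0)
#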
######################################################################
# Example
# -------
#
src = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")

waveform, sr = torchaudio.load(src, channels_first=False)
######################################################################
# Gallery
# -------
#
def show(effect, *, stereo=False):
    wf = torch.cat([waveform] * 2, dim=1) if stereo else waveform
    figsize = (6.4, 2.1 if stereo else 1.2)

    effector = AudioEffector(effect=effect, pad_end=False)
    result = effector.apply(wf, int(sr))

    num_channels = result.size(1)

    f, ax = plt.subplots(num_channels, 1, squeeze=False, figsize=figsize, sharex=True)
    for i in range(num_channels):
        ax[i][0].specgram(result[:, i], Fs=sr)
    f.set_tight_layout(True)

    return Audio(result.numpy().T, rate=sr)


######################################################################
# Original
# --------
#
show(effect=None)
######################################################################
# Effects
# -------
#
######################################################################
# tempo
# ~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#atempo
show("atempo=0.7")

######################################################################
#
show("atempo=1.8")

######################################################################
# highpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#highpass
show("highpass=frequency=1500")

######################################################################
# lowpass
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#lowpass
show("lowpass=frequency=1000")

######################################################################
# allpass
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#allpass
show("allpass")

######################################################################
# bandpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#bandpass
show("bandpass=frequency=3000")

######################################################################
# bandreject
# ~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#bandreject
show("bandreject=frequency=3000")

######################################################################
# echo
# ~~~~
# https://ffmpeg.org/ffmpeg-filters.html#aecho
show("aecho=in_gain=0.8:out_gain=0.88:delays=6:decays=0.4")

######################################################################
#
show("aecho=in_gain=0.8:out_gain=0.88:delays=60:decays=0.4")

######################################################################
#
show("aecho=in_gain=0.8:out_gain=0.9:delays=1000:decays=0.3")

######################################################################
# chorus
# ~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#chorus
show("chorus=0.5:0.9:50|60|40:0.4|0.32|0.3:0.25|0.4|0.3:2|2.3|1.3")

######################################################################
# fft filter
# ~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#afftfilt
# fmt: off
show(
    "afftfilt="
    "real='re * (1-clip(b * (b/nb), 0, 1))':"
    "imag='im * (1-clip(b * (b/nb), 0, 1))'"
)

######################################################################
#
show(
    "afftfilt="
    "real='hypot(re,im) * sin(0)':"
    "imag='hypot(re,im) * cos(0)':"
    "win_size=512:"
    "overlap=0.75"
)

######################################################################
#
show(
    "afftfilt="
    "real='hypot(re,im) * cos(2 * 3.14 * (random(0) * 2-1))':"
    "imag='hypot(re,im) * sin(2 * 3.14 * (random(1) * 2-1))':"
    "win_size=128:"
    "overlap=0.8"
)
# fmt: on

######################################################################
# vibrato
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#vibrato
show("vibrato=f=10:d=0.8")

######################################################################
# tremolo
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#tremolo
show("tremolo=f=8:d=0.8")

######################################################################
# crystalizer
# ~~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#crystalizer
show("crystalizer")

######################################################################
# flanger
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#flanger
show("flanger")

######################################################################
# phaser
# ~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#aphaser
show("aphaser")

######################################################################
# pulsator
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#apulsator
show("apulsator", stereo=True)

######################################################################
# haas
# ~~~~
# https://ffmpeg.org/ffmpeg-filters.html#haas
show("haas")
######################################################################
# Codecs
# ------
#
def show_multi(configs):
    results = []
    for config in configs:
        effector = AudioEffector(**config)
        results.append(effector.apply(waveform, int(sr)))

    num_configs = len(configs)
    figsize = (6.4, 0.3 + num_configs * 0.9)
    f, axes = plt.subplots(num_configs, 1, figsize=figsize, sharex=True)
    for result, ax in zip(results, axes):
        ax.specgram(result[:, 0], Fs=sr)
    f.set_tight_layout(True)

    return [Audio(r.numpy().T, rate=sr) for r in results]
######################################################################
# ogg
# ~~~
#
results = show_multi(
    [
        {"format": "ogg"},
        {"format": "ogg", "encoder": "vorbis"},
        {"format": "ogg", "encoder": "opus"},
    ]
)

######################################################################
# ogg - default encoder (flac)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
results[0]

######################################################################
# ogg - vorbis
# ^^^^^^^^^^^^
#
results[1]

######################################################################
# ogg - opus
# ^^^^^^^^^^
#
results[2]
######################################################################
# mp3
# ~~~
# https://trac.ffmpeg.org/wiki/Encode/MP3
results = show_multi(
    [
        {"format": "mp3"},
        {"format": "mp3", "codec_config": CodecConfig(compression_level=1)},
        {"format": "mp3", "codec_config": CodecConfig(compression_level=9)},
        {"format": "mp3", "codec_config": CodecConfig(bit_rate=192_000)},
        {"format": "mp3", "codec_config": CodecConfig(bit_rate=8_000)},
        {"format": "mp3", "codec_config": CodecConfig(qscale=9)},
        {"format": "mp3", "codec_config": CodecConfig(qscale=1)},
    ]
)

######################################################################
# default
# ^^^^^^^
results[0]

######################################################################
# compression_level=1
# ^^^^^^^^^^^^^^^^^^^
results[1]

######################################################################
# compression_level=9
# ^^^^^^^^^^^^^^^^^^^
results[2]

######################################################################
# bit_rate=192k
# ^^^^^^^^^^^^^
results[3]

######################################################################
# bit_rate=8k
# ^^^^^^^^^^^
results[4]

######################################################################
# qscale=9
# ^^^^^^^^
results[5]

######################################################################
# qscale=1
# ^^^^^^^^
results[6]
######################################################################
#
# Tag: :obj:`torchaudio.io`
examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
0 → 100644
View file @
ffeba11a
"""
Forced alignment for multilingual data
======================================
**Authors**: `Xiaohui Zhang <xiaohuizhang@meta.com>`__, `Moto Hira <moto@meta.com>`__.
This tutorial shows how to align transcript to speech for non-English languages.
The process of aligning non-English (normalized) transcript is identical to aligning
English (normalized) transcript, and the process for English is covered in detail in
`CTC forced alignment tutorial <./ctc_forced_alignment_api_tutorial.html>`__.
In this tutorial, we use TorchAudio's high-level API,
:py:class:`torchaudio.pipelines.Wav2Vec2FABundle`, which packages the pre-trained
model, tokenizer and aligner, to perform the forced alignment with less code.
"""
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

######################################################################
#
from typing import List

import IPython
import matplotlib.pyplot as plt
######################################################################
# Creating the pipeline
# ---------------------
#
# First, we instantiate the model and pre/post-processing pipelines.
#
# The following diagram illustrates the process of alignment.
#
# .. image:: https://download.pytorch.org/torchaudio/doc-assets/pipelines-wav2vec2fabundle.png
#
# The waveform is passed to an acoustic model, which produces a sequence of
# probability distributions over tokens.
# The transcript is passed to the tokenizer, which converts the transcript to
# a sequence of tokens.
# The aligner takes the results from the acoustic model and the tokenizer and generates
# timestamps for each token.
#
# .. note::
#
# This process expects that the input transcript is already normalized.
# The process of normalization, which involves romanization of non-English
# languages, is language-dependent, so it is not covered in this tutorial,
# but we will briefly look into it.
#
# The acoustic model and the tokenizer must use the same set of tokens.
# To facilitate the creation of matching processors,
# :py:class:`~torchaudio.pipelines.Wav2Vec2FABundle` associates a
# pre-trained acoustic model and a tokenizer.
# :py:data:`torchaudio.pipelines.MMS_FA` is one such instance.
#
# The following code instantiates a pre-trained acoustic model, a tokenizer
# which uses the same set of tokens as the model, and an aligner.
#
from torchaudio.pipelines import MMS_FA as bundle

model = bundle.get_model()
model.to(device)

tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()
######################################################################
# .. note::
#
# The model instantiated by :py:data:`~torchaudio.pipelines.MMS_FA`'s
# :py:meth:`~torchaudio.pipelines.Wav2Vec2FABundle.get_model`
# method by default includes the feature dimension for ``<star>`` token.
# You can disable this by passing ``with_star=False``.
#
######################################################################
# The acoustic model of :py:data:`~torchaudio.pipelines.MMS_FA` was
# created and open-sourced as part of the research project,
# `Scaling Speech Technology to 1,000+ Languages
# <https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/>`__.
# It was trained with 23,000 hours of audio from 1100+ languages.
#
# The tokenizer simply maps the normalized characters to integers.
# You can check the mapping as follows:
print(bundle.get_dict())
######################################################################
#
# The aligner internally uses :py:func:`torchaudio.functional.forced_align`
# and :py:func:`torchaudio.functional.merge_tokens` to infer the time
# stamps of the input tokens.
#
# The detail of the underlying mechanism is covered in
# `CTC forced alignment API tutorial <./ctc_forced_alignment_api_tutorial.html>`__,
# so please refer to it.
######################################################################
# We define a utility function that performs the forced alignment with
# the above model, the tokenizer and the aligner.
#
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    with torch.inference_mode():
        emission, _ = model(waveform.to(device))
        token_spans = aligner(emission[0], tokenizer(transcript))
    return emission, token_spans
######################################################################
# We also define utility functions for plotting the result and previewing
# the audio segments.
# Compute average score weighted by the span length
def _score(spans):
    return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)


def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / emission.size(1) / sample_rate

    fig, axes = plt.subplots(2, 1)
    axes[0].imshow(emission[0].detach().cpu().T, aspect="auto")
    axes[0].set_title("Emission")
    axes[0].set_xticks([])

    axes[1].specgram(waveform[0], Fs=sample_rate)
    for t_spans, chars in zip(token_spans, transcript):
        t0, t1 = t_spans[0].start, t_spans[-1].end
        axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white")
        axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white")
        axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False)

        for span, char in zip(t_spans, chars):
            t0 = span.start * ratio
            axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False)

    axes[1].set_xlabel("time [second]")
    fig.tight_layout()
######################################################################
#
def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / num_frames
    x0 = int(ratio * spans[0].start)
    x1 = int(ratio * spans[-1].end)
    print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
    segment = waveform[:, x0:x1]
    return IPython.display.Audio(segment.numpy(), rate=sample_rate)
######################################################################
# Normalizing the transcript
# --------------------------
#
# The transcripts passed to the pipeline must be normalized beforehand.
# The exact process of normalization depends on language.
#
# Languages that do not have explicit word boundaries
# (such as Chinese, Japanese and Korean) require segmentation first.
# There are dedicated tools for this, but let's say we have a segmented
# transcript.
#
# The first step of normalization is romanization.
# `uroman <https://github.com/isi-nlp/uroman>`__ is a tool that
# supports many languages.
#
# Here is a BASH command to romanize the input text file and write
# the output to another text file using ``uroman``.
#
# .. code-block:: bash
#
# $ echo "des événements d'actualité qui se sont produits durant l'année 1882" > text.txt
# $ uroman/bin/uroman.pl < text.txt > text_romanized.txt
# $ cat text_romanized.txt
#
# .. code-block:: text
#
# Cette page concerne des evenements d'actualite qui se sont produits durant l'annee 1882
#
# The next step is to remove non-alphabets and punctuations.
# The following snippet normalizes the romanized transcript.
#
# .. code-block:: python
#
# import re
#
#
# def normalize_uroman(text):
# text = text.lower()
# text = text.replace("’", "'")
# text = re.sub("([^a-z' ])", " ", text)
# text = re.sub(' +', ' ', text)
# return text.strip()
#
#
# with open("text_romanized.txt", "r") as f:
# for line in f:
# text_normalized = normalize_uroman(line)
# print(text_normalized)
#
# Running the script on the above example produces the following.
#
# .. code-block:: text
#
# cette page concerne des evenements d'actualite qui se sont produits durant l'annee
#
# Note that, in this example, since "1882" was not romanized by ``uroman``,
# it was removed in the normalization step.
# To avoid this, one needs to romanize numbers, but this is known to be a non-trivial task.
#
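######################################################################
# For instance, a number-to-words package such as ``num2words`` (used
# here only for illustration; it is not a dependency of this tutorial)
# could spell out numerals before romanization:
#
# .. code-block:: python
#
#    from num2words import num2words
#
#    print(num2words(1882, lang="fr"))  # mille huit cent quatre-vingt-deux
#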
######################################################################
# Aligning transcripts to speech
# ------------------------------
#
# Now we perform the forced alignment for multiple languages.
#
#
# German
# ~~~~~~
text_raw = "aber seit ich bei ihnen das brot hole"
text_normalized = "aber seit ich bei ihnen das brot hole"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/10349_8674_000087.flac"
waveform, sample_rate = torchaudio.load(
    url, frame_offset=int(0.5 * bundle.sample_rate), num_frames=int(2.5 * bundle.sample_rate)
)
######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
tokens = tokenizer(transcript)

emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
# Chinese
# ~~~~~~~
#
# Chinese is a character-based language, and there is no explicit word-level
# tokenization (separated by spaces) in its raw written form. In order to
# obtain word-level alignments, you need to first tokenize the transcripts
# at the word level using a word tokenizer like `“Stanford
# Tokenizer” <https://michelleful.github.io/code-blog/2015/09/10/parsing-chinese-with-stanford/>`__
# (a minimal sketch with an off-the-shelf segmenter is shown below).
# However, this is not needed if you only want character-level alignments.
#
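######################################################################
# A minimal segmentation sketch, assuming the third-party ``jieba``
# package (any word segmenter would do; it is not a dependency of this
# tutorial):
#
# .. code-block:: python
#
#    import jieba
#
#    words = list(jieba.cut("关服务高端产品仍处于供不应求的局面"))
#    print(" ".join(words))
#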
text_raw = "关 服务 高端 产品 仍 处于 供不应求 的 局面"
text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian"

######################################################################
#
url = "https://download.pytorch.org/torchaudio/tutorial-assets/mvdr/clean_speech.wav"
waveform, sample_rate = torchaudio.load(url)
waveform = waveform[0:1]

######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])

######################################################################
#
preview_word(waveform, token_spans[8], num_frames, transcript[8])
######################################################################
# Polish
# ~~~~~~
text_raw = "wtedy ujrzałem na jego brzuchu okrągłą czarną ranę"
text_normalized = "wtedy ujrzalem na jego brzuchu okragla czarna rane"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/5090_1447_000088.flac"
waveform, sample_rate = torchaudio.load(url, num_frames=int(4.5 * bundle.sample_rate))

######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
# Portuguese
# ~~~~~~~~~~
text_raw = "na imensa extensão onde se esconde o inconsciente imortal"
text_normalized = "na imensa extensao onde se esconde o inconsciente imortal"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/6566_5323_000027.flac"
waveform, sample_rate = torchaudio.load(
    url, frame_offset=int(bundle.sample_rate), num_frames=int(4.6 * bundle.sample_rate)
)

######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])

######################################################################
#
preview_word(waveform, token_spans[8], num_frames, transcript[8])
######################################################################
# Italian
# ~~~~~~~
text_raw = "elle giacean per terra tutte quante"
text_normalized = "elle giacean per terra tutte quante"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/642_529_000025.flac"
waveform, sample_rate = torchaudio.load(url, num_frames=int(4 * bundle.sample_rate))

######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])
######################################################################
# Conclusion
# ----------
#
# In this tutorial, we looked at how to use torchaudio’s forced alignment
# API and a Wav2Vec2 pre-trained multilingual acoustic model to align
# speech data to transcripts in five languages.
#
######################################################################
# Acknowledgement
# ---------------
#
# Thanks to `Vineel Pratap <vineelkpratap@meta.com>`__ and `Zhaoheng
# Ni <zni@meta.com>`__ for developing and open-sourcing the
# forced aligner API.
#