Commit ffeba11a authored by mayp777

UPDATE

parent 29deb085
from ._hubert_loss import hubert_loss
from ._wav2vec2_loss import wav2vec2_loss  # module name assumed to mirror ``_hubert_loss``
__all__ = [
"hubert_loss",
"wav2vec2_loss",
]
from typing import Optional, Tuple
import torch
import torch.nn.functional as F
from torch import Tensor
def hubert_loss(
logit_m: Optional[Tensor],
logit_u: Optional[Tensor],
feature_penalty: Tensor,
label: Optional[Tensor] = None,
masked_weight: float = 1.0,
unmasked_weight: float = 0.0,
feature_weight: float = 10.0,
reduction: str = "sum",
) -> Tuple[Tensor, float]:
"""Compute the cross-entropy loss on HuBERT masked and non-masked logits.
Args:
logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`.
logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`.
feature_penalty (Tensor): The feature mean value for additional penalty loss.
label (Tensor or None, optional): Not used in the loss computation. (Default: ``None``)
masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``).
unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``).
feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``).
reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``).
Returns:
(Tensor, float)
Tensor: The desired loss Tensor.
float: Number of frames used in loss computation.
"""
num_frame = 0.0
loss = 0.0
if logit_m is not None:
target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device)
loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction)
loss += loss_m * masked_weight
num_frame += logit_m.shape[0]
if logit_u is not None:
target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_u.device)
loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction)
loss += loss_u * unmasked_weight
num_frame += logit_u.shape[0]
loss += feature_penalty * feature_weight * num_frame
return loss, num_frame
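# Example usage: a minimal sketch with dummy inputs (all shapes below are
# made up for illustration): 10 masked frames, 20 unmasked frames, 100
# classes, and a scalar stand-in for the feature penalty.
example_logit_m = torch.randn(10, 100)
example_logit_u = torch.randn(20, 100)
example_feature_penalty = torch.tensor(0.25)
example_loss, example_num_frame = hubert_loss(example_logit_m, example_logit_u, example_feature_penalty)
# With the default weights, only the masked cross-entropy contributes
# (unmasked_weight is 0.0), plus the feature penalty scaled by 10.0 and the
# total frame count (30 here).
print(example_loss.item(), example_num_frame)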
from typing import Tuple
import torch
import torch.nn.functional as F
from torch import Tensor
def compute_contrastive_loss(
x: Tensor,
mask_indices: Tensor,
targets: Tensor,
neg_is_pos: Tensor,
reduction: str = "none",
logit_temp: float = 0.1,
):
"""
Computes the contrastive loss used in Wav2Vec2 loss function.
Args:
x (Tensor): Input embeddings of shape `(batch_size, sequence_length, hidden_size)`.
mask_indices (Tensor): Indices to mask negative samples of shape `(batch_size, sequence_length)`.
targets (Tensor): The positive and negative sample embeddings, stacked along the first dimension.
Tensor of shape `(num_negative + 1, batch, masked_sequence_length, hidden_size)`.
neg_is_pos (Tensor): Boolean tensor indicating which negative samples are identical to the positive sample.
Tensor of shape `(num_negative, batch, masked_sequence_length)`.
reduction (str): Reduction type ("sum" or "none").
logit_temp (float, optional): Temperature scaling factor for logits, defaults to 0.1.
Returns:
The computed contrastive loss Tensor and the sample size (the number of masked frames).
"""
x = x[mask_indices].view(x.size(0), -1, x.size(-1)).unsqueeze(0).expand(targets.shape)
logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).float()
logits /= logit_temp
if neg_is_pos.any():
logits[1:][neg_is_pos] = float("-inf")
target = logits.new_zeros(logits.size(1) * logits.size(2), dtype=torch.long, device=logits.device)
logits = logits.transpose(0, 2)
logits = logits.reshape(-1, logits.size(-1))
loss = F.cross_entropy(
logits,
target,
reduction=reduction,
)
sample_size = target.numel()
return loss, sample_size
def wav2vec2_loss(
x: Tensor, mask_indices: Tensor, positives: Tensor, negatives: Tensor, reduction: str = "none"
) -> Tuple[Tensor, float]:
"""Compute Wav2Vec2 loss.
Args:
x (Tensor): The masked sequences of Wav2Vec 2.0 model.
Tensor of shape `(batch_size, sequence_length, hidden_size)`.
mask_indices (Tensor): The mask indices. Tensor of shape `(batch_size, sequence_length)`
positives (Tensor): The positives, prior to negative sampling.
Tensor of shape `(batch_size, masked_sequence_length, hidden_size)`
negatives (Tensor): The negative samples.
Tensor of shape `(num_negative, batch_size, masked_sequence_length, hidden_size)`
reduction (str, optional): The reduction method for cross-entropy loss, either ``"sum"`` or ``"none"``. (Default: ``"none"``)
Returns:
(Tensor, float)
Tensor: The desired loss Tensor.
float: The sample size, i.e., the number of masked frames.
"""
assert positives is not None
assert mask_indices is not None
assert mask_indices.sum() == positives.shape[0] * positives.shape[1]
neg_is_pos = (positives == negatives).all(-1)
positives = positives.unsqueeze(0)
targets = torch.cat([positives, negatives], dim=0)
loss, sample_size = compute_contrastive_loss(x, mask_indices, targets, neg_is_pos, reduction)
return loss, sample_size
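# Example usage: a minimal sketch with toy shapes (all sizes are made up for
# illustration): a batch of 2 sequences of length 10 with hidden size 8,
# 4 masked frames per sequence, and 5 negatives per masked frame. Note that
# every sequence must contain the same number of masked frames, because the
# masked frames are reshaped to `(batch, masked_sequence_length, hidden_size)`.
example_x = torch.randn(2, 10, 8)
example_mask = torch.zeros(2, 10, dtype=torch.bool)
example_mask[:, 2:6] = True  # 4 masked frames in each sequence
example_positives = torch.randn(2, 4, 8)
example_negatives = torch.randn(5, 2, 4, 8)
example_loss, example_sample_size = wav2vec2_loss(example_x, example_mask, example_positives, example_negatives)
# With the default reduction ("none"), the loss has one entry per masked frame (8 here).
print(example_loss.shape, example_sample_size)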
from ._linear_decay import LinearDecayLRScheduler
__all__ = [
"LinearDecayLRScheduler",
]
import torch
from torch.optim.optimizer import Optimizer
class LinearDecayLRScheduler(torch.optim.lr_scheduler._LRScheduler):
"""Linear learning rate scheduler with warm up."""
def __init__(
self,
optimizer: Optimizer,
warmup_updates: int,
max_updates: int,
last_epoch: int = -1,
verbose: bool = False,
):
self.warmup_updates = warmup_updates
self.max_updates = max_updates
super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose)
def get_lr(self):
if self._step_count <= self.warmup_updates:
return [self._step_count / self.warmup_updates * base_lr for base_lr in self.base_lrs]
elif self._step_count >= self.max_updates:
return [0.0 for _ in self.base_lrs]
else:
pct_remaining = (self.max_updates - self._step_count) / (self.max_updates - self.warmup_updates)
return [base_lr * pct_remaining for base_lr in self.base_lrs]
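# Example usage: a minimal sketch (the numbers are made up for illustration).
# The learning rate ramps up linearly over the first 4 steps and then decays
# linearly to zero by step 10.
example_param = torch.nn.Parameter(torch.zeros(1))
example_optimizer = torch.optim.AdamW([example_param], lr=0.001)
example_scheduler = LinearDecayLRScheduler(example_optimizer, warmup_updates=4, max_updates=10)
for _ in range(10):
    example_optimizer.step()
    example_scheduler.step()
    # Rises toward 0.001 during warmup, then decreases toward 0.0.
    print(example_scheduler.get_last_lr()[0])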
import logging
import pathlib
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, RawDescriptionHelpFormatter
from functools import partial
from typing import Dict, Tuple
import torch
import torchaudio.models
from lightning.pytorch import seed_everything, Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from .data_modules import HuBERTDataModule
from .lightning_modules import SSLPretrainModule
from .losses import hubert_loss
from .lr_schedulers import LinearDecayLRScheduler
class _Formatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
# To use ArgumentDefaultsHelpFormatter as the formatter_class and
# RawDescriptionHelpFormatter to add custom formatting to description or epilog.
# Check: https://stackoverflow.com/a/18462760
pass
def _compute_accuracy(logits: torch.Tensor):
with torch.no_grad():
# Index 0 is the positive target: a frame counts as correct when its largest logit is at index 0.
max = logits.argmax(-1) == 0
min = logits.argmin(-1) == 0
# If the largest and smallest logits are both at index 0, all logits are equal,
# so the frame is excluded from the correct count.
both = max & min
corr = max.long().sum().item() - both.long().sum().item()
count = max.numel()
return corr / count
class HuBERTModule(SSLPretrainModule):
def configure_optimizers(self):
return (
[self.optimizer],
[
{
"scheduler": self.lr_scheduler,
"interval": "step",
},
],
)
def log_metric(self, batch: Dict, output: Tuple, loss: torch.Tensor, step_type: str):
logit_m, logit_u, _ = output
self.log(
f"{step_type}_loss",
loss.item(),
on_step=True,
on_epoch=True,
)
acc_m = _compute_accuracy(logit_m)
acc_u = _compute_accuracy(logit_u)
self.log(
f"{step_type}_acc_m",
acc_m,
on_step=True,
on_epoch=True,
sync_dist=True,
prog_bar=step_type == "train",
)
self.log(
f"{step_type}_acc_u",
acc_u,
on_step=True,
on_epoch=True,
sync_dist=True,
prog_bar=step_type == "train",
)
def run_train(args):
seed_everything(1337)
checkpoint_dir = args.exp_dir / f"checkpoints_{args.dataset}_{args.model_name}"
checkpoint = ModelCheckpoint(
checkpoint_dir,
monitor="val_loss",
mode="min",
save_top_k=5,
save_weights_only=False,
verbose=True,
)
train_checkpoint = ModelCheckpoint(
checkpoint_dir,
monitor="train_loss",
mode="min",
save_top_k=5,
save_weights_only=False,
verbose=True,
)
callbacks = [
checkpoint,
train_checkpoint,
]
trainer = Trainer(
default_root_dir=args.exp_dir,
max_steps=args.max_updates,
num_nodes=args.num_nodes,
devices=args.gpus,
accelerator="gpu",
strategy="ddp_find_unused_parameters_true",
precision=args.precision,
accumulate_grad_batches=args.accumulate_grad_batches,
gradient_clip_val=args.clip_norm,
use_distributed_sampler=False,
callbacks=callbacks,
reload_dataloaders_every_n_epochs=1,
)
if args.model_name not in ["hubert_pretrain_base", "hubert_pretrain_large", "hubert_pretrain_xlarge"]:
raise ValueError(
"Expect model_name to be one of 'hubert_pretrain_base', 'hubert_pretrain_large', 'hubert_pretrain_xlarge'."
f"Found {args.model_name}."
)
model = getattr(torchaudio.models, args.model_name)()
loss_fn = partial(
hubert_loss,
masked_weight=args.masked_weight,
unmasked_weight=args.unmasked_weight,
feature_weight=args.feature_weight,
)
optimizer = torch.optim.AdamW(
model.parameters(),
lr=args.learning_rate,
betas=args.betas,
eps=args.eps,
weight_decay=args.weight_decay,
)
lr_scheduler = LinearDecayLRScheduler(optimizer, args.warmup_updates, args.max_updates)
lightning_module = HuBERTModule(
model,
loss_fn,
optimizer,
lr_scheduler,
)
data_module = HuBERTDataModule(
dataset_path=args.dataset_path,
dataset="librispeech",
feature_type="mfcc",
seconds_per_batch=args.seconds_per_batch,
train_shuffle=True,
num_workers=10,
)
trainer.fit(lightning_module, datamodule=data_module)
def _parse_args():
parser = ArgumentParser(
description=__doc__,
formatter_class=_Formatter,
)
parser.add_argument(
"--dataset-path",
type=pathlib.Path,
required=True,
help="Path to the feature and label directories.",
)
parser.add_argument(
"--resume-checkpoint",
type=pathlib.Path,
default=None,
help="Path to the feature and label directories. (Default: None)",
)
parser.add_argument(
"--feature-type",
choices=["mfcc", "hubert"],
type=str,
required=True,
)
parser.add_argument(
"--feature-grad-mult",
default=0.1,
type=float,
help="The scaling factor to multiply the feature extractor gradient. (Default: 0.1)",
)
parser.add_argument(
"--num-classes",
choices=[100, 500],
type=int,
required=True,
help="The ``num_class`` when building the hubert_pretrain_base model.",
)
parser.add_argument(
"--model-name",
default="hubert_pretrain_base",
choices=[
"hubert_pretrain_base",
"hubert_pretrain_large",
"hubert_pretrain_xlarge",
],
type=str,
help="The HuBERT model to train. (Default: 'hubert_pretrain_base')",
)
parser.add_argument(
"--exp-dir",
default=pathlib.Path("./exp"),
type=pathlib.Path,
help="Directory to save checkpoints and logs to. (Default: './exp')",
)
parser.add_argument(
"--dataset",
default="librispeech",
choices=["librispeech", "librilight"],
type=str,
help="The dataset for training. (Default: 'librispeech')",
)
parser.add_argument(
"--learning-rate",
default=0.0005,
type=float,
help="The peak learning rate. (Default: 0.0005)",
)
parser.add_argument(
"--betas",
default=(0.9, 0.98),
type=lambda s: tuple(float(x) for x in s.split(",")),  # ``type=Tuple`` cannot parse command-line input
help="The coefficients for computing running averages of gradient and its square, given as comma-separated floats. (Default: (0.9, 0.98))",
)
parser.add_argument(
"--eps",
default=1e-6,
type=float,
help="Epsilon value in Adam optimizer. (Default: 1e-6)",
)
parser.add_argument(
"--weight-decay",
default=0.01,
type=float,
help="Weight decay (L2 penalty) (Default: 0.01)",
)
parser.add_argument(
"--precision",
default=16,
choices=[16, 32, 64, "bf16"],
help="Precision of model training. (Default: 16)",
)
parser.add_argument(
"--accumulate-grad-batches",
default=1,
type=int,
help="Number of steps for accumulating gradients. (Default: 1)",
)
parser.add_argument(
"--clip-norm",
default=10.0,
type=float,
help="The gradient norm value to clip. (Default: 10.0)",
)
parser.add_argument(
"--num-nodes",
default=4,
type=int,
help="Number of nodes to use for training. (Default: 4)",
)
parser.add_argument(
"--gpus",
default=8,
type=int,
help="Number of GPUs per node to use for training. (Default: 8)",
)
parser.add_argument(
"--warmup-updates",
default=32000,
type=int,
help="Number of steps for warm up the learning rate. (Default: 32000)",
)
parser.add_argument(
"--max-updates",
default=250000,
type=int,
help="Total number of training steps. (Default: 250000)",
)
parser.add_argument(
"--seconds-per-batch",
default=87.5,
type=float,
help="Number of seconds of audio in a mini-batch. (Default: 87.5)",
)
parser.add_argument(
"--masked-weight",
default=1.0,
type=float,
help="The weight for cross-entropy loss of masksed frames. (Default: ``1.0``)",
)
parser.add_argument(
"--unmasked-weight",
default=0.0,
type=float,
help="The weight for cross-entropy loss of unmasksed frames. (Default: ``0.0``)",
)
parser.add_argument(
"--feature-weight",
default=10.0,
type=float,
help="The weight for feature penalty loss. (Default: ``10.0``)",
)
parser.add_argument("--debug", action="store_true", help="whether to use debug level for logging")
return parser.parse_args()
def _init_logger(debug):
fmt = "%(asctime)s %(message)s" if debug else "%(message)s"
level = logging.DEBUG if debug else logging.INFO
logging.basicConfig(format=fmt, level=level, datefmt="%Y-%m-%d %H:%M:%S")
def cli_main():
args = _parse_args()
_init_logger(args.debug)
run_train(args)
if __name__ == "__main__":
cli_main()
......@@ -31,7 +31,7 @@ def _eval(model, data_loader, device):
def cli_main():
parser = ArgumentParser()
parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0-mix", "librimix"])
parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0mix", "librimix"])
parser.add_argument(
"--root-dir",
type=Path,
......@@ -79,7 +79,7 @@ def cli_main():
_, _, eval_loader = _get_dataloader(
args.dataset,
args.data_dir,
args.root_dir,
args.num_speakers,
args.sample_rate,
1, # batch size is set to 1 to avoid masking
......
......@@ -308,7 +308,7 @@ def _get_dataloader(
def cli_main():
parser = ArgumentParser()
parser.add_argument("--batch-size", default=6, type=int)
parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0-mix", "librimix"])
parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0mix", "librimix"])
parser.add_argument(
"--root-dir",
type=Path,
......@@ -412,9 +412,10 @@ def cli_main():
trainer = Trainer(
default_root_dir=args.exp_dir,
max_epochs=args.epochs,
gpus=args.num_gpu,
num_nodes=args.num_node,
accelerator="gpu",
strategy="ddp_find_unused_parameters_false",
devices=args.num_gpu,
limit_train_batches=1.0, # Useful for fast experiment
gradient_clip_val=5.0,
callbacks=callbacks,
......
......@@ -207,6 +207,7 @@ from torchaudio.models.decoder import CTCDecoderLM, CTCDecoderLMState
class CustomLM(CTCDecoderLM):
"""Create a Python wrapper around `language_model` to feed to the decoder."""
def __init__(self, language_model: torch.nn.Module):
CTCDecoderLM.__init__(self)
self.language_model = language_model
......@@ -386,6 +387,47 @@ print(f"WER: {beam_search_wer}")
# and “shoktd”.
#
######################################################################
# Incremental decoding
# ~~~~~~~~~~~~~~~~~~~~
#
# If the input speech is long, one can decode the emission in an
# incremental manner.
#
# You need to first initialize the internal state of the decoder with
# :py:meth:`~torchaudio.models.decoder.CTCDecoder.decode_begin`.
beam_search_decoder.decode_begin()
######################################################################
# Then, you can pass emissions to
# :py:meth:`~torchaudio.models.decoder.CTCDecoder.decode_step`.
# Here we use the same emission but pass it to the decoder one frame
# at a time.
for t in range(emission.size(1)):
beam_search_decoder.decode_step(emission[0, t:t + 1, :])
######################################################################
# Finally, finalize the internal state of the decoder, and retrieve the
# result.
beam_search_decoder.decode_end()
beam_search_result_inc = beam_search_decoder.get_final_hypothesis()
######################################################################
# The result of incremental decoding is identical to batch decoding.
#
beam_search_transcript_inc = " ".join(beam_search_result_inc[0].words).strip()
beam_search_wer_inc = torchaudio.functional.edit_distance(
actual_transcript, beam_search_result_inc[0].words) / len(actual_transcript)
print(f"Transcript: {beam_search_transcript_inc}")
print(f"WER: {beam_search_wer_inc}")
assert beam_search_result[0][0].words == beam_search_result_inc[0].words
assert beam_search_result[0][0].score == beam_search_result_inc[0].score
torch.testing.assert_close(beam_search_result[0][0].timesteps, beam_search_result_inc[0].timesteps)
######################################################################
# Timestep Alignments
......@@ -406,30 +448,45 @@ print(timesteps, timesteps.shape[0])
#
def plot_alignments(waveform, emission, tokens, timesteps):
fig, ax = plt.subplots(figsize=(32, 10))
def plot_alignments(waveform, emission, tokens, timesteps, sample_rate):
t = torch.arange(waveform.size(0)) / sample_rate
ratio = waveform.size(0) / emission.size(1) / sample_rate
ax.plot(waveform)
chars = []
words = []
word_start = None
for token, timestep in zip(tokens, timesteps * ratio):
if token == "|":
if word_start is not None:
words.append((word_start, timestep))
word_start = None
else:
chars.append((token, timestep))
if word_start is None:
word_start = timestep
ratio = waveform.shape[0] / emission.shape[1]
word_start = 0
fig, axes = plt.subplots(3, 1)
for i in range(len(tokens)):
if i != 0 and tokens[i - 1] == "|":
word_start = timesteps[i]
if tokens[i] != "|":
plt.annotate(tokens[i].upper(), (timesteps[i] * ratio, waveform.max() * 1.02), size=14)
elif i != 0:
word_end = timesteps[i]
ax.axvspan(word_start * ratio, word_end * ratio, alpha=0.1, color="red")
def _plot(ax, xlim):
ax.plot(t, waveform)
for token, timestep in chars:
ax.annotate(token.upper(), (timestep, 0.5))
for word_start, word_end in words:
ax.axvspan(word_start, word_end, alpha=0.1, color="red")
ax.set_ylim(-0.6, 0.7)
ax.set_yticks([0])
ax.grid(True, axis="y")
ax.set_xlim(xlim)
xticks = ax.get_xticks()
plt.xticks(xticks, xticks / bundle.sample_rate)
ax.set_xlabel("time (sec)")
ax.set_xlim(0, waveform.shape[0])
_plot(axes[0], (0.3, 2.5))
_plot(axes[1], (2.5, 4.7))
_plot(axes[2], (4.7, 6.9))
axes[2].set_xlabel("time (sec)")
fig.tight_layout()
plot_alignments(waveform[0], emission, predicted_tokens, timesteps)
plot_alignments(waveform[0], emission, predicted_tokens, timesteps, bundle.sample_rate)
######################################################################
......
"""
ASR Inference with CUDA CTC Decoder
====================================
**Author**: `Yuekai Zhang <yuekaiz@nvidia.com>`__
This tutorial shows how to perform speech recognition inference using a
CUDA-based CTC beam search decoder.
We demonstrate this on a pretrained
`Zipformer <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc>`__
model from `Next-gen Kaldi <https://nadirapovey.com/next-gen-kaldi-what-is-it>`__ project.
"""
######################################################################
# Overview
# --------
#
# Beam search decoding works by iteratively expanding text hypotheses (beams)
# with next possible characters, and maintaining only the hypotheses with the
# highest scores at each time step. A toy sketch of this idea is shown
# after this overview.
#
# The underlying implementation uses CUDA to accelerate the whole decoding process.
# A mathematical formula for the decoder can be
# found in the `paper <https://arxiv.org/pdf/1408.2873.pdf>`__, and
# a more detailed algorithm can be found in this `blog
# <https://distill.pub/2017/ctc/>`__.
#
# Running ASR inference using a CUDA CTC Beam Search decoder
# requires the following components
#
# - Acoustic Model: model predicting modeling units (BPE in this tutorial) from acoustic features
# - BPE Model: the byte-pair encoding (BPE) tokenizer file
#
######################################################################
# Acoustic Model and Set Up
# -------------------------
#
# First we import the necessary utilities and fetch the data that we are
# working with
#
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
import time
from pathlib import Path
import IPython
import sentencepiece as spm
from torchaudio.models.decoder import cuda_ctc_decoder
from torchaudio.utils import download_asset
######################################################################
#
# We use the pretrained
# `Zipformer <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01>`__
# model that is trained on the `LibriSpeech
# dataset <http://www.openslr.org/12>`__. The model is jointly trained with CTC and Transducer loss functions.
# In this tutorial, we only use CTC head of the model.
def download_asset_external(url, key):
path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key)
if not path.exists():
path.parent.mkdir(parents=True, exist_ok=True)
torch.hub.download_url_to_file(url, path)
return str(path)
url_prefix = "https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01"
model_link = f"{url_prefix}/resolve/main/exp/cpu_jit.pt"
model_path = download_asset_external(model_link, "cuda_ctc_decoder/cpu_jit.pt")
######################################################################
# We will load a sample from the LibriSpeech test-other dataset.
#
speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
waveform, sample_rate = torchaudio.load(speech_file)
assert sample_rate == 16000
IPython.display.Audio(speech_file)
######################################################################
# The transcript corresponding to this audio file is
#
# .. code-block::
#
# i really was very much afraid of showing him how much shocked i was at some parts of what he said
#
######################################################################
# Files and Data for Decoder
# --------------------------
#
# Next, we load the tokens from the BPE model, which is the tokenizer for decoding.
#
######################################################################
# Tokens
# ~~~~~~
#
# The tokens are the possible symbols that the acoustic model can predict,
# including the blank symbol in CTC. In this tutorial, it includes 500 BPE tokens.
# It can either be passed in as a
# file, where each line consists of the tokens corresponding to the same
# index, or as a list of tokens, each mapping to a unique index.
#
# .. code-block::
#
# # tokens
# <blk>
# <sos/eos>
# <unk>
# S
# _THE
# _A
# T
# _AND
# ...
#
bpe_link = f"{url_prefix}/resolve/main/data/lang_bpe_500/bpe.model"
bpe_path = download_asset_external(bpe_link, "cuda_ctc_decoder/bpe.model")
bpe_model = spm.SentencePieceProcessor()
bpe_model.load(bpe_path)
tokens = [bpe_model.id_to_piece(id) for id in range(bpe_model.get_piece_size())]
print(tokens)
######################################################################
# Construct CUDA Decoder
# ----------------------
# In this tutorial, we will construct a CUDA beam search decoder.
# The decoder can be constructed using the factory function
# :py:func:`~torchaudio.models.decoder.cuda_ctc_decoder`.
#
cuda_decoder = cuda_ctc_decoder(tokens, nbest=10, beam_size=10, blank_skip_threshold=0.95)
######################################################################
# Run Inference
# -------------
#
# Now that we have the data, acoustic model, and decoder, we can perform
# inference. The output of the beam search decoder is of type
# :py:class:`~torchaudio.models.decoder.CUCTCHypothesis`, consisting of the
# predicted token IDs, words (symbols corresponding to the token IDs), and hypothesis scores.
# Recall the transcript corresponding to the
# waveform is
#
# .. code-block::
#
# i really was very much afraid of showing him how much shocked i was at some parts of what he said
#
actual_transcript = "i really was very much afraid of showing him how much shocked i was at some parts of what he said"
actual_transcript = actual_transcript.split()
device = torch.device("cuda", 0)
acoustic_model = torch.jit.load(model_path)
acoustic_model.to(device)
acoustic_model.eval()
waveform = waveform.to(device)
feat = torchaudio.compliance.kaldi.fbank(waveform, num_mel_bins=80, snip_edges=False)
feat = feat.unsqueeze(0)
feat_lens = torch.tensor(feat.size(1), device=device).unsqueeze(0)
encoder_out, encoder_out_lens = acoustic_model.encoder(feat, feat_lens)
nnet_output = acoustic_model.ctc_output(encoder_out)
log_prob = torch.nn.functional.log_softmax(nnet_output, -1)
print(f"The shape of log_prob: {log_prob.shape}, the shape of encoder_out_lens: {encoder_out_lens.shape}")
######################################################################
# The CUDA CTC decoder gives the following result.
#
results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
beam_search_transcript = bpe_model.decode(results[0][0].tokens).lower()
beam_search_wer = torchaudio.functional.edit_distance(actual_transcript, beam_search_transcript.split()) / len(
actual_transcript
)
print(f"Transcript: {beam_search_transcript}")
print(f"WER: {beam_search_wer}")
######################################################################
# Beam Search Decoder Parameters
# ------------------------------
#
# In this section, we go a little bit more in depth about some different
# parameters and tradeoffs. For the full list of customizable parameters,
# please refer to the
# :py:func:`documentation <torchaudio.models.decoder.cuda_ctc_decoder>`.
#
######################################################################
# Helper Function
# ~~~~~~~~~~~~~~~
#
def print_decoded(cuda_decoder, bpe_model, log_prob, encoder_out_lens, param, param_value):
start_time = time.monotonic()
results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
decode_time = time.monotonic() - start_time
transcript = bpe_model.decode(results[0][0].tokens).lower()
score = results[0][0].score
print(f"{param} {param_value:<3}: {transcript} (score: {score:.2f}; {decode_time:.4f} secs)")
######################################################################
# nbest
# ~~~~~
#
# This parameter indicates the number of best hypotheses to return. For
# instance, by setting ``nbest=10`` when constructing the beam search
# decoder earlier, we can now access the hypotheses with the top 10 scores.
#
for i in range(10):
transcript = bpe_model.decode(results[0][i].tokens).lower()
score = results[0][i].score
print(f"{transcript} (score: {score})")
######################################################################
# beam size
# ~~~~~~~~~
#
# The ``beam_size`` parameter determines the maximum number of best
# hypotheses to hold after each decoding step. Using larger beam sizes
# allows for exploring a larger range of possible hypotheses which can
# produce hypotheses with higher scores, but it does not provide additional gains beyond a certain point.
# We recommend setting ``beam_size=10`` for the CUDA beam search decoder.
#
# In the example below, we see improvement in decoding quality as we
# increase beam size from 1 to 3, but notice how using a beam size
# of 3 provides the same output as beam size 10.
#
beam_sizes = [1, 2, 3, 10]
for beam_size in beam_sizes:
beam_search_decoder = cuda_ctc_decoder(
tokens,
nbest=1,
beam_size=beam_size,
blank_skip_threshold=0.95,
)
print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "beam size", beam_size)
######################################################################
# blank skip threshold
# ~~~~~~~~~~~~~~~~~~~~
#
# The ``blank_skip_threshold`` parameter is used to prune frames that have a large blank probability.
# Pruning these frames with a good ``blank_skip_threshold`` can speed up the decoding
# process significantly with no drop in accuracy.
# Per the CTC rule, we keep at least one blank frame between two non-blank frames
# to avoid mistakenly merging two consecutive identical symbols.
# We recommend setting ``blank_skip_threshold=0.95`` for the CUDA beam search decoder.
#
blank_skip_probs = [0.25, 0.95, 1.0]
for blank_skip_prob in blank_skip_probs:
beam_search_decoder = cuda_ctc_decoder(
tokens,
nbest=10,
beam_size=10,
blank_skip_threshold=blank_skip_prob,
)
print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "blank_skip_threshold", blank_skip_prob)
del cuda_decoder
######################################################################
# Benchmark with flashlight CPU decoder
# -------------------------------------
# We benchmark the throughput and accuracy of the CUDA decoder against the CPU decoder using the LibriSpeech test-other set.
# To reproduce the benchmark results below, you may refer to the instructions `here <https://github.com/pytorch/audio/tree/main/examples/asr/librispeech_cuda_ctc_decoder>`__.
#
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | Decoder | Setting | WER (%) | N-Best Oracle WER (%) | Decoder Cost Time (seconds) |
# +==============+==========================================+=========+=======================+=============================+
# | CUDA decoder | blank_skip_threshold 0.95 | 5.81 | 4.11 | 2.57 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CUDA decoder | blank_skip_threshold 1.0 (no frame-skip) | 5.81 | 4.09 | 6.24 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder | beam_size_token 10 | 5.86 | 4.30 | 28.61 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder | beam_size_token 500 | 5.86 | 4.30 | 791.80 |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
#
# As the table above shows, the CUDA decoder gives a slight improvement in WER and a significant increase in throughput.
......@@ -20,6 +20,8 @@ import torchaudio.functional as F
print(torch.__version__)
print(torchaudio.__version__)
import matplotlib.pyplot as plt
######################################################################
# Preparation
# -----------
......@@ -27,10 +29,7 @@ print(torchaudio.__version__)
# First, we import the modules and download the audio assets we use in this tutorial.
#
import math
from IPython.display import Audio
import matplotlib.pyplot as plt
from torchaudio.utils import download_asset
......@@ -44,56 +43,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st
# Applying effects and filtering
# ------------------------------
#
# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to
# those available in ``sox`` to Tensor objects and file object audio sources.
#
# There are two functions for this:
#
# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects
# to Tensor.
# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to
# other audio sources.
# :py:class:`torchaudio.io.AudioEffector` allows for directly applying
# filters and codecs to Tensor objects, in a way similar to the ``ffmpeg``
# command.
#
# Both functions accept effect definitions in the form
# ``List[List[str]]``.
# This is mostly consistent with how ``sox`` command works, but one caveat is
# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s
# implementation does not.
#
# For the list of available effects, please refer to `the sox
# documentation <http://sox.sourceforge.net/sox.html>`__.
#
# **Tip** If you need to load and resample your audio data on the fly,
# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file`
# with effect ``"rate"``.
#
# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a
# file-like object or path-like object.
# Similar to :py:func:`torchaudio.load`, when the audio format cannot be
# inferred from either the file extension or header, you can provide
# argument ``format`` to specify the format of the audio source.
#
# **Note** This process is not differentiable.
# `AudioEffector Usages <./effector_tutorial.html>`__ explains how to use
# this class; for details, please refer to that tutorial.
#
# Load the data
waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV)
waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False)
# Define effects
effects = [
["lowpass", "-1", "300"], # apply single-pole lowpass filter
["speed", "0.8"], # reduce the speed
# This only changes sample rate, so it is necessary to
# add `rate` effect with original sample rate after this.
["rate", f"{sample_rate1}"],
["reverb", "-w"], # Reverbration gives some dramatic feeling
]
effect = ",".join(
[
"lowpass=frequency=300:poles=1", # apply single-pole lowpass filter
"atempo=0.8", # reduce the speed
"aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3"
# Applying echo gives some dramatic feeling
],
)
# Apply effects
waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects)
def apply_effect(waveform, sample_rate, effect):
effector = torchaudio.io.AudioEffector(effect=effect)
return effector.apply(waveform, sample_rate)
print(waveform1.shape, sample_rate1)
print(waveform2.shape, sample_rate2)
waveform2 = apply_effect(waveform1, sample_rate, effect)
print(waveform1.shape, sample_rate)
print(waveform2.shape, sample_rate)
######################################################################
# Note that the number of frames and number of channels are different from
......@@ -101,6 +82,7 @@ print(waveform2.shape, sample_rate2)
# audio.
#
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
waveform = waveform.numpy()
......@@ -118,11 +100,12 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
if xlim:
axes[c].set_xlim(xlim)
figure.suptitle(title)
plt.show(block=False)
######################################################################
#
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
waveform = waveform.numpy()
......@@ -138,29 +121,26 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
if xlim:
axes[c].set_xlim(xlim)
figure.suptitle(title)
plt.show(block=False)
######################################################################
# Original:
# ~~~~~~~~~
# Original
# ~~~~~~~~
#
plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2))
plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04))
Audio(waveform1, rate=sample_rate1)
plot_waveform(waveform1.T, sample_rate, title="Original", xlim=(-0.1, 3.2))
plot_specgram(waveform1.T, sample_rate, title="Original", xlim=(0, 3.04))
Audio(waveform1.T, rate=sample_rate)
######################################################################
# Effects applied:
# ~~~~~~~~~~~~~~~~
# Effects applied
# ~~~~~~~~~~~~~~~
#
plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2))
plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04))
Audio(waveform2, rate=sample_rate2)
plot_waveform(waveform2.T, sample_rate, title="Effects Applied", xlim=(-0.1, 3.2))
plot_specgram(waveform2.T, sample_rate, title="Effects Applied", xlim=(0, 3.04))
Audio(waveform2.T, rate=sample_rate)
######################################################################
# Doesn’t it sound more dramatic?
#
######################################################################
# Simulating room reverberation
......@@ -185,28 +165,26 @@ plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
Audio(rir_raw, rate=sample_rate)
######################################################################
# First, we need to clean up the RIR. We extract the main impulse, normalize
# the signal power, then flip along the time axis.
# First, we need to clean up the RIR. We extract the main impulse and normalize
# it by its power.
#
rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)]
rir = rir / torch.norm(rir, p=2)
RIR = torch.flip(rir, [1])
rir = rir / torch.linalg.vector_norm(rir, ord=2)
plot_waveform(rir, sample_rate, title="Room Impulse Response")
######################################################################
# Then, we convolve the speech signal with the RIR filter.
# Then, using :py:func:`torchaudio.functional.fftconvolve`,
# we convolve the speech signal with the RIR.
#
speech, _ = torchaudio.load(SAMPLE_SPEECH)
speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0))
augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
augmented = F.fftconvolve(speech, rir)
######################################################################
# Original:
# ~~~~~~~~~
# Original
# ~~~~~~~~
#
plot_waveform(speech, sample_rate, title="Original")
......@@ -214,8 +192,8 @@ plot_specgram(speech, sample_rate, title="Original")
Audio(speech, rate=sample_rate)
######################################################################
# RIR applied:
# ~~~~~~~~~~~~
# RIR applied
# ~~~~~~~~~~~
#
plot_waveform(augmented, sample_rate, title="RIR Applied")
......@@ -227,33 +205,31 @@ Audio(augmented, rate=sample_rate)
# Adding background noise
# -----------------------
#
# To add background noise to audio data, you can simply add a noise Tensor to
# the Tensor representing the audio data. A common method to adjust the
# intensity of noise is changing the Signal-to-Noise Ratio (SNR).
# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__]
# To introduce background noise to audio data, we can add a noise Tensor to
# the Tensor representing the audio data according to some desired
# signal-to-noise ratio (SNR)
# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__],
# which determines the intensity of the audio data relative to that of the noise
# in the output.
#
# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
#
# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
#
# To add noise to audio data at the desired SNRs, we
# use :py:func:`torchaudio.functional.add_noise`.
speech, _ = torchaudio.load(SAMPLE_SPEECH)
noise, _ = torchaudio.load(SAMPLE_NOISE)
noise = noise[:, : speech.shape[1]]
speech_rms = speech.norm(p=2)
noise_rms = noise.norm(p=2)
snr_dbs = torch.tensor([20, 10, 3])
noisy_speeches = F.add_noise(speech, noise, snr_dbs)
snr_dbs = [20, 10, 3]
noisy_speeches = []
for snr_db in snr_dbs:
snr = 10 ** (snr_db / 20)
scale = snr * noise_rms / speech_rms
noisy_speeches.append((scale * speech + noise) / 2)
######################################################################
# Background noise:
# ~~~~~~~~~~~~~~~~~
# Background noise
# ~~~~~~~~~~~~~~~~
#
plot_waveform(noise, sample_rate, title="Background noise")
......@@ -261,31 +237,31 @@ plot_specgram(noise, sample_rate, title="Background noise")
Audio(noise, rate=sample_rate)
######################################################################
# SNR 20 dB:
# ~~~~~~~~~~
# SNR 20 dB
# ~~~~~~~~~
#
snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0]
snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
Audio(noisy_speech, rate=sample_rate)
######################################################################
# SNR 10 dB:
# ~~~~~~~~~~
# SNR 10 dB
# ~~~~~~~~~
#
snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1]
snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
Audio(noisy_speech, rate=sample_rate)
######################################################################
# SNR 3 dB:
# ~~~~~~~~~
# SNR 3 dB
# ~~~~~~~~
#
snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2]
snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3]
plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
Audio(noisy_speech, rate=sample_rate)
......@@ -295,60 +271,56 @@ Audio(noisy_speech, rate=sample_rate)
# Applying codec to Tensor object
# -------------------------------
#
# :py:func:`torchaudio.functional.apply_codec` can apply codecs to
# :py:class:`torchaudio.io.AudioEffector` can also apply codecs to
# a Tensor object.
#
# **Note** This process is not differentiable.
#
waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False)
waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH)
def apply_codec(waveform, sample_rate, format, encoder=None):
encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder)
return encoder.apply(waveform, sample_rate)
configs = [
{"format": "wav", "encoding": "ULAW", "bits_per_sample": 8},
{"format": "gsm"},
{"format": "vorbis", "compression": -1},
]
waveforms = []
for param in configs:
augmented = F.apply_codec(waveform, sample_rate, **param)
waveforms.append(augmented)
######################################################################
# Original:
# ~~~~~~~~~
# Original
# ~~~~~~~~
#
plot_waveform(waveform, sample_rate, title="Original")
plot_specgram(waveform, sample_rate, title="Original")
Audio(waveform, rate=sample_rate)
plot_waveform(waveform.T, sample_rate, title="Original")
plot_specgram(waveform.T, sample_rate, title="Original")
Audio(waveform.T, rate=sample_rate)
######################################################################
# 8 bit mu-law:
# ~~~~~~~~~~~~~
# 8 bit mu-law
# ~~~~~~~~~~~~
#
plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law")
plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law")
Audio(waveforms[0], rate=sample_rate)
mulaw = apply_codec(waveform, sample_rate, "wav", encoder="pcm_mulaw")
plot_waveform(mulaw.T, sample_rate, title="8 bit mu-law")
plot_specgram(mulaw.T, sample_rate, title="8 bit mu-law")
Audio(mulaw.T, rate=sample_rate)
######################################################################
# GSM-FR:
# ~~~~~~~
# G.722
# ~~~~~
#
plot_waveform(waveforms[1], sample_rate, title="GSM-FR")
plot_specgram(waveforms[1], sample_rate, title="GSM-FR")
Audio(waveforms[1], rate=sample_rate)
g722 = apply_codec(waveform, sample_rate, "g722")
plot_waveform(g722.T, sample_rate, title="G.722")
plot_specgram(g722.T, sample_rate, title="G.722")
Audio(g722.T, rate=sample_rate)
######################################################################
# Vorbis:
# ~~~~~~~
# Vorbis
# ~~~~~~
#
plot_waveform(waveforms[2], sample_rate, title="Vorbis")
plot_specgram(waveforms[2], sample_rate, title="Vorbis")
Audio(waveforms[2], rate=sample_rate)
vorbis = apply_codec(waveform, sample_rate, "ogg", encoder="vorbis")
plot_waveform(vorbis.T, sample_rate, title="Vorbis")
plot_specgram(vorbis.T, sample_rate, title="Vorbis")
Audio(vorbis.T, rate=sample_rate)
######################################################################
# Simulating a phone recording
......@@ -365,8 +337,7 @@ original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
plot_specgram(original_speech, sample_rate, title="Original")
# Apply RIR
speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0))
rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
rir_applied = F.fftconvolve(speech, rir)
plot_specgram(rir_applied, sample_rate, title="RIR Applied")
......@@ -377,69 +348,60 @@ plot_specgram(rir_applied, sample_rate, title="RIR Applied")
noise, _ = torchaudio.load(SAMPLE_NOISE)
noise = noise[:, : rir_applied.shape[1]]
snr_db = 8
scale = (10 ** (snr_db / 20)) * noise.norm(p=2) / rir_applied.norm(p=2)
bg_added = (scale * rir_applied + noise) / 2
snr_db = torch.tensor([8])
bg_added = F.add_noise(rir_applied, noise, snr_db)
plot_specgram(bg_added, sample_rate, title="BG noise added")
# Apply filtering and change sample rate
filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(
bg_added,
sample_rate,
effects=[
["lowpass", "4000"],
[
"compand",
"0.02,0.05",
"-60,-60,-30,-10,-20,-8,-5,-8,-2,-8",
"-8",
"-7",
"0.05",
],
["rate", "8000"],
],
effect = ",".join(
[
"lowpass=frequency=4000:poles=1",
"compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05",
]
)
plot_specgram(filtered, sample_rate2, title="Filtered")
filtered = apply_effect(bg_added.T, sample_rate, effect)
sample_rate2 = 8000
# Apply telephony codec
codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm")
plot_specgram(filtered.T, sample_rate2, title="Filtered")
plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied")
# Apply telephony codec
codec_applied = apply_codec(filtered, sample_rate2, "g722")
plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied")
######################################################################
# Original speech:
# ~~~~~~~~~~~~~~~~
# Original speech
# ~~~~~~~~~~~~~~~
#
Audio(original_speech, rate=sample_rate)
######################################################################
# RIR applied:
# ~~~~~~~~~~~~
# RIR applied
# ~~~~~~~~~~~
#
Audio(rir_applied, rate=sample_rate)
######################################################################
# Background noise added:
# ~~~~~~~~~~~~~~~~~~~~~~~
# Background noise added
# ~~~~~~~~~~~~~~~~~~~~~~
#
Audio(bg_added, rate=sample_rate)
######################################################################
# Filtered:
# ~~~~~~~~~
# Filtered
# ~~~~~~~~
#
Audio(filtered, rate=sample_rate2)
Audio(filtered.T, rate=sample_rate2)
######################################################################
# Codec applied:
# ~~~~~~~~~~~~~~
# Codec applied
# ~~~~~~~~~~~~~
#
Audio(codec_applied, rate=sample_rate2)
Audio(codec_applied.T, rate=sample_rate2)
# -*- coding: utf-8 -*-
"""
Audio Datasets
==============
......@@ -10,10 +9,6 @@ datasets. Please refer to the official documentation for the list of
available datasets.
"""
# When running this tutorial in Google Colab, install the required packages
# with the following.
# !pip install torchaudio
import torch
import torchaudio
......@@ -21,22 +16,13 @@ print(torch.__version__)
print(torchaudio.__version__)
######################################################################
# Preparing data and utility functions (skip this section)
# --------------------------------------------------------
#
# @title Prepare data and utility functions. {display-mode: "form"}
# @markdown
# @markdown You do not need to look into this cell.
# @markdown Just execute once and you are good to go.
# -------------------------------------------------------------------------------
# Preparation of data and helper functions.
# -------------------------------------------------------------------------------
import os
import IPython
import matplotlib.pyplot as plt
from IPython.display import Audio, display
_SAMPLE_DIR = "_assets"
......@@ -44,34 +30,13 @@ YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no")
os.makedirs(YESNO_DATASET_PATH, exist_ok=True)
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
def plot_specgram(waveform, sample_rate, title="Spectrogram"):
waveform = waveform.numpy()
num_channels, _ = waveform.shape
figure, axes = plt.subplots(num_channels, 1)
if num_channels == 1:
axes = [axes]
for c in range(num_channels):
axes[c].specgram(waveform[c], Fs=sample_rate)
if num_channels > 1:
axes[c].set_ylabel(f"Channel {c+1}")
if xlim:
axes[c].set_xlim(xlim)
figure, ax = plt.subplots()
ax.specgram(waveform[0], Fs=sample_rate)
figure.suptitle(title)
plt.show(block=False)
def play_audio(waveform, sample_rate):
waveform = waveform.numpy()
num_channels, _ = waveform.shape
if num_channels == 1:
display(Audio(waveform[0], rate=sample_rate))
elif num_channels == 2:
display(Audio((waveform[0], waveform[1]), rate=sample_rate))
else:
raise ValueError("Waveform with more than 2 channels are not supported.")
figure.tight_layout()
######################################################################
......@@ -79,10 +44,25 @@ def play_audio(waveform, sample_rate):
# :py:class:`torchaudio.datasets.YESNO` dataset.
#
dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True)
for i in [1, 3, 5]:
waveform, sample_rate, label = dataset[i]
plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
play_audio(waveform, sample_rate)
######################################################################
#
i = 1
waveform, sample_rate, label = dataset[i]
plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
i = 3
waveform, sample_rate, label = dataset[i]
plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
i = 5
waveform, sample_rate, label = dataset[i]
plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
IPython.display.Audio(waveform, rate=sample_rate)
......@@ -19,25 +19,20 @@ print(torch.__version__)
print(torchaudio.__version__)
######################################################################
# Preparing data and utility functions (skip this section)
# --------------------------------------------------------
# Preparation
# -----------
#
# @title Prepare data and utility functions. {display-mode: "form"}
# @markdown
# @markdown You do not need to look into this cell.
# @markdown Just execute once and you are good to go.
# @markdown
# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/),
# @markdown which is licensed under Creative Commons BY 4.0.
# -------------------------------------------------------------------------------
# Preparation of data and helper functions.
# -------------------------------------------------------------------------------
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio
from torchaudio.utils import download_asset
######################################################################
# In this tutorial, we will use a speech data from
# `VOiCES dataset <https://iqtlabs.github.io/voices/>`__,
# which is licensed under Creative Commons BY 4.0.
SAMPLE_WAV_SPEECH_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
......@@ -75,18 +70,6 @@ def get_spectrogram(
return spectrogram(waveform)
def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None):
fig, axs = plt.subplots(1, 1)
axs.set_title(title or "Spectrogram (db)")
axs.set_ylabel(ylabel)
axs.set_xlabel("frame")
im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect)
if xmax:
axs.set_xlim((0, xmax))
fig.colorbar(im, ax=axs)
plt.show(block=False)
######################################################################
# SpecAugment
# -----------
......@@ -108,43 +91,79 @@ def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=No
spec = get_spectrogram(power=None)
stretch = T.TimeStretch()
rate = 1.2
spec_ = stretch(spec, rate)
plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304)
spec_12 = stretch(spec, overriding_rate=1.2)
spec_09 = stretch(spec, overriding_rate=0.9)
######################################################################
# Visualization
# ~~~~~~~~~~~~~
def plot():
def plot_spec(ax, spec, title):
ax.set_title(title)
ax.imshow(librosa.amplitude_to_db(spec), origin="lower", aspect="auto")
fig, axes = plt.subplots(3, 1, sharex=True, sharey=True)
plot_spec(axes[0], torch.abs(spec_12[0]), title="Stretched x1.2")
plot_spec(axes[1], torch.abs(spec[0]), title="Original")
plot_spec(axes[2], torch.abs(spec_09[0]), title="Stretched x0.9")
fig.tight_layout()
plot_spectrogram(torch.abs(spec[0]), title="Original", aspect="equal", xmax=304)
rate = 0.9
spec_ = stretch(spec, rate)
plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304)
plot()
######################################################################
# TimeMasking
# -----------
#
# Audio Samples
# ~~~~~~~~~~~~~
def preview(spec, rate=16000):
ispec = T.InverseSpectrogram()
waveform = ispec(spec)
torch.random.manual_seed(4)
return Audio(waveform[0].numpy().T, rate=rate)
spec = get_spectrogram()
plot_spectrogram(spec[0], title="Original")
masking = T.TimeMasking(time_mask_param=80)
spec = masking(spec)
preview(spec)
plot_spectrogram(spec[0], title="Masked along time axis")
######################################################################
# FrequencyMasking
# ----------------
#
preview(spec_12)
######################################################################
#
preview(spec_09)
######################################################################
# Time and Frequency Masking
# --------------------------
#
torch.random.manual_seed(4)
time_masking = T.TimeMasking(time_mask_param=80)
freq_masking = T.FrequencyMasking(freq_mask_param=80)
spec = get_spectrogram()
plot_spectrogram(spec[0], title="Original")
time_masked = time_masking(spec)
freq_masked = freq_masking(spec)
######################################################################
#
def plot():
def plot_spec(ax, spec, title):
ax.set_title(title)
ax.imshow(librosa.power_to_db(spec), origin="lower", aspect="auto")
fig, axes = plt.subplots(3, 1, sharex=True, sharey=True)
plot_spec(axes[0], spec[0], title="Original")
plot_spec(axes[1], time_masked[0], title="Masked along time axis")
plot_spec(axes[2], freq_masked[0], title="Masked along frequency axis")
fig.tight_layout()
masking = T.FrequencyMasking(freq_mask_param=80)
spec = masking(spec)
plot_spectrogram(spec[0], title="Masked along frequency axis")
plot()
......@@ -25,6 +25,23 @@ import torchaudio.transforms as T
print(torch.__version__)
print(torchaudio.__version__)
import librosa
import matplotlib.pyplot as plt
######################################################################
# Overview of audio features
# --------------------------
#
# The following diagram shows the relationship between common audio features
# and torchaudio APIs to generate them.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png
#
# For the complete list of available features, please refer to the
# documentation.
#
######################################################################
# Preparation
# -----------
......@@ -38,8 +55,7 @@ print(torchaudio.__version__)
# !pip install librosa
#
from IPython.display import Audio
import librosa
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from torchaudio.utils import download_asset
torch.random.manual_seed(0)
......@@ -47,27 +63,27 @@ torch.random.manual_seed(0)
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
def plot_waveform(waveform, sr, title="Waveform"):
def plot_waveform(waveform, sr, title="Waveform", ax=None):
waveform = waveform.numpy()
num_channels, num_frames = waveform.shape
time_axis = torch.arange(0, num_frames) / sr
figure, axes = plt.subplots(num_channels, 1)
axes.plot(time_axis, waveform[0], linewidth=1)
axes.grid(True)
figure.suptitle(title)
plt.show(block=False)
if ax is None:
_, ax = plt.subplots(num_channels, 1)
ax.plot(time_axis, waveform[0], linewidth=1)
ax.grid(True)
ax.set_xlim([0, time_axis[-1]])
ax.set_title(title)
def plot_spectrogram(specgram, title=None, ylabel="freq_bin"):
fig, axs = plt.subplots(1, 1)
axs.set_title(title or "Spectrogram (db)")
axs.set_ylabel(ylabel)
axs.set_xlabel("frame")
im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto")
fig.colorbar(im, ax=axs)
plt.show(block=False)
def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
if ax is None:
_, ax = plt.subplots(1, 1)
if title is not None:
ax.set_title(title)
ax.set_ylabel(ylabel)
ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest")
def plot_fbank(fbank, title=None):
......@@ -76,21 +92,6 @@ def plot_fbank(fbank, title=None):
axs.imshow(fbank, aspect="auto")
axs.set_ylabel("frequency bin")
axs.set_xlabel("mel bin")
plt.show(block=False)
######################################################################
# Overview of audio features
# --------------------------
#
# The following diagram shows the relationship between common audio features
# and torchaudio APIs to generate them.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png
#
# For the complete list of available features, please refer to the
# documentation.
#
######################################################################
......@@ -101,77 +102,157 @@ def plot_fbank(fbank, title=None):
# you can use :py:func:`torchaudio.transforms.Spectrogram`.
#
# Load audio
SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH)
plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform")
Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)
# Define transform
spectrogram = T.Spectrogram(n_fft=512)
# Perform transform
spec = spectrogram(SPEECH_WAVEFORM)
######################################################################
#
n_fft = 1024
win_length = None
hop_length = 512
fig, axs = plt.subplots(2, 1)
plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform", ax=axs[0])
plot_spectrogram(spec[0], title="spectrogram", ax=axs[1])
fig.tight_layout()
# Define transform
spectrogram = T.Spectrogram(
n_fft=n_fft,
win_length=win_length,
hop_length=hop_length,
center=True,
pad_mode="reflect",
power=2.0,
)
######################################################################
#
Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)
######################################################################
# The effect of ``n_fft`` parameter
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The core of spectrogram computation is (short-term) Fourier transform,
# and the ``n_fft`` parameter corresponds to the :math:`N` in the following
# definition of the discrete Fourier transform.
#
# $$ X_k = \\sum_{n=0}^{N-1} x_n e^{-\\frac{2\\pi i}{N} nk} $$
#
# (For the details of the Fourier transform, please refer to
# `Wikipedia <https://en.wikipedia.org/wiki/Fast_Fourier_transform>`__.)
#
# The value of ``n_fft`` determines the resolution of the frequency axis.
# However, with a higher ``n_fft`` value, the energy is distributed across
# more bins, so the visualization may look more blurry,
# even though the frequency resolution is higher.
#
# The following illustrates this:
#
# Perform transform
spec = spectrogram(SPEECH_WAVEFORM)
######################################################################
#
# .. note::
#
# ``hop_length`` determines the time axis resolution.
# By default (i.e. ``hop_length=None`` and ``win_length=None``),
# the value of ``n_fft // 4`` is used.
# Here we use the same ``hop_length`` value across different ``n_fft``
# values so that they have the same number of elements along the time axis.
#
n_ffts = [32, 128, 512, 2048]
hop_length = 64
specs = []
for n_fft in n_ffts:
spectrogram = T.Spectrogram(n_fft=n_fft, hop_length=hop_length)
spec = spectrogram(SPEECH_WAVEFORM)
specs.append(spec)
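######################################################################
# As a quick sanity check (a small addition, not required for the rest of
# the tutorial), the spectrograms computed above share the same number of
# time frames because ``hop_length`` is fixed, while the number of
# frequency bins grows as ``n_fft // 2 + 1`` (the transform is one-sided
# by default).
for spec, n_fft in zip(specs, n_ffts):
    print(f"n_fft={n_fft:4d}: shape={tuple(spec.shape)}, freq bins={n_fft // 2 + 1}")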
######################################################################
#
plot_spectrogram(spec[0], title="torchaudio")
fig, axs = plt.subplots(len(specs), 1, sharex=True)
for i, (spec, n_fft) in enumerate(zip(specs, n_ffts)):
plot_spectrogram(spec[0], ylabel=f"n_fft={n_fft}", ax=axs[i])
axs[i].set_xlabel(None)
fig.tight_layout()
######################################################################
# GriffinLim
# ----------
#
# To recover a waveform from a spectrogram, you can use ``GriffinLim``.
# When comparing signals, it is desirable to use the same sampling rate,
# however, if you must use different sampling rates, care must be
# taken when interpreting the meaning of ``n_fft``.
# Recall that ``n_fft`` determines the resolution of the frequency
# axis for a given sampling rate. In other words, what each bin on
# the frequency axis represents is subject to the sampling rate.
#
# As we have seen above, changing the value of ``n_fft`` does not change
# the coverage of frequency range for the same input signal.
torch.random.manual_seed(0)
######################################################################
#
# Let's downsample the audio and apply spectrogram with the same ``n_fft``
# value.
n_fft = 1024
win_length = None
hop_length = 512
# Downsample to half of the original sample rate
speech2 = torchaudio.functional.resample(SPEECH_WAVEFORM, SAMPLE_RATE, SAMPLE_RATE // 2)
# Upsample to the original sample rate
speech3 = torchaudio.functional.resample(speech2, SAMPLE_RATE // 2, SAMPLE_RATE)
spec = T.Spectrogram(
n_fft=n_fft,
win_length=win_length,
hop_length=hop_length,
)(SPEECH_WAVEFORM)
######################################################################
#
# Apply the same spectrogram
spectrogram = T.Spectrogram(n_fft=512)
spec0 = spectrogram(SPEECH_WAVEFORM)
spec2 = spectrogram(speech2)
spec3 = spectrogram(speech3)
######################################################################
#
griffin_lim = T.GriffinLim(
n_fft=n_fft,
win_length=win_length,
hop_length=hop_length,
)
# Visualize it
fig, axs = plt.subplots(3, 1)
plot_spectrogram(spec0[0], ylabel="Original", ax=axs[0])
axs[0].add_patch(Rectangle((0, 3), 212, 128, edgecolor="r", facecolor="none"))
plot_spectrogram(spec2[0], ylabel="Downsampled", ax=axs[1])
plot_spectrogram(spec3[0], ylabel="Upsampled", ax=axs[2])
fig.tight_layout()
######################################################################
#
# In the above visualization, the second plot ("Downsampled") might
# give the impression that the spectrogram is stretched.
# This is because the meaning of the frequency bins is different from
# the original one.
# Even though they have the same number of bins, in the second plot
# the frequency axis only covers up to half of the original sampling
# rate.
# This becomes clearer if we resample the downsampled signal again
# so that it has the same sample rate as the original.
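######################################################################
# As a rough numeric sketch (using the ``n_fft=512`` from above), each bin
# of a one-sided spectrogram spans ``sample_rate / n_fft`` Hz and the bins
# together cover ``0`` to ``sample_rate / 2`` Hz, so the same bin index
# maps to a different physical frequency once the sampling rate changes.
for name, rate in [("original", SAMPLE_RATE), ("downsampled", SAMPLE_RATE // 2)]:
    print(f"{name:12s}: bin width = {rate / 512:6.1f} Hz, coverage = 0 - {rate // 2} Hz")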
######################################################################
# GriffinLim
# ----------
#
# To recover a waveform from a spectrogram, you can use
# :py:class:`torchaudio.transforms.GriffinLim`.
#
# The same set of parameters used for spectrogram must be used.
# Define transforms
n_fft = 1024
spectrogram = T.Spectrogram(n_fft=n_fft)
griffin_lim = T.GriffinLim(n_fft=n_fft)
# Apply the transforms
spec = spectrogram(SPEECH_WAVEFORM)
reconstructed_waveform = griffin_lim(spec)
######################################################################
#
plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed")
_, axes = plt.subplots(2, 1, sharex=True, sharey=True)
plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original", ax=axes[0])
plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed", ax=axes[1])
Audio(reconstructed_waveform, rate=SAMPLE_RATE)
######################################################################
......@@ -253,7 +334,6 @@ mel_spectrogram = T.MelSpectrogram(
pad_mode="reflect",
power=2.0,
norm="slaney",
onesided=True,
n_mels=n_mels,
mel_scale="htk",
)
......@@ -322,7 +402,7 @@ mfcc = mfcc_transform(SPEECH_WAVEFORM)
######################################################################
#
plot_spectrogram(mfcc[0])
plot_spectrogram(mfcc[0], title="MFCC")
######################################################################
# Comparison against librosa
......@@ -350,7 +430,7 @@ mfcc_librosa = librosa.feature.mfcc(
######################################################################
#
plot_spectrogram(mfcc_librosa)
plot_spectrogram(mfcc_librosa, title="MFCC (librosa)")
mse = torch.square(mfcc - mfcc_librosa).mean().item()
print("Mean Square Difference: ", mse)
......@@ -376,7 +456,7 @@ lfcc_transform = T.LFCC(
)
lfcc = lfcc_transform(SPEECH_WAVEFORM)
plot_spectrogram(lfcc[0])
plot_spectrogram(lfcc[0], title="LFCC")
######################################################################
# Pitch
......@@ -388,6 +468,7 @@ pitch = F.detect_pitch_frequency(SPEECH_WAVEFORM, SAMPLE_RATE)
######################################################################
#
def plot_pitch(waveform, sr, pitch):
figure, axis = plt.subplots(1, 1)
axis.set_title("Pitch Feature")
......@@ -402,58 +483,6 @@ def plot_pitch(waveform, sr, pitch):
axis2.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
axis2.legend(loc=0)
plt.show(block=False)
plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch)
######################################################################
# Kaldi Pitch (beta)
# ------------------
#
# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic
# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``,
# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`.
#
# 1. A pitch extraction algorithm tuned for automatic speech recognition
#
# Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S.
# Khudanpur
#
# 2014 IEEE International Conference on Acoustics, Speech and Signal
# Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
# 10.1109/ICASSP.2014.6854049.
# [`abstract <https://ieeexplore.ieee.org/document/6854049>`__],
# [`paper <https://danielpovey.com/files/2014_icassp_pitch.pdf>`__]
#
pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE)
pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1]
######################################################################
#
def plot_kaldi_pitch(waveform, sr, pitch, nfcc):
_, axis = plt.subplots(1, 1)
axis.set_title("Kaldi Pitch Feature")
axis.grid(True)
end_time = waveform.shape[1] / sr
time_axis = torch.linspace(0, end_time, waveform.shape[1])
axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3)
time_axis = torch.linspace(0, end_time, pitch.shape[1])
ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
axis.set_ylim((-1.3, 1.3))
axis2 = axis.twinx()
time_axis = torch.linspace(0, end_time, nfcc.shape[1])
ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--")
lns = ln1 + ln2
labels = [l.get_label() for l in lns]
axis.legend(lns, labels, loc=0)
plt.show(block=False)
plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc)
......@@ -5,8 +5,15 @@ Audio I/O
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use TorchAudio's basic I/O API to load audio files
into PyTorch's Tensor object, and save Tensor objects to audio files.
This tutorial shows how to use TorchAudio's basic I/O API to inspect audio data,
load it into PyTorch Tensors, and save PyTorch Tensors to audio files.
.. warning::
There are multiple changes planned or already made to audio I/O in recent releases.
For the details of these changes, please refer to
:ref:`Introduction of Dispatcher <dispatcher_migration>`.
"""
import torch
......@@ -47,6 +54,16 @@ SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch12753
SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav")
def _hide_seek(obj):
    # Wrap the object so that only ``read`` is exposed. Hiding ``seek``
    # makes torchaudio treat the source as a sequential (non-seekable) stream.
    class _wrapper:
        def __init__(self, obj):
            self.obj = obj

        def read(self, n):
            return self.obj.read(n)

    return _wrapper(obj)
######################################################################
# Querying audio metadata
......@@ -113,7 +130,7 @@ print(metadata)
url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav"
with requests.get(url, stream=True) as response:
metadata = torchaudio.info(response.raw)
metadata = torchaudio.info(_hide_seek(response.raw))
print(metadata)
######################################################################
......@@ -164,7 +181,6 @@ def plot_waveform(waveform, sample_rate):
if num_channels > 1:
axes[c].set_ylabel(f"Channel {c+1}")
figure.suptitle("waveform")
plt.show(block=False)
######################################################################
......@@ -187,7 +203,6 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
if num_channels > 1:
axes[c].set_ylabel(f"Channel {c+1}")
figure.suptitle(title)
plt.show(block=False)
######################################################################
......@@ -215,7 +230,7 @@ Audio(waveform.numpy()[0], rate=sample_rate)
# Load audio data as HTTP request
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
with requests.get(url, stream=True) as response:
waveform, sample_rate = torchaudio.load(response.raw)
waveform, sample_rate = torchaudio.load(_hide_seek(response.raw))
plot_specgram(waveform, sample_rate, title="HTTP datasource")
######################################################################
......@@ -237,7 +252,7 @@ bucket = "pytorch-tutorial-assets"
key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
response = client.get_object(Bucket=bucket, Key=key)
waveform, sample_rate = torchaudio.load(response["Body"])
waveform, sample_rate = torchaudio.load(_hide_seek(response["Body"]))
plot_specgram(waveform, sample_rate, title="From S3")
......@@ -271,13 +286,15 @@ frame_offset, num_frames = 16000, 16000 # Fetch and decode the 1 - 2 seconds
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
print("Fetching all the data...")
with requests.get(url, stream=True) as response:
waveform1, sample_rate1 = torchaudio.load(response.raw)
waveform1, sample_rate1 = torchaudio.load(_hide_seek(response.raw))
waveform1 = waveform1[:, frame_offset : frame_offset + num_frames]
print(f" - Fetched {response.raw.tell()} bytes")
print("Fetching until the requested frames are available...")
with requests.get(url, stream=True) as response:
waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames)
waveform2, sample_rate2 = torchaudio.load(
_hide_seek(response.raw), frame_offset=frame_offset, num_frames=num_frames
)
print(f" - Fetched {response.raw.tell()} bytes")
print("Checking the resulting waveform ... ", end="")
......@@ -316,6 +333,7 @@ waveform, sample_rate = torchaudio.load(SAMPLE_WAV)
######################################################################
#
def inspect_file(path):
print("-" * 10)
print("Source:", path)
......@@ -324,6 +342,7 @@ def inspect_file(path):
print(f" - {torchaudio.info(path)}")
print()
######################################################################
#
# Save without any encoding option.
......@@ -351,11 +370,11 @@ with tempfile.TemporaryDirectory() as tempdir:
formats = [
"flac",
"vorbis",
"sph",
"amb",
"amr-nb",
"gsm",
# "vorbis",
# "sph",
# "amb",
# "amr-nb",
# "gsm",
]
######################################################################
......
......@@ -27,14 +27,14 @@ import math
import timeit
import librosa
import resampy
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Audio, display
import resampy
from IPython.display import Audio
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
DEFAULT_OFFSET = 201
......@@ -105,7 +105,6 @@ def plot_sweep(
axis.yaxis.grid(True, alpha=0.67)
figure.suptitle(f"{title} (sample rate: {sample_rate} Hz)")
plt.colorbar(cax)
plt.show(block=True)
######################################################################
......@@ -240,13 +239,13 @@ plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.8")
sample_rate = 48000
resample_rate = 32000
resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interpolation")
resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interp_hann")
plot_sweep(resampled_waveform, resample_rate, title="Hann Window Default")
######################################################################
#
resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="kaiser_window")
resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interp_kaiser")
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Default")
......@@ -271,7 +270,7 @@ resampled_waveform = F.resample(
resample_rate,
lowpass_filter_width=64,
rolloff=0.9475937167399596,
resampling_method="kaiser_window",
resampling_method="sinc_interp_kaiser",
beta=14.769656459379492,
)
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Best (torchaudio)")
......@@ -300,7 +299,7 @@ resampled_waveform = F.resample(
resample_rate,
lowpass_filter_width=16,
rolloff=0.85,
resampling_method="kaiser_window",
resampling_method="sinc_interp_kaiser",
beta=8.555504641634386,
)
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)")
......@@ -325,7 +324,7 @@ print("torchaudio and librosa kaiser fast MSE:", mse)
#
# Below are benchmarks for downsampling and upsampling waveforms between
# two pairs of sampling rates. We demonstrate the performance implications
# that the ``lowpass_filter_wdith``, window type, and sample rates can
# that the ``lowpass_filter_width``, window type, and sample rates can
# have. Additionally, we provide a comparison against ``librosa``\ ’s
# ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters
# in ``torchaudio``.
......@@ -338,18 +337,20 @@ print(f"resampy: {resampy.__version__}")
######################################################################
#
def benchmark_resample_functional(
waveform,
sample_rate,
resample_rate,
lowpass_filter_width=6,
rolloff=0.99,
resampling_method="sinc_interpolation",
resampling_method="sinc_interp_hann",
beta=None,
iters=5,
):
return timeit.timeit(
stmt='''
return (
timeit.timeit(
stmt="""
torchaudio.functional.resample(
waveform,
sample_rate,
......@@ -359,29 +360,34 @@ torchaudio.functional.resample(
resampling_method=resampling_method,
beta=beta,
)
''',
setup='import torchaudio',
number=iters,
globals=locals(),
) * 1000 / iters
""",
setup="import torchaudio",
number=iters,
globals=locals(),
)
* 1000
/ iters
)
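######################################################################
# For example (a hypothetical invocation; the ``benchmark`` function below
# drives this helper with a sine sweep), the helper returns the average
# time per call in milliseconds:
#
# .. code-block:: python
#
#    elapsed_ms = benchmark_resample_functional(waveform, 48000, 32000)
#    print(f"functional resample: {elapsed_ms:.2f} ms per call")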
######################################################################
#
def benchmark_resample_transforms(
waveform,
sample_rate,
resample_rate,
lowpass_filter_width=6,
rolloff=0.99,
resampling_method="sinc_interpolation",
resampling_method="sinc_interp_hann",
beta=None,
iters=5,
):
return timeit.timeit(
stmt='resampler(waveform)',
setup='''
return (
timeit.timeit(
stmt="resampler(waveform)",
setup="""
import torchaudio
resampler = torchaudio.transforms.Resample(
......@@ -394,15 +400,19 @@ resampler = torchaudio.transforms.Resample(
beta=beta,
)
resampler.to(waveform.device)
''',
number=iters,
globals=locals(),
) * 1000 / iters
""",
number=iters,
globals=locals(),
)
* 1000
/ iters
)
######################################################################
#
def benchmark_resample_librosa(
waveform,
sample_rate,
......@@ -411,24 +421,29 @@ def benchmark_resample_librosa(
iters=5,
):
waveform_np = waveform.squeeze().numpy()
return timeit.timeit(
stmt='''
return (
timeit.timeit(
stmt="""
librosa.resample(
waveform_np,
orig_sr=sample_rate,
target_sr=resample_rate,
res_type=res_type,
)
''',
setup='import librosa',
number=iters,
globals=locals(),
) * 1000 / iters
""",
setup="import librosa",
number=iters,
globals=locals(),
)
* 1000
/ iters
)
######################################################################
#
def benchmark(sample_rate, resample_rate):
times, rows = [], []
waveform = get_sine_sweep(sample_rate).to(torch.float32)
......@@ -451,7 +466,7 @@ def benchmark(sample_rate, resample_rate):
kwargs = {
"lowpass_filter_width": 64,
"rolloff": 0.9475937167399596,
"resampling_method": "kaiser_window",
"resampling_method": "sinc_interp_kaiser",
"beta": 14.769656459379492,
}
lib_time = benchmark_resample_librosa(*args, res_type="kaiser_best")
......@@ -464,7 +479,7 @@ def benchmark(sample_rate, resample_rate):
kwargs = {
"lowpass_filter_width": 16,
"rolloff": 0.85,
"resampling_method": "kaiser_window",
"resampling_method": "sinc_interp_kaiser",
"beta": 8.555504641634386,
}
lib_time = benchmark_resample_librosa(*args, res_type="kaiser_fast")
......@@ -483,7 +498,7 @@ def plot(df):
print(df.round(2))
ax = df.plot(kind="bar")
plt.ylabel("Time Elapsed [ms]")
plt.xticks(rotation = 0, fontsize=10)
plt.xticks(rotation=0, fontsize=10)
for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS):
label = ["N/A" if v != v else str(v) for v in df[col].round(2)]
ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small")
......@@ -531,8 +546,8 @@ plot(df)
# - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
# and therefore increases computation time for both the kernel computation
# and convolution
# - using ``kaiser_window`` results in longer computation times than the default
# ``sinc_interpolation`` because it is more complex to compute the intermediate
# - using ``sinc_interp_kaiser`` results in longer computation times than the default
# ``sinc_interp_hann`` because it is more complex to compute the intermediate
# window values
# - a large GCD between the sample and resample rate will result
# in a simplification that allows for a smaller kernel and faster kernel computation.
......
"""
CTC forced alignment API tutorial
=================================
**Author**: `Xiaohui Zhang <xiaohuizhang@meta.com>`__, `Moto Hira <moto@meta.com>`__
Forced alignment is the process of aligning a transcript with speech.
This tutorial shows how to align transcripts to speech using
:py:func:`torchaudio.functional.forced_align` which was developed along the work of
`Scaling Speech Technology to 1,000+ Languages
<https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/>`__.
:py:func:`~torchaudio.functional.forced_align` has custom CPU and CUDA
implementations which are more performant and more accurate than a
vanilla Python implementation.
It can also handle missing parts of a transcript with the special ``<star>`` token.
There is also a high-level API, :py:class:`torchaudio.pipelines.Wav2Vec2FABundle`,
which wraps the pre/post-processing explained in this tutorial and makes it easy
to run forced alignments.
`Forced alignment for multilingual data
<./forced_alignment_for_multilingual_data_tutorial.html>`__ uses this API to
illustrate how to align non-English transcripts.
"""
######################################################################
# Preparation
# -----------
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
######################################################################
#
import IPython
import matplotlib.pyplot as plt
import torchaudio.functional as F
######################################################################
# First we prepare the speech data and the transcript we are going
# to use.
#
SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
waveform, _ = torchaudio.load(SPEECH_FILE)
TRANSCRIPT = "i had that curiosity beside me at this moment".split()
######################################################################
# Generating emissions
# ~~~~~~~~~~~~~~~~~~~~
#
# :py:func:`~torchaudio.functional.forced_align` takes an emission and
# a token sequence and outputs timestamps of the tokens and their scores.
#
# An emission represents the frame-wise probability distribution over
# tokens, and it can be obtained by passing a waveform to an acoustic
# model.
#
# Tokens are numerical expressions of transcripts. There are many ways to
# tokenize transcripts, but here we simply map each letter to an integer,
# which is how the labels were constructed when the acoustic model we are
# going to use was trained.
#
# We will use a pre-trained Wav2Vec2 model,
# :py:data:`torchaudio.pipelines.MMS_FA`, to obtain emission and tokenize
# the transcript.
#
bundle = torchaudio.pipelines.MMS_FA
model = bundle.get_model(with_star=False).to(device)
with torch.inference_mode():
emission, _ = model(waveform.to(device))
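######################################################################
# As a quick check (a small addition), the emission is a batched tensor of
# shape ``(batch, frames, num_labels)``; each frame holds the model's
# scores over the label set (here, the batch size is 1).
print(emission.shape)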
######################################################################
#
def plot_emission(emission):
fig, ax = plt.subplots()
ax.imshow(emission.cpu().T)
ax.set_title("Frame-wise class probabilities")
ax.set_xlabel("Time")
ax.set_ylabel("Labels")
fig.tight_layout()
plot_emission(emission[0])
######################################################################
# Tokenize the transcript
# ~~~~~~~~~~~~~~~~~~~~~~~
#
# We create a dictionary, which maps each label to a token.
LABELS = bundle.get_labels(star=None)
DICTIONARY = bundle.get_dict(star=None)
for k, v in DICTIONARY.items():
print(f"{k}: {v}")
######################################################################
# Converting the transcript to tokens is as simple as
tokenized_transcript = [DICTIONARY[c] for word in TRANSCRIPT for c in word]
for t in tokenized_transcript:
print(t, end=" ")
print()
######################################################################
# Computing alignments
# --------------------
#
# Frame-level alignments
# ~~~~~~~~~~~~~~~~~~~~~~
#
# Now we call TorchAudio’s forced alignment API to compute the
# frame-level alignment. For the details of the function signature, please
# refer to :py:func:`~torchaudio.functional.forced_align`.
#
def align(emission, tokens):
targets = torch.tensor([tokens], dtype=torch.int32, device=device)
alignments, scores = F.forced_align(emission, targets, blank=0)
alignments, scores = alignments[0], scores[0] # remove batch dimension for simplicity
scores = scores.exp() # convert back to probability
return alignments, scores
aligned_tokens, alignment_scores = align(emission, tokenized_transcript)
######################################################################
# Now let's look at the output.
for i, (ali, score) in enumerate(zip(aligned_tokens, alignment_scores)):
print(f"{i:3d}:\t{ali:2d} [{LABELS[ali]}], {score:.2f}")
######################################################################
#
# .. note::
#
# The alignment is expressed in the frame coordinates of the emission,
# which is different from the time axis of the original waveform.
#
# It contains blank tokens and repeated tokens. The following is the
# interpretation of the non-blank tokens.
#
# .. code-block::
#
# 31: 0 [-], 1.00
# 32: 2 [i], 1.00 "i" starts and ends
# 33: 0 [-], 1.00
# 34: 0 [-], 1.00
# 35: 15 [h], 1.00 "h" starts
# 36: 15 [h], 0.93 "h" ends
# 37: 1 [a], 1.00 "a" starts and ends
# 38: 0 [-], 0.96
# 39: 0 [-], 1.00
# 40: 0 [-], 1.00
# 41: 13 [d], 1.00 "d" starts and ends
# 42: 0 [-], 1.00
#
# .. note::
#
# When the same token appears after blank tokens, it is not treated as
# a repeat, but as a new occurrence.
#
# .. code-block::
#
# a a a b -> a b
# a - - b -> a b
# a a - b -> a b
# a - a b -> a a b
# ^^^ ^^^
#
######################################################################
# Token-level alignments
# ~~~~~~~~~~~~~~~~~~~~~~
#
# The next step is to resolve the repetition, so that each alignment does
# not depend on previous alignments.
# :py:func:`torchaudio.functional.merge_tokens` computes the
# :py:class:`~torchaudio.functional.TokenSpan` object, which represents
# which token from the transcript is present at what time span.
######################################################################
#
token_spans = F.merge_tokens(aligned_tokens, alignment_scores)
print("Token\tTime\tScore")
for s in token_spans:
print(f"{LABELS[s.token]}\t[{s.start:3d}, {s.end:3d})\t{s.score:.2f}")
######################################################################
# Word-level alignments
# ~~~~~~~~~~~~~~~~~~~~~
#
# Now let’s group the token-level alignments into word-level alignments.
def unflatten(list_, lengths):
assert len(list_) == sum(lengths)
i = 0
ret = []
for l in lengths:
ret.append(list_[i : i + l])
i += l
return ret
word_spans = unflatten(token_spans, [len(word) for word in TRANSCRIPT])
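######################################################################
# For illustration (a small, self-contained example), ``unflatten`` simply
# regroups a flat list according to the given lengths:
print(unflatten(["t", "h", "i", "s", "i", "s"], [4, 2]))  # -> [['t', 'h', 'i', 's'], ['i', 's']]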
######################################################################
# Audio previews
# ~~~~~~~~~~~~~~
#
# Compute average score weighted by the span length
def _score(spans):
return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)
def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
ratio = waveform.size(1) / num_frames
x0 = int(ratio * spans[0].start)
x1 = int(ratio * spans[-1].end)
print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
segment = waveform[:, x0:x1]
return IPython.display.Audio(segment.numpy(), rate=sample_rate)
num_frames = emission.size(1)
######################################################################
# Generate the audio for each segment
print(TRANSCRIPT)
IPython.display.Audio(SPEECH_FILE)
######################################################################
#
preview_word(waveform, word_spans[0], num_frames, TRANSCRIPT[0])
######################################################################
#
preview_word(waveform, word_spans[1], num_frames, TRANSCRIPT[1])
######################################################################
#
preview_word(waveform, word_spans[2], num_frames, TRANSCRIPT[2])
######################################################################
#
preview_word(waveform, word_spans[3], num_frames, TRANSCRIPT[3])
######################################################################
#
preview_word(waveform, word_spans[4], num_frames, TRANSCRIPT[4])
######################################################################
#
preview_word(waveform, word_spans[5], num_frames, TRANSCRIPT[5])
######################################################################
#
preview_word(waveform, word_spans[6], num_frames, TRANSCRIPT[6])
######################################################################
#
preview_word(waveform, word_spans[7], num_frames, TRANSCRIPT[7])
######################################################################
#
preview_word(waveform, word_spans[8], num_frames, TRANSCRIPT[8])
######################################################################
# Visualization
# ~~~~~~~~~~~~~
#
# Now let's look at the alignment result and segment the original
# speech into words.
def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate):
ratio = waveform.size(1) / emission.size(1) / sample_rate
fig, axes = plt.subplots(2, 1)
axes[0].imshow(emission[0].detach().cpu().T, aspect="auto")
axes[0].set_title("Emission")
axes[0].set_xticks([])
axes[1].specgram(waveform[0], Fs=sample_rate)
for t_spans, chars in zip(token_spans, transcript):
t0, t1 = t_spans[0].start + 0.1, t_spans[-1].end - 0.1
axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white")
axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white")
axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False)
for span, char in zip(t_spans, chars):
t0 = span.start * ratio
axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False)
axes[1].set_xlabel("time [second]")
axes[1].set_xlim([0, None])
fig.tight_layout()
######################################################################
#
plot_alignments(waveform, word_spans, emission, TRANSCRIPT)
######################################################################
#
# Inconsistent treatment of ``blank`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# When splitting the token-level alignments into words, you will
# notice that some blank tokens are treated differently, and this makes
# the interpretation of the result somewhat ambiguous.
#
# This is easy to see when we plot the scores. The following figure
# shows word regions and non-word regions, with the frame-level scores
# of non-blank tokens.
def plot_scores(word_spans, scores):
fig, ax = plt.subplots()
span_xs, span_hs = [], []
ax.axvspan(word_spans[0][0].start - 0.05, word_spans[-1][-1].end + 0.05, facecolor="paleturquoise", edgecolor="none", zorder=-1)
for t_span in word_spans:
for span in t_span:
for t in range(span.start, span.end):
span_xs.append(t + 0.5)
span_hs.append(scores[t].item())
ax.annotate(LABELS[span.token], (span.start, -0.07))
ax.axvspan(t_span[0].start - 0.05, t_span[-1].end + 0.05, facecolor="mistyrose", edgecolor="none", zorder=-1)
ax.bar(span_xs, span_hs, color="lightsalmon", edgecolor="coral")
ax.set_title("Frame-level scores and word segments")
ax.set_ylim(-0.1, None)
ax.grid(True, axis="y")
ax.axhline(0, color="black")
fig.tight_layout()
plot_scores(word_spans, alignment_scores)
######################################################################
# In this plot, the blank tokens are the highlighted areas without
# vertical bars.
# You can see that there are blank tokens which are interpreted as
# part of a word (highlighted red), while the others (highlighted blue)
# are not.
#
# One reason for this is that the model was trained without a
# label for word boundaries. The blank tokens are treated not just
# as repetition but also as silence between words.
#
# But then a question arises: should frames immediately after or
# near the end of a word be treated as silence or as a repeat?
#
# In the above example, if you go back to the previous plot of
# spectrogram and word regions, you see that after "y" in "curiosity",
# there is still some activity in multiple frequency buckets.
#
# Would it be more accurate if that frame was included in the word?
#
# Unfortunately, CTC does not provide a comprehensive solution to this.
# Models trained with CTC are known to exhibit "peaky" response,
# that is, they tend to spike for an occurrence of a label, but the
# spike does not last for the duration of the label.
# (Note: Pre-trained Wav2Vec2 models tend to spike at the beginning of
# label occurrences, but this is not always the case.)
#
# :cite:`zeyer2021does` has an in-depth analysis of the peaky behavior of
# CTC.
# We encourage those who are interested in understanding more to refer
# to the paper.
# The following quote from the paper describes the exact issue we
# are facing here.
#
# *Peaky behavior can be problematic in certain cases,*
# *e.g. when an application requires to not use the blank label,*
# *e.g. to get meaningful time accurate alignments of phonemes*
# *to a transcription.*
######################################################################
# Advanced: Handling transcripts with ``<star>`` token
# ----------------------------------------------------
#
# Now let’s look at how, when the transcript is partially missing, we can
# improve alignment quality by using the ``<star>`` token, which is capable of
# modeling any token.
#
# Here we use the same English example as used above. But we remove the
# beginning text ``“i had that curiosity beside me at”`` from the transcript.
# Aligning audio with such a transcript results in wrong alignments of the
# existing word “this”. However, this issue can be mitigated by using the
# ``<star>`` token to model the missing text.
#
######################################################################
# First, we extend the dictionary to include the ``<star>`` token.
DICTIONARY["*"] = len(DICTIONARY)
######################################################################
# Next, we extend the emission tensor with the extra dimension
# corresponding to the ``<star>`` token.
#
star_dim = torch.zeros((1, emission.size(1), 1), device=emission.device, dtype=emission.dtype)
emission = torch.cat((emission, star_dim), 2)
assert len(DICTIONARY) == emission.shape[2]
plot_emission(emission[0])
######################################################################
# The following function combines all the processes and computes
# word segments from the emission in one go.
def compute_alignments(emission, transcript, dictionary):
tokens = [dictionary[char] for word in transcript for char in word]
alignment, scores = align(emission, tokens)
token_spans = F.merge_tokens(alignment, scores)
word_spans = unflatten(token_spans, [len(word) for word in transcript])
return word_spans
######################################################################
# Full Transcript
# ~~~~~~~~~~~~~~~
word_spans = compute_alignments(emission, TRANSCRIPT, DICTIONARY)
plot_alignments(waveform, word_spans, emission, TRANSCRIPT)
######################################################################
# Partial Transcript with ``<star>`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Now we replace the first part of the transcript with the ``<star>`` token.
transcript = "* this moment".split()
word_spans = compute_alignments(emission, transcript, DICTIONARY)
plot_alignments(waveform, word_spans, emission, transcript)
######################################################################
#
preview_word(waveform, word_spans[0], num_frames, transcript[0])
######################################################################
#
preview_word(waveform, word_spans[1], num_frames, transcript[1])
######################################################################
#
preview_word(waveform, word_spans[2], num_frames, transcript[2])
######################################################################
# Partial Transcript without ``<star>`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# As a comparison, the following aligns the partial transcript
# without using the ``<star>`` token.
# It demonstrates the effect of the ``<star>`` token in dealing with deletion errors.
transcript = "this moment".split()
word_spans = compute_alignments(emission, transcript, DICTIONARY)
plot_alignments(waveform, word_spans, emission, transcript)
######################################################################
# Conclusion
# ----------
#
# In this tutorial, we looked at how to use torchaudio’s forced alignment
# API to align and segment speech files, and demonstrated one advanced usage:
# how introducing a ``<star>`` token could improve alignment accuracy when
# transcription errors exist.
#
######################################################################
# Acknowledgement
# ---------------
#
# Thanks to `Vineel Pratap <vineelkpratap@meta.com>`__ and `Zhaoheng
# Ni <zni@meta.com>`__ for developing and open-sourcing the
# forced aligner API.
......@@ -7,26 +7,23 @@ Device ASR with Emformer RNN-T
This tutorial shows how to use Emformer RNN-T and streaming API
to perform speech recognition on a streaming device input, i.e. a microphone
on a laptop.
.. note::
This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece.
There are multiple ways to install FFmpeg libraries.
If you are using Anaconda Python distribution,
``conda install 'ffmpeg<4.4'`` will install
the required FFmpeg libraries.
You can install SentencePiece by running ``pip install sentencepiece``.
.. note::
This tutorial was tested on MacBook Pro and Dynabook with Windows 10.
This tutorial does NOT work on Google Colab because the server running
this tutorial does not have a microphone that you can talk to.
"""
######################################################################
#
# .. note::
#
# This tutorial requires FFmpeg libraries.
# Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
# the details.
#
# .. note::
#
# This tutorial was tested on MacBook Pro and Dynabook with Windows 10.
#
# This tutorial does NOT work on Google Colab because the server running
# this tutorial does not have a microphone that you can talk to.
######################################################################
# 1. Overview
# -----------
......
"""
AudioEffector Usages
====================
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use :py:class:`torchaudio.io.AudioEffector` to
apply various effects and codecs to a waveform tensor.
"""
######################################################################
#
# .. note::
#
# This tutorial requires FFmpeg libraries.
# Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
# the details.
#
######################################################################
# Overview
# --------
#
# :py:class:`~torchaudio.io.AudioEffector` combines in-memory encoding,
# decoding and filtering that are provided by
# :py:class:`~torchaudio.io.StreamWriter` and
# :py:class:`~torchaudio.io.StreamReader`.
#
# The following figure illustrates the process.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/AudioEffector.png
#
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
######################################################################
#
from torchaudio.io import AudioEffector, CodecConfig
import matplotlib.pyplot as plt
from IPython.display import Audio
######################################################################
#
for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items():
print(k, v)
######################################################################
# Usage
# -----
#
# To use ``AudioEffector``, instantiate it with ``effect`` and
# ``format``, then either pass the waveform to
# :py:meth:`~torchaudio.io.AudioEffector.apply` or
# :py:meth:`~torchaudio.io.AudioEffector.stream` method.
#
# .. code:: python
#
# effector = AudioEffector(effect=..., format=...,)
#
# # Apply at once
# applied = effector.apply(waveform, sample_rate)
#
# The ``apply`` method applies the effect and codec to the entire waveform at
# once. So if the input waveform is long and memory consumption is an
# issue, one can use the ``stream`` method to process it chunk by chunk.
#
# .. code:: python
#
# # Apply chunk by chunk
# for applied_chunk in effector.stream(waveform, sample_rate):
# ...
#
######################################################################
# Example
# -------
#
src = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
waveform, sr = torchaudio.load(src, channels_first=False)
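######################################################################
# As a quick check (a minimal sketch, assuming only that the codec may pad
# the signal), the applied result comes back in the same
# ``(frames, channels)`` layout as the input, though the number of frames
# can change slightly.
check_effector = AudioEffector(format="mp3")
applied = check_effector.apply(waveform, int(sr))
print(waveform.shape, applied.shape)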
######################################################################
# Gallery
# -------
#
def show(effect, *, stereo=False):
wf = torch.cat([waveform] * 2, dim=1) if stereo else waveform
figsize = (6.4, 2.1 if stereo else 1.2)
effector = AudioEffector(effect=effect, pad_end=False)
result = effector.apply(wf, int(sr))
num_channels = result.size(1)
f, ax = plt.subplots(num_channels, 1, squeeze=False, figsize=figsize, sharex=True)
for i in range(num_channels):
ax[i][0].specgram(result[:, i], Fs=sr)
f.set_tight_layout(True)
return Audio(result.numpy().T, rate=sr)
######################################################################
# Original
# --------
#
show(effect=None)
######################################################################
# Effects
# -------
#
######################################################################
# tempo
# ~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#atempo
show("atempo=0.7")
######################################################################
#
show("atempo=1.8")
######################################################################
# highpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#highpass
show("highpass=frequency=1500")
######################################################################
# lowpass
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#lowpass
show("lowpass=frequency=1000")
######################################################################
# allpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#allpass
show("allpass")
######################################################################
# bandpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#bandpass
show("bandpass=frequency=3000")
######################################################################
# bandreject
# ~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#bandreject
show("bandreject=frequency=3000")
######################################################################
# echo
# ~~~~
# https://ffmpeg.org/ffmpeg-filters.html#aecho
show("aecho=in_gain=0.8:out_gain=0.88:delays=6:decays=0.4")
######################################################################
#
show("aecho=in_gain=0.8:out_gain=0.88:delays=60:decays=0.4")
######################################################################
#
show("aecho=in_gain=0.8:out_gain=0.9:delays=1000:decays=0.3")
######################################################################
# chorus
# ~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#chorus
show("chorus=0.5:0.9:50|60|40:0.4|0.32|0.3:0.25|0.4|0.3:2|2.3|1.3")
######################################################################
# fft filter
# ~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#afftfilt
# fmt: off
show(
"afftfilt="
"real='re * (1-clip(b * (b/nb), 0, 1))':"
"imag='im * (1-clip(b * (b/nb), 0, 1))'"
)
######################################################################
#
show(
"afftfilt="
"real='hypot(re,im) * sin(0)':"
"imag='hypot(re,im) * cos(0)':"
"win_size=512:"
"overlap=0.75"
)
######################################################################
#
show(
"afftfilt="
"real='hypot(re,im) * cos(2 * 3.14 * (random(0) * 2-1))':"
"imag='hypot(re,im) * sin(2 * 3.14 * (random(1) * 2-1))':"
"win_size=128:"
"overlap=0.8"
)
# fmt: on
######################################################################
# vibrato
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#vibrato
show("vibrato=f=10:d=0.8")
######################################################################
# tremolo
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#tremolo
show("tremolo=f=8:d=0.8")
######################################################################
# crystalizer
# ~~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#crystalizer
show("crystalizer")
######################################################################
# flanger
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#flanger
show("flanger")
######################################################################
# phaser
# ~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#aphaser
show("aphaser")
######################################################################
# pulsator
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#apulsator
show("apulsator", stereo=True)
######################################################################
# haas
# ~~~~
# https://ffmpeg.org/ffmpeg-filters.html#haas
show("haas")
######################################################################
# Codecs
# ------
#
def show_multi(configs):
results = []
for config in configs:
effector = AudioEffector(**config)
results.append(effector.apply(waveform, int(sr)))
num_configs = len(configs)
figsize = (6.4, 0.3 + num_configs * 0.9)
f, axes = plt.subplots(num_configs, 1, figsize=figsize, sharex=True)
for result, ax in zip(results, axes):
ax.specgram(result[:, 0], Fs=sr)
f.set_tight_layout(True)
return [Audio(r.numpy().T, rate=sr) for r in results]
######################################################################
# ogg
# ~~~
#
results = show_multi(
[
{"format": "ogg"},
{"format": "ogg", "encoder": "vorbis"},
{"format": "ogg", "encoder": "opus"},
]
)
######################################################################
# ogg - default encoder (flac)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
results[0]
######################################################################
# ogg - vorbis
# ^^^^^^^^^^^^
#
results[1]
######################################################################
# ogg - opus
# ^^^^^^^^^^
#
results[2]
######################################################################
# mp3
# ~~~
# https://trac.ffmpeg.org/wiki/Encode/MP3
results = show_multi(
[
{"format": "mp3"},
{"format": "mp3", "codec_config": CodecConfig(compression_level=1)},
{"format": "mp3", "codec_config": CodecConfig(compression_level=9)},
{"format": "mp3", "codec_config": CodecConfig(bit_rate=192_000)},
{"format": "mp3", "codec_config": CodecConfig(bit_rate=8_000)},
{"format": "mp3", "codec_config": CodecConfig(qscale=9)},
{"format": "mp3", "codec_config": CodecConfig(qscale=1)},
]
)
######################################################################
# default
# ^^^^^^^
results[0]
######################################################################
# compression_level=1
# ^^^^^^^^^^^^^^^^^^^
results[1]
######################################################################
# compression_level=9
# ^^^^^^^^^^^^^^^^^^^
results[2]
######################################################################
# bit_rate=192k
# ^^^^^^^^^^^^^
results[3]
######################################################################
# bit_rate=8k
# ^^^^^^^^^^^^^
results[4]
######################################################################
# qscale=9
# ^^^^^^^^
results[5]
######################################################################
# qscale=1
# ^^^^^^^^
results[6]
######################################################################
#
# Tag: :obj:`torchaudio.io`
"""
Forced alignment for multilingual data
======================================
**Authors**: `Xiaohui Zhang <xiaohuizhang@meta.com>`__, `Moto Hira <moto@meta.com>`__.
This tutorial shows how to align transcripts to speech for non-English languages.
The process of aligning non-English (normalized) transcript is identical to aligning
English (normalized) transcript, and the process for English is covered in detail in
`CTC forced alignment tutorial <./ctc_forced_alignment_api_tutorial.html>`__.
In this tutorial, we use TorchAudio's high-level API,
:py:class:`torchaudio.pipelines.Wav2Vec2FABundle`, which packages the pre-trained
model, tokenizer and aligner, to perform the forced alignment with less code.
"""
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
######################################################################
#
from typing import List
import IPython
import matplotlib.pyplot as plt
######################################################################
# Creating the pipeline
# ---------------------
#
# First, we instantiate the model and pre/post-processing pipelines.
#
# The following diagram illustrates the process of alignment.
#
# .. image:: https://download.pytorch.org/torchaudio/doc-assets/pipelines-wav2vec2fabundle.png
#
# The waveform is passed to an acoustic model, which produces a sequence of
# probability distributions over tokens.
# The transcript is passed to a tokenizer, which converts the transcript to a
# sequence of tokens.
# The aligner takes the results from the acoustic model and the tokenizer and
# generates timestamps for each token.
#
# .. note::
#
# This process expects that the input transcript is already normalized.
# The process of normalization, which involves romanization of non-English
# languages, is language-dependent, so it is not covered in this tutorial,
# but we will briefly look into it.
#
# The acoustic model and the tokenizer must use the same set of tokens.
# To facilitate the creation of matching processors,
# :py:class:`~torchaudio.pipelines.Wav2Vec2FABundle` associates a
# pre-trained acoustic model and a tokenizer.
# :py:data:`torchaudio.pipelines.MMS_FA` is one such instance.
#
# The following code instantiates a pre-trained acoustic model, a tokenizer
# which uses the same set of tokens as the model, and an aligner.
#
from torchaudio.pipelines import MMS_FA as bundle
model = bundle.get_model()
model.to(device)
tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()
######################################################################
# .. note::
#
# The model instantiated by :py:data:`~torchaudio.pipelines.MMS_FA`'s
# :py:meth:`~torchaudio.pipelines.Wav2Vec2FABundle.get_model`
# method by default includes the feature dimension for ``<star>`` token.
# You can disable this by passing ``with_star=False``.
#
######################################################################
# The acoustic model of :py:data:`~torchaudio.pipelines.MMS_FA` was
# created and open-sourced as part of the research project,
# `Scaling Speech Technology to 1,000+ Languages
# <https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/>`__.
# It was trained with 23,000 hours of audio from 1100+ languages.
#
# The tokenizer simply maps the normalized characters to integers.
# You can check the mapping as follows:
print(bundle.get_dict())
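######################################################################
# For example (an illustrative call, following the same usage as
# ``compute_alignments`` below), passing a list of normalized words to the
# tokenizer yields the corresponding token sequences:
print(tokenizer(["aber", "seit"]))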
######################################################################
#
# The aligner internally uses :py:func:`torchaudio.functional.forced_align`
# and :py:func:`torchaudio.functional.merge_tokens` to infer the
# timestamps of the input tokens.
#
# The details of the underlying mechanism are covered in
# `CTC forced alignment API tutorial <./ctc_forced_alignment_api_tutorial.html>`__,
# so please refer to it.
######################################################################
# We define a utility function that performs the forced alignment with
# the above model, the tokenizer and the aligner.
#
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
with torch.inference_mode():
emission, _ = model(waveform.to(device))
token_spans = aligner(emission[0], tokenizer(transcript))
return emission, token_spans
######################################################################
# We also define utility functions for plotting the result and previewing
# the audio segments.
# Compute average score weighted by the span length
def _score(spans):
return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)
def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate):
ratio = waveform.size(1) / emission.size(1) / sample_rate
fig, axes = plt.subplots(2, 1)
axes[0].imshow(emission[0].detach().cpu().T, aspect="auto")
axes[0].set_title("Emission")
axes[0].set_xticks([])
axes[1].specgram(waveform[0], Fs=sample_rate)
for t_spans, chars in zip(token_spans, transcript):
t0, t1 = t_spans[0].start, t_spans[-1].end
axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white")
axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white")
axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False)
for span, char in zip(t_spans, chars):
t0 = span.start * ratio
axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False)
axes[1].set_xlabel("time [second]")
fig.tight_layout()
######################################################################
#
def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
ratio = waveform.size(1) / num_frames
x0 = int(ratio * spans[0].start)
x1 = int(ratio * spans[-1].end)
print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
segment = waveform[:, x0:x1]
return IPython.display.Audio(segment.numpy(), rate=sample_rate)
######################################################################
# Normalizing the transcript
# --------------------------
#
# The transcripts passed to the pipeline must be normalized beforehand.
# The exact process of normalization depends on language.
#
# Languages that do not have explicit word boundaries
# (such as Chinese, Japanese and Korean) require segmentation first.
# There are dedicated tools for this, but let's say we have a segmented
# transcript.
#
# The first step of normalization is romanization.
# `uroman <https://github.com/isi-nlp/uroman>`__ is a tool that
# supports many languages.
#
# Here are Bash commands to romanize the input text file and write
# the output to another text file using ``uroman``.
#
# .. code-block:: bash
#
# $ echo "des événements d'actualité qui se sont produits durant l'année 1882" > text.txt
# $ uroman/bin/uroman.pl < text.txt > text_romanized.txt
# $ cat text_romanized.txt
#
# .. code-block:: text
#
# Cette page concerne des evenements d'actualite qui se sont produits durant l'annee 1882
#
# The next step is to remove non-alphabetic characters and punctuation.
# The following snippet normalizes the romanized transcript.
#
# .. code-block:: python
#
# import re
#
#
# def normalize_uroman(text):
# text = text.lower()
# text = text.replace("’", "'")
# text = re.sub("([^a-z' ])", " ", text)
# text = re.sub(' +', ' ', text)
# return text.strip()
#
#
# with open("text_romanized.txt", "r") as f:
# for line in f:
# text_normalized = normalize_uroman(line)
# print(text_normalized)
#
# Running the script on the above example produces the following.
#
# .. code-block:: text
#
# cette page concerne des evenements d'actualite qui se sont produits durant l'annee
#
# Note that, in this example, since "1882" was not romanized by ``uroman``,
# it was removed in the normalization step.
# To avoid this, one needs to romanize numbers, but this is known to be a non-trivial task.
#
######################################################################
# Aligning transcripts to speech
# ------------------------------
#
# Now we perform the forced alignment for multiple languages.
#
#
# German
# ~~~~~~
text_raw = "aber seit ich bei ihnen das brot hole"
text_normalized = "aber seit ich bei ihnen das brot hole"
url = "https://download.pytorch.org/torchaudio/tutorial-assets/10349_8674_000087.flac"
waveform, sample_rate = torchaudio.load(
url, frame_offset=int(0.5 * bundle.sample_rate), num_frames=int(2.5 * bundle.sample_rate)
)
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
tokens = tokenizer(transcript)
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)
plot_alignments(waveform, token_spans, emission, transcript)
print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])
######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])
######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])
######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])
######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])
######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])
######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])
######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
# Chinese
# ~~~~~~~
#
# Chinese is a character-based language, and there is no explicit word-level
# tokenization (words separated by spaces) in its raw written form. In order to
# obtain word-level alignments, you need to first tokenize the transcripts
# at the word level using a word tokenizer like the `“Stanford
# Tokenizer” <https://michelleful.github.io/code-blog/2015/09/10/parsing-chinese-with-stanford/>`__.
# However, this is not needed if you only want character-level alignments.
#
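# The normalized transcript below is a pinyin romanization of the raw text.
# As a sketch only, such a romanization could be produced with a library like
# ``pypinyin`` (an assumption; any romanizer, including ``uroman``, would do):
#
# .. code-block:: python
#
#    from pypinyin import lazy_pinyin
#
#    # Convert each segmented word to tone-less pinyin; this should yield
#    # the normalized transcript used below.
#    words = "关 服务 高端 产品 仍 处于 供不应求 的 局面".split()
#    print(" ".join("".join(lazy_pinyin(w)) for w in words))
#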
text_raw = "关 服务 高端 产品 仍 处于 供不应求 的 局面"
text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian"
######################################################################
#
url = "https://download.pytorch.org/torchaudio/tutorial-assets/mvdr/clean_speech.wav"
waveform, sample_rate = torchaudio.load(url)
waveform = waveform[0:1]
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)
plot_alignments(waveform, token_spans, emission, transcript)
print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])
######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])
######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])
######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])
######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])
######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])
######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])
######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
#
preview_word(waveform, token_spans[8], num_frames, transcript[8])
######################################################################
# Polish
# ~~~~~~
text_raw = "wtedy ujrzałem na jego brzuchu okrągłą czarną ranę"
text_normalized = "wtedy ujrzalem na jego brzuchu okragla czarna rane"
url = "https://download.pytorch.org/torchaudio/tutorial-assets/5090_1447_000088.flac"
waveform, sample_rate = torchaudio.load(url, num_frames=int(4.5 * bundle.sample_rate))
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)
plot_alignments(waveform, token_spans, emission, transcript)
print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])
######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])
######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])
######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])
######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])
######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])
######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])
######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
# Portuguese
# ~~~~~~~~~~
text_raw = "na imensa extensão onde se esconde o inconsciente imortal"
text_normalized = "na imensa extensao onde se esconde o inconsciente imortal"
url = "https://download.pytorch.org/torchaudio/tutorial-assets/6566_5323_000027.flac"
waveform, sample_rate = torchaudio.load(
url, frame_offset=int(bundle.sample_rate), num_frames=int(4.6 * bundle.sample_rate)
)
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)
plot_alignments(waveform, token_spans, emission, transcript)
print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])
######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])
######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])
######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])
######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])
######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])
######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])
######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
#
preview_word(waveform, token_spans[8], num_frames, transcript[8])
######################################################################
# Italian
# ~~~~~~~
text_raw = "elle giacean per terra tutte quante"
text_normalized = "elle giacean per terra tutte quante"
url = "https://download.pytorch.org/torchaudio/tutorial-assets/642_529_000025.flac"
waveform, sample_rate = torchaudio.load(url, num_frames=int(4 * bundle.sample_rate))
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)
plot_alignments(waveform, token_spans, emission, transcript)
print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])
######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])
######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])
######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])
######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])
######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])
######################################################################
# Conclusion
# ----------
#
# In this tutorial, we looked at how to use torchaudio’s forced alignment
# API and a Wav2Vec2 pre-trained multilingual acoustic model to align
# speech data to transcripts in five languages.
#
######################################################################
# Acknowledgement
# ---------------
#
# Thanks to `Vineel Pratap <vineelkpratap@meta.com>`__ and `Zhaoheng
# Ni <zni@meta.com>`__ for developing and open-sourcing the
# forced aligner API.
#