OpenDAS / Torchaudio · Commits · ffeba11a

Commit ffeba11a, authored Sep 02, 2024 by mayp777

    UPDATE

Parent: 29deb085
Changes: 337 files; showing 20 changed files with 2825 additions and 471 deletions (+2825, −471).
Files changed:

- examples/self_supervised_learning/losses/__init__.py (+6, −0)
- examples/self_supervised_learning/losses/_hubert_loss.py (+47, −0)
- examples/self_supervised_learning/losses/_wav2vec2_loss.py (+80, −0)
- examples/self_supervised_learning/lr_schedulers/__init__.py (+5, −0)
- examples/self_supervised_learning/lr_schedulers/_linear_decay.py (+27, −0)
- examples/self_supervised_learning/train_hubert.py (+316, −0)
- examples/source_separation/eval.py (+2, −2)
- examples/source_separation/lightning_train.py (+3, −2)
- examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py (+75, −18)
- examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py (+311, −0)
- examples/tutorials/audio_data_augmentation_tutorial.py (+125, −163)
- examples/tutorials/audio_datasets_tutorial.py (+26, −46)
- examples/tutorials/audio_feature_augmentation_tutorial.py (+66, −47)
- examples/tutorials/audio_feature_extractions_tutorial.py (+151, −122)
- examples/tutorials/audio_io_tutorial.py (+33, −14)
- examples/tutorials/audio_resampling_tutorial.py (+54, −39)
- examples/tutorials/ctc_forced_alignment_api_tutorial.py (+517, −0)
- examples/tutorials/device_asr.py (+15, −18)
- examples/tutorials/effector_tutorial.py (+366, −0)
- examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py (+600, −0)
examples/self_supervised_learning/losses/__init__.py (new file, mode 100644)

```python
from ._hubert_loss import hubert_loss
from ._wav2vec2_loss import wav2vec2_loss  # needed: ``wav2vec2_loss`` is exported in __all__ below

__all__ = [
    "hubert_loss",
    "wav2vec2_loss",
]
```
examples/self_supervised_learning/losses/_hubert_loss.py (new file, mode 100644)

```python
from typing import Optional, Tuple

import torch
import torch.nn.functional as F
from torch import Tensor


def hubert_loss(
    logit_m: Optional[Tensor],
    logit_u: Optional[Tensor],
    feature_penalty: Tensor,
    label: Optional[Tensor] = None,
    masked_weight: float = 1.0,
    unmasked_weight: float = 0.0,
    feature_weight: float = 10.0,
    reduction: str = "sum",
) -> Tuple[Tensor, float]:
    """Compute the cross-entropy loss on HuBERT masked and non-masked logits.

    Args:
        logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`.
        logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`.
        feature_penalty (Tensor): The feature mean value for additional penalty loss.
        label (Tensor or None, optional): Currently unused. (Default: ``None``)
        masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``).
        unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``).
        feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``).
        reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``).

    Returns:
        (Tensor, float)
        Tensor: The desired loss Tensor.
        float: Number of frames used in loss computation.
    """
    num_frame = 0.0
    loss = 0.0
    if logit_m is not None:
        # The correct class is index 0 by convention, so the targets are all zeros.
        target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device)
        loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction)
        loss += loss_m * masked_weight
        num_frame += logit_m.shape[0]
    if logit_u is not None:
        # Use logit_u's device here; taking the device from logit_m (as the
        # original did) fails when logit_m is None but logit_u is provided.
        target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_u.device)
        loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction)
        loss += loss_u * unmasked_weight
        num_frame += logit_u.shape[0]
    loss += feature_penalty * feature_weight * num_frame
    return loss, num_frame
```
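As a quick sanity check, ``hubert_loss`` can be exercised with dummy tensors. This is a minimal sketch, assuming the function above is in scope; the shapes and the penalty value are invented for illustration:

```python
import torch

# Hypothetical shapes: 8 masked frames, 4 unmasked frames, 100 classes.
# Targets are implicitly class 0, so only logits and the penalty are passed.
logit_m = torch.randn(8, 100)
logit_u = torch.randn(4, 100)
feature_penalty = torch.tensor(0.25)  # placeholder feature-penalty value

loss, num_frame = hubert_loss(logit_m, logit_u, feature_penalty)
print(loss.item(), num_frame)  # scalar loss; num_frame == 12.0 (8 + 4 frames)
```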
examples/self_supervised_learning/losses/_wav2vec2_loss.py (new file, mode 100644)

```python
from typing import Tuple

import torch
import torch.nn.functional as F
from torch import Tensor


def compute_contrastive_loss(
    x: Tensor,
    mask_indices: Tensor,
    targets: Tensor,
    neg_is_pos: Tensor,
    reduction: str = "none",
    logit_temp: float = 0.1,
):
    """Computes the contrastive loss used in the Wav2Vec2 loss function.

    Args:
        x (Tensor): Input embeddings of shape `(batch_size, sequence_length, hidden_size)`.
        mask_indices (Tensor): Indices to mask negative samples of shape `(batch_size, sequence_length)`.
        targets (Tensor): Labels indicating positive samples.
            Tensor of shape `(num_negative + 1, batch, sequence_length, hidden_size)`.
        neg_is_pos (Tensor): Boolean tensor indicating whether negative samples should be treated as positives.
            Tensor of shape `(batch, sequence_length)`.
        reduction (str): The reduction type for the cross-entropy loss ("sum" or "none"). (Default: ``"none"``)
        logit_temp (float, optional): Temperature scaling factor for logits. (Default: 0.1)

    Returns:
        The computed contrastive loss and sample size.
    """
    x = x[mask_indices].view(x.size(0), -1, x.size(-1)).unsqueeze(0).expand(targets.shape)
    logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).float()
    logits /= logit_temp
    if neg_is_pos.any():
        logits[1:][neg_is_pos] = float("-inf")
    target = logits.new_zeros(logits.size(1) * logits.size(2), dtype=torch.long, device=logits.device)
    logits = logits.transpose(0, 2)
    logits = logits.reshape(-1, logits.size(-1))
    loss = F.cross_entropy(
        logits,
        target,
        reduction=reduction,
    )
    sample_size = target.numel()
    return loss, sample_size


def wav2vec2_loss(
    x: Tensor,
    mask_indices: Tensor,
    positives: Tensor,
    negatives: Tensor,
    reduction: str = "none",
) -> Tuple[Tensor, float]:
    """Compute Wav2Vec2 loss.

    Args:
        x (Tensor): The masked sequences of Wav2Vec 2.0 model.
            Tensor of shape `(batch_size, sequence_length, hidden_size)`.
        mask_indices (Tensor): The mask indices. Tensor of shape `(batch_size, sequence_length)`.
        positives (Tensor): The positives, prior to negative sampling.
            Tensor of shape `(batch_size, masked_sequence_length, hidden_size)`.
        negatives (Tensor): The negative samples.
            Tensor of shape `(num_negative, batch_size, masked_sequence_length, hidden_size)`.
        reduction (str): The reduction type for the cross-entropy loss ("sum" or "none"). (Default: ``"none"``)

    Returns:
        (Tensor, float)
        Tensor: The desired loss Tensor.
        float: Sample size according to mask_indices.
    """
    assert positives is not None
    assert mask_indices is not None
    assert mask_indices.sum() == positives.shape[0] * positives.shape[1]
    neg_is_pos = (positives == negatives).all(-1)
    positives = positives.unsqueeze(0)
    targets = torch.cat([positives, negatives], dim=0)
    loss, sample_size = compute_contrastive_loss(x, mask_indices, targets, neg_is_pos, reduction)
    return loss, sample_size
```
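To make the expected shapes concrete, here is a toy invocation with random tensors; all sizes are invented for the example, and it assumes ``wav2vec2_loss`` from the file above is in scope. With ``reduction="none"`` (the default), the loss has one entry per masked frame:

```python
import torch

# Hypothetical sizes: batch 2, sequence 6, hidden 16, 5 negatives, 3 masked frames per item.
batch, seq, hidden, num_negative = 2, 6, 16, 5
x = torch.randn(batch, seq, hidden)
mask_indices = torch.zeros(batch, seq, dtype=torch.bool)
mask_indices[:, :3] = True  # 3 masked positions per sequence

positives = torch.randn(batch, 3, hidden)
negatives = torch.randn(num_negative, batch, 3, hidden)

loss, sample_size = wav2vec2_loss(x, mask_indices, positives, negatives)
print(loss.shape, sample_size)  # torch.Size([6]) and 6: one loss entry per masked frame
```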
examples/self_supervised_learning/lr_schedulers/__init__.py (new file, mode 100644)

```python
from ._linear_decay import LinearDecayLRScheduler

__all__ = [
    "LinearDecayLRScheduler",
]
```
examples/self_supervised_learning/lr_schedulers/_linear_decay.py (new file, mode 100644)

```python
import torch
from torch.optim.optimizer import Optimizer


class LinearDecayLRScheduler(torch.optim.lr_scheduler._LRScheduler):
    """Linear learning rate scheduler with warm up."""

    def __init__(
        self,
        optimizer: Optimizer,
        warmup_updates: int,
        max_updates: int,
        last_epoch: int = -1,
        verbose: bool = False,
    ):
        self.warmup_updates = warmup_updates
        self.max_updates = max_updates
        super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose)

    def get_lr(self):
        if self._step_count <= self.warmup_updates:
            # Linear ramp from 0 to the base learning rate.
            return [self._step_count / self.warmup_updates * base_lr for base_lr in self.base_lrs]
        elif self._step_count >= self.max_updates:
            return [0.0 for _ in self.base_lrs]
        else:
            # Linear decay from the base learning rate down to 0.
            pct_remaining = (self.max_updates - self._step_count) / (self.max_updates - self.warmup_updates)
            return [base_lr * pct_remaining for base_lr in self.base_lrs]
```
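A small sketch, with arbitrary warmup/max values, makes the schedule's shape concrete: the LR ramps linearly to the base value over ``warmup_updates`` steps, then decays linearly to zero at ``max_updates``.

```python
import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=1.0)
scheduler = LinearDecayLRScheduler(optimizer, warmup_updates=10, max_updates=100)

lrs = []
for _ in range(100):
    lrs.append(optimizer.param_groups[0]["lr"])
    optimizer.step()
    scheduler.step()

print(lrs[9], lrs[54], lrs[99])  # ~1.0 at the end of warmup, ~0.5 midway through decay, 0.0 at max_updates
```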
examples/self_supervised_learning/train_hubert.py (new file, mode 100644)

```python
import logging
import pathlib
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, RawDescriptionHelpFormatter
from functools import partial
from typing import Dict, Tuple

import torch
import torchaudio.models
from lightning.pytorch import seed_everything, Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

from .data_modules import HuBERTDataModule
from .lightning_modules import SSLPretrainModule
from .losses import hubert_loss
from .lr_schedulers import LinearDecayLRScheduler


class _Formatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
    # To use ArgumentDefaultsHelpFormatter as the formatter_class and
    # RawDescriptionHelpFormatter to add custom formatting to the description or epilog.
    # See: https://stackoverflow.com/a/18462760
    pass


def _compute_accuracy(logits: torch.Tensor):
    with torch.no_grad():
        max = logits.argmax(-1) == 0
        min = logits.argmin(-1) == 0
        both = max & min
        corr = max.long().sum().item() - both.long().sum().item()
        count = max.numel()
    return corr / count


class HuBERTModule(SSLPretrainModule):
    def configure_optimizers(self):
        return (
            [self.optimizer],
            [
                {
                    "scheduler": self.lr_scheduler,
                    "interval": "step",
                },
            ],
        )

    def log_metric(self, batch: Dict, output: Tuple, loss: torch.Tensor, step_type: str):
        logit_m, logit_u, _ = output

        self.log(
            f"{step_type}_loss",
            loss.item(),
            on_step=True,
            on_epoch=True,
        )
        acc_m = _compute_accuracy(logit_m)
        acc_u = _compute_accuracy(logit_u)
        self.log(
            f"{step_type}_acc_m",
            acc_m,
            on_step=True,
            on_epoch=True,
            sync_dist=True,
            prog_bar=step_type == "train",
        )
        self.log(
            f"{step_type}_acc_u",
            acc_u,
            on_step=True,
            on_epoch=True,
            sync_dist=True,
            prog_bar=step_type == "train",
        )


def run_train(args):
    seed_everything(1337)
    checkpoint_dir = args.exp_dir / f"checkpoints_{args.dataset}_{args.model_name}"
    checkpoint = ModelCheckpoint(
        checkpoint_dir,
        monitor="val_loss",
        mode="min",
        save_top_k=5,
        save_weights_only=False,
        verbose=True,
    )
    train_checkpoint = ModelCheckpoint(
        checkpoint_dir,
        monitor="train_loss",
        mode="min",
        save_top_k=5,
        save_weights_only=False,
        verbose=True,
    )
    callbacks = [
        checkpoint,
        train_checkpoint,
    ]
    trainer = Trainer(
        default_root_dir=args.exp_dir,
        max_steps=args.max_updates,
        num_nodes=args.num_nodes,
        devices=args.gpus,
        accelerator="gpu",
        strategy="ddp_find_unused_parameters_true",
        precision=args.precision,
        accumulate_grad_batches=args.accumulate_grad_batches,
        gradient_clip_val=args.clip_norm,
        use_distributed_sampler=False,
        callbacks=callbacks,
        reload_dataloaders_every_n_epochs=1,
    )

    if args.model_name not in ["hubert_pretrain_base", "hubert_pretrain_large", "hubert_pretrain_xlarge"]:
        raise ValueError(
            "Expect model_name to be one of 'hubert_pretrain_base', 'hubert_pretrain_large', 'hubert_pretrain_xlarge'. "
            f"Found {args.model_name}."
        )
    model = getattr(torchaudio.models, args.model_name)()
    loss_fn = partial(
        hubert_loss,
        masked_weight=args.masked_weight,
        unmasked_weight=args.unmasked_weight,
        feature_weight=args.feature_weight,
    )
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
        betas=args.betas,
        eps=args.eps,
        weight_decay=args.weight_decay,
    )
    lr_scheduler = LinearDecayLRScheduler(optimizer, args.warmup_updates, args.max_updates)

    lightning_module = HuBERTModule(
        model,
        loss_fn,
        optimizer,
        lr_scheduler,
    )
    data_module = HuBERTDataModule(
        dataset_path=args.dataset_path,
        dataset="librispeech",
        feature_type="mfcc",
        seconds_per_batch=args.seconds_per_batch,
        train_shuffle=True,
        num_workers=10,
    )
    trainer.fit(lightning_module, datamodule=data_module)


def _parse_args():
    parser = ArgumentParser(
        description=__doc__,
        formatter_class=_Formatter,
    )
    parser.add_argument(
        "--dataset-path",
        type=pathlib.Path,
        required=True,
        help="Path to the feature and label directories.",
    )
    parser.add_argument(
        "--resume-checkpoint",
        type=pathlib.Path,
        default=None,
        help="Path to the checkpoint to resume training from. (Default: None)",
    )
    parser.add_argument(
        "--feature-type",
        choices=["mfcc", "hubert"],
        type=str,
        required=True,
    )
    parser.add_argument(
        "--feature-grad-mult",
        default=0.1,
        type=float,
        help="The scaling factor to multiply the feature extractor gradient. (Default: 0.1)",
    )
    parser.add_argument(
        "--num-classes",
        choices=[100, 500],
        type=int,
        required=True,
        help="The ``num_class`` when building the hubert_pretrain_base model.",
    )
    parser.add_argument(
        "--model-name",
        default="hubert_pretrain_base",
        choices=[
            "hubert_pretrain_base",
            "hubert_pretrain_large",
            "hubert_pretrain_xlarge",
        ],
        type=str,
        help="The HuBERT model to train. (Default: 'hubert_pretrain_base')",
    )
    parser.add_argument(
        "--exp-dir",
        default=pathlib.Path("./exp"),
        type=pathlib.Path,
        help="Directory to save checkpoints and logs to. (Default: './exp')",
    )
    parser.add_argument(
        "--dataset",
        default="librispeech",
        choices=["librispeech", "librilight"],
        type=str,
        help="The dataset for training. (Default: 'librispeech')",
    )
    parser.add_argument(
        "--learning-rate",
        default=0.0005,
        type=float,
        help="The peak learning rate. (Default: 0.0005)",
    )
    parser.add_argument(
        "--betas",
        default=(0.9, 0.98),
        type=Tuple,
        help="The coefficients for computing running averages of gradient and its square (Default: (0.9, 0.98))",
    )
    parser.add_argument(
        "--eps",
        default=1e-6,
        type=float,
        help="Epsilon value in Adam optimizer. (Default: 1e-6)",
    )
    parser.add_argument(
        "--weight-decay",
        default=0.01,
        type=float,
        help="Weight decay (L2 penalty) (Default: 0.01)",
    )
    parser.add_argument(
        "--precision",
        default=16,
        choices=[16, 32, 64, "bf16"],
        help="Precision of model training. (Default: 16)",
    )
    parser.add_argument(
        "--accumulate-grad-batches",
        default=1,
        type=int,
        help="Number of steps for accumulating gradients. (Default: 1)",
    )
    parser.add_argument(
        "--clip-norm",
        default=10.0,
        type=float,
        help="The gradient norm value to clip. (Default: 10.0)",
    )
    parser.add_argument(
        "--num-nodes",
        default=4,
        type=int,
        help="Number of nodes to use for training. (Default: 4)",
    )
    parser.add_argument(
        "--gpus",
        default=8,
        type=int,
        help="Number of GPUs per node to use for training. (Default: 8)",
    )
    parser.add_argument(
        "--warmup-updates",
        default=32000,
        type=int,
        help="Number of steps for warming up the learning rate. (Default: 32000)",
    )
    parser.add_argument(
        "--max-updates",
        default=250000,
        type=int,
        help="Total number of training steps. (Default: 250000)",
    )
    parser.add_argument(
        "--seconds-per-batch",
        default=87.5,
        type=float,
        help="Number of seconds of audio in a mini-batch. (Default: 87.5)",
    )
    parser.add_argument(
        "--masked-weight",
        default=1.0,
        type=float,
        help="The weight for cross-entropy loss of masked frames. (Default: ``1.0``)",
    )
    parser.add_argument(
        "--unmasked-weight",
        default=0.0,
        type=float,
        help="The weight for cross-entropy loss of unmasked frames. (Default: ``0.0``)",
    )
    parser.add_argument(
        "--feature-weight",
        default=10.0,
        type=float,
        help="The weight for feature penalty loss. (Default: ``10.0``)",
    )
    parser.add_argument("--debug", action="store_true", help="whether to use debug level for logging")
    return parser.parse_args()


def _init_logger(debug):
    fmt = "%(asctime)s %(message)s" if debug else "%(message)s"
    level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(format=fmt, level=level, datefmt="%Y-%m-%d %H:%M:%S")


def cli_main():
    args = _parse_args()
    _init_logger(args.debug)
    run_train(args)


if __name__ == "__main__":
    cli_main()
```
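As a usage sketch (not part of the commit itself), the script could be launched like the following; the dataset path is a placeholder, and only the required flags plus a few common ones are shown:

```python
# Hypothetical shell invocation (paths are placeholders):
#
#   python train_hubert.py \
#       --dataset-path /path/to/features_and_labels \
#       --feature-type mfcc \
#       --num-classes 100 \
#       --model-name hubert_pretrain_base \
#       --exp-dir ./exp
```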
examples/source_separation/eval.py (modified)

```diff
@@ -31,7 +31,7 @@ def _eval(model, data_loader, device):
 def cli_main():
     parser = ArgumentParser()
-    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0-mix", "librimix"])
+    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0mix", "librimix"])
     parser.add_argument(
         "--root-dir",
         type=Path,
@@ -79,7 +79,7 @@ def cli_main():
     _, _, eval_loader = _get_dataloader(
         args.dataset,
-        args.data_dir,
+        args.root_dir,
         args.num_speakers,
         args.sample_rate,
         1,  # batch size is set to 1 to avoid masking
```
examples/source_separation/lightning_train.py (modified)

```diff
@@ -308,7 +308,7 @@ def _get_dataloader(
 def cli_main():
     parser = ArgumentParser()
     parser.add_argument("--batch-size", default=6, type=int)
-    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0-mix", "librimix"])
+    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0mix", "librimix"])
     parser.add_argument(
         "--root-dir",
         type=Path,
@@ -412,9 +412,10 @@ def cli_main():
     trainer = Trainer(
         default_root_dir=args.exp_dir,
         max_epochs=args.epochs,
-        gpus=args.num_gpu,
         num_nodes=args.num_node,
+        accelerator="gpu",
         strategy="ddp_find_unused_parameters_false",
+        devices=args.num_gpu,
         limit_train_batches=1.0,  # Useful for fast experiment
         gradient_clip_val=5.0,
         callbacks=callbacks,
```
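The second hunk tracks PyTorch Lightning's API migration: the deprecated ``gpus=`` argument is dropped in favor of ``accelerator=``/``devices=``. A minimal sketch of the two styles (values are illustrative):

```python
from lightning.pytorch import Trainer

# Old (pre-2.0) style, removed in recent Lightning releases:
#   trainer = Trainer(gpus=8, num_nodes=1)

# Current style: choose the accelerator and device count explicitly.
trainer = Trainer(accelerator="gpu", devices=8, num_nodes=1)
```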
examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py (modified)

```diff
@@ -207,6 +207,7 @@ from torchaudio.models.decoder import CTCDecoderLM, CTCDecoderLMState
 class CustomLM(CTCDecoderLM):
     """Create a Python wrapper around `language_model` to feed to the decoder."""
+
     def __init__(self, language_model: torch.nn.Module):
         CTCDecoderLM.__init__(self)
         self.language_model = language_model
@@ -386,6 +387,47 @@ print(f"WER: {beam_search_wer}")
 # and “shoktd”.
 #

+######################################################################
+# Incremental decoding
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# If the input speech is long, one can decode the emission in an
+# incremental manner.
+#
+# You need to first initialize the internal state of the decoder with
+# :py:meth:`~torchaudio.models.decoder.CTCDecoder.decode_begin`.
+
+beam_search_decoder.decode_begin()
+
+######################################################################
+# Then, you can pass emissions to
+# :py:meth:`~torchaudio.models.decoder.CTCDecoder.decode_step`.
+# Here we use the same emission but pass it to the decoder one frame
+# at a time.
+
+for t in range(emission.size(1)):
+    beam_search_decoder.decode_step(emission[0, t : t + 1, :])
+
+######################################################################
+# Finally, finalize the internal state of the decoder, and retrieve the
+# result.
+
+beam_search_decoder.decode_end()
+beam_search_result_inc = beam_search_decoder.get_final_hypothesis()
+
+######################################################################
+# The result of incremental decoding is identical to batch decoding.
+#
+beam_search_transcript_inc = " ".join(beam_search_result_inc[0].words).strip()
+beam_search_wer_inc = torchaudio.functional.edit_distance(
+    actual_transcript, beam_search_result_inc[0].words) / len(actual_transcript)
+
+print(f"Transcript: {beam_search_transcript_inc}")
+print(f"WER: {beam_search_wer_inc}")
+
+assert beam_search_result[0][0].words == beam_search_result_inc[0].words
+assert beam_search_result[0][0].score == beam_search_result_inc[0].score
+torch.testing.assert_close(beam_search_result[0][0].timesteps, beam_search_result_inc[0].timesteps)
+
 ######################################################################
 # Timestep Alignments
@@ -406,30 +448,45 @@ print(timesteps, timesteps.shape[0])
 #

-def plot_alignments(waveform, emission, tokens, timesteps):
-    fig, ax = plt.subplots(figsize=(32, 10))
-    ax.plot(waveform)
-    ratio = waveform.shape[0] / emission.shape[1]
-    word_start = 0
-    for i in range(len(tokens)):
-        if i != 0 and tokens[i - 1] == "|":
-            word_start = timesteps[i]
-        if tokens[i] != "|":
-            plt.annotate(tokens[i].upper(), (timesteps[i] * ratio, waveform.max() * 1.02), size=14)
-        elif i != 0:
-            word_end = timesteps[i]
-            ax.axvspan(word_start * ratio, word_end * ratio, alpha=0.1, color="red")
-    xticks = ax.get_xticks()
-    plt.xticks(xticks, xticks / bundle.sample_rate)
-    ax.set_xlabel("time (sec)")
-    ax.set_xlim(0, waveform.shape[0])
+def plot_alignments(waveform, emission, tokens, timesteps, sample_rate):
+    t = torch.arange(waveform.size(0)) / sample_rate
+    ratio = waveform.size(0) / emission.size(1) / sample_rate
+
+    chars = []
+    words = []
+    word_start = None
+    for token, timestep in zip(tokens, timesteps * ratio):
+        if token == "|":
+            if word_start is not None:
+                words.append((word_start, timestep))
+            word_start = None
+        else:
+            chars.append((token, timestep))
+            if word_start is None:
+                word_start = timestep
+
+    fig, axes = plt.subplots(3, 1)
+
+    def _plot(ax, xlim):
+        ax.plot(t, waveform)
+        for token, timestep in chars:
+            ax.annotate(token.upper(), (timestep, 0.5))
+        for word_start, word_end in words:
+            ax.axvspan(word_start, word_end, alpha=0.1, color="red")
+        ax.set_ylim(-0.6, 0.7)
+        ax.set_yticks([0])
+        ax.grid(True, axis="y")
+        ax.set_xlim(xlim)
+
+    _plot(axes[0], (0.3, 2.5))
+    _plot(axes[1], (2.5, 4.7))
+    _plot(axes[2], (4.7, 6.9))
+    axes[2].set_xlabel("time (sec)")
+    fig.tight_layout()


-plot_alignments(waveform[0], emission, predicted_tokens, timesteps)
+plot_alignments(waveform[0], emission, predicted_tokens, timesteps, bundle.sample_rate)
```
examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py (new file, mode 100644)

```python
"""
ASR Inference with CUDA CTC Decoder
====================================

**Author**: `Yuekai Zhang <yuekaiz@nvidia.com>`__

This tutorial shows how to perform speech recognition inference using a
CUDA-based CTC beam search decoder.
We demonstrate this on a pretrained
`Zipformer <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc>`__
model from the `Next-gen Kaldi <https://nadirapovey.com/next-gen-kaldi-what-is-it>`__ project.
"""

######################################################################
# Overview
# --------
#
# Beam search decoding works by iteratively expanding text hypotheses (beams)
# with next possible characters, and maintaining only the hypotheses with the
# highest scores at each time step.
#
# The underlying implementation uses CUDA to accelerate the whole decoding process.
# A mathematical formula for the decoder can be
# found in the `paper <https://arxiv.org/pdf/1408.2873.pdf>`__, and
# a more detailed algorithm can be found in this `blog
# <https://distill.pub/2017/ctc/>`__.
#
# Running ASR inference using a CUDA CTC Beam Search decoder
# requires the following components:
#
# - Acoustic Model: the model predicting modeling units (BPE in this tutorial) from acoustic features
# - BPE Model: the byte-pair encoding (BPE) tokenizer file
#

######################################################################
# Acoustic Model and Set Up
# -------------------------
#
# First we import the necessary utilities and fetch the data that we are
# working with.
#

import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#

import time
from pathlib import Path

import IPython
import sentencepiece as spm
from torchaudio.models.decoder import cuda_ctc_decoder
from torchaudio.utils import download_asset

######################################################################
#
# We use the pretrained
# `Zipformer <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01>`__
# model that is trained on the `LibriSpeech
# dataset <http://www.openslr.org/12>`__. The model is jointly trained with CTC and Transducer loss functions.
# In this tutorial, we only use the CTC head of the model.


def download_asset_external(url, key):
    path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key)
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        torch.hub.download_url_to_file(url, path)
    return str(path)


url_prefix = "https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01"
model_link = f"{url_prefix}/resolve/main/exp/cpu_jit.pt"
model_path = download_asset_external(model_link, "cuda_ctc_decoder/cpu_jit.pt")

######################################################################
# We will load a sample from the LibriSpeech test-other dataset.
#

speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
waveform, sample_rate = torchaudio.load(speech_file)
assert sample_rate == 16000
IPython.display.Audio(speech_file)

######################################################################
# The transcript corresponding to this audio file is
#
# .. code-block::
#
#    i really was very much afraid of showing him how much shocked i was at some parts of what he said
#

######################################################################
# Files and Data for Decoder
# --------------------------
#
# Next, we load in our tokens from the BPE model, which is the tokenizer for decoding.
#

######################################################################
# Tokens
# ~~~~~~
#
# The tokens are the possible symbols that the acoustic model can predict,
# including the blank symbol in CTC. In this tutorial, it includes 500 BPE tokens.
# It can either be passed in as a
# file, where each line consists of the tokens corresponding to the same
# index, or as a list of tokens, each mapping to a unique index.
#
# .. code-block::
#
#    # tokens
#    <blk>
#    <sos/eos>
#    <unk>
#    S
#    _THE
#    _A
#    T
#    _AND
#    ...
#

bpe_link = f"{url_prefix}/resolve/main/data/lang_bpe_500/bpe.model"
bpe_path = download_asset_external(bpe_link, "cuda_ctc_decoder/bpe.model")

bpe_model = spm.SentencePieceProcessor()
bpe_model.load(bpe_path)
tokens = [bpe_model.id_to_piece(id) for id in range(bpe_model.get_piece_size())]
print(tokens)

######################################################################
# Construct CUDA Decoder
# ----------------------
# In this tutorial, we will construct a CUDA beam search decoder.
# The decoder can be constructed using the factory function
# :py:func:`~torchaudio.models.decoder.cuda_ctc_decoder`.
#

cuda_decoder = cuda_ctc_decoder(tokens, nbest=10, beam_size=10, blank_skip_threshold=0.95)

######################################################################
# Run Inference
# -------------
#
# Now that we have the data, acoustic model, and decoder, we can perform
# inference. The output of the beam search decoder is of type
# :py:class:`~torchaudio.models.decoder.CUCTCHypothesis`, consisting of the
# predicted token IDs, words (symbols corresponding to the token IDs), and hypothesis scores.
# Recall the transcript corresponding to the
# waveform is
#
# .. code-block::
#
#    i really was very much afraid of showing him how much shocked i was at some parts of what he said
#

actual_transcript = "i really was very much afraid of showing him how much shocked i was at some parts of what he said"
actual_transcript = actual_transcript.split()

device = torch.device("cuda", 0)
acoustic_model = torch.jit.load(model_path)
acoustic_model.to(device)
acoustic_model.eval()

waveform = waveform.to(device)

feat = torchaudio.compliance.kaldi.fbank(waveform, num_mel_bins=80, snip_edges=False)
feat = feat.unsqueeze(0)
feat_lens = torch.tensor(feat.size(1), device=device).unsqueeze(0)

encoder_out, encoder_out_lens = acoustic_model.encoder(feat, feat_lens)
nnet_output = acoustic_model.ctc_output(encoder_out)
log_prob = torch.nn.functional.log_softmax(nnet_output, -1)

print(f"The shape of log_prob: {log_prob.shape}, the shape of encoder_out_lens: {encoder_out_lens.shape}")

######################################################################
# The CUDA CTC decoder gives the following result.
#

results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
beam_search_transcript = bpe_model.decode(results[0][0].tokens).lower()
beam_search_wer = torchaudio.functional.edit_distance(actual_transcript, beam_search_transcript.split()) / len(
    actual_transcript
)

print(f"Transcript: {beam_search_transcript}")
print(f"WER: {beam_search_wer}")

######################################################################
# Beam Search Decoder Parameters
# ------------------------------
#
# In this section, we go a little bit more in depth about some different
# parameters and tradeoffs. For the full list of customizable parameters,
# please refer to the
# :py:func:`documentation <torchaudio.models.decoder.cuda_ctc_decoder>`.
#

######################################################################
# Helper Function
# ~~~~~~~~~~~~~~~
#


def print_decoded(cuda_decoder, bpe_model, log_prob, encoder_out_lens, param, param_value):
    start_time = time.monotonic()
    results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
    decode_time = time.monotonic() - start_time
    transcript = bpe_model.decode(results[0][0].tokens).lower()
    score = results[0][0].score
    print(f"{param} {param_value:<3}: {transcript} (score: {score:.2f}; {decode_time:.4f} secs)")


######################################################################
# nbest
# ~~~~~
#
# This parameter indicates the number of best hypotheses to return. For
# instance, by setting ``nbest=10`` when constructing the beam search
# decoder earlier, we can now access the hypotheses with the top 10 scores.
#

for i in range(10):
    transcript = bpe_model.decode(results[0][i].tokens).lower()
    score = results[0][i].score
    print(f"{transcript} (score: {score})")

######################################################################
# beam size
# ~~~~~~~~~
#
# The ``beam_size`` parameter determines the maximum number of best
# hypotheses to hold after each decoding step. Using larger beam sizes
# allows for exploring a larger range of possible hypotheses which can
# produce hypotheses with higher scores, but it does not provide additional gains beyond a certain point.
# We recommend setting ``beam_size=10`` for the CUDA beam search decoder.
#
# In the example below, we see improvement in decoding quality as we
# increase beam size from 1 to 3, but notice how using a beam size
# of 3 provides the same output as beam size 10.
#

beam_sizes = [1, 2, 3, 10]

for beam_size in beam_sizes:
    beam_search_decoder = cuda_ctc_decoder(
        tokens,
        nbest=1,
        beam_size=beam_size,
        blank_skip_threshold=0.95,
    )
    print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "beam size", beam_size)

######################################################################
# blank skip threshold
# ~~~~~~~~~~~~~~~~~~~~
#
# The ``blank_skip_threshold`` parameter is used to prune frames which have a large blank probability.
# Pruning these frames with a good ``blank_skip_threshold`` can greatly speed up the
# decoding process with no accuracy drop.
# Following the rules of CTC, we keep at least one blank frame between two non-blank frames
# to avoid mistakenly merging two consecutive identical symbols.
# We recommend setting ``blank_skip_threshold=0.95`` for the CUDA beam search decoder.
#

blank_skip_probs = [0.25, 0.95, 1.0]

for blank_skip_prob in blank_skip_probs:
    beam_search_decoder = cuda_ctc_decoder(
        tokens,
        nbest=10,
        beam_size=10,
        blank_skip_threshold=blank_skip_prob,
    )
    print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "blank_skip_threshold", blank_skip_prob)

del cuda_decoder

######################################################################
# Benchmark with flashlight CPU decoder
# -------------------------------------
# We benchmark the throughput and accuracy of the CUDA decoder against the CPU decoder using the LibriSpeech test-other set.
# To reproduce the benchmark results below, you may refer `here <https://github.com/pytorch/audio/tree/main/examples/asr/librispeech_cuda_ctc_decoder>`__.
#
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | Decoder      | Setting                                  | WER (%) | N-Best Oracle WER (%) | Decoder Cost Time (seconds) |
# +==============+==========================================+=========+=======================+=============================+
# | CUDA decoder | blank_skip_threshold 0.95                | 5.81    | 4.11                  | 2.57                        |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CUDA decoder | blank_skip_threshold 1.0 (no frame-skip) | 5.81    | 4.09                  | 6.24                        |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder  | beam_size_token 10                       | 5.86    | 4.30                  | 28.61                       |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder  | beam_size_token 500                      | 5.86    | 4.30                  | 791.80                      |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
#
# From the above table, the CUDA decoder gives a slight improvement in WER and a significant increase in throughput.
```
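The WER numbers in the table are edit-distance-based, as in the tutorial code above. As a toy illustration of the metric (the sentences are invented for the example):

```python
import torchaudio.functional as F

# Word error rate = edit distance over reference length, computed on word lists.
ref = "i really was very much afraid".split()
hyp = "i really was very much afred".split()
wer = F.edit_distance(ref, hyp) / len(ref)
print(f"WER: {wer:.3f}")  # one substitution over six words ≈ 0.167
```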
examples/tutorials/audio_data_augmentation_tutorial.py (modified)

```diff
@@ -20,6 +20,8 @@ import torchaudio.functional as F
 print(torch.__version__)
 print(torchaudio.__version__)

+import matplotlib.pyplot as plt
+
 ######################################################################
 # Preparation
 # -----------
@@ -27,10 +29,7 @@
 # First, we import the modules and download the audio assets we use in this tutorial.
 #

-import math
-
 from IPython.display import Audio
-import matplotlib.pyplot as plt
 from torchaudio.utils import download_asset
@@ -44,56 +43,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st
 # Applying effects and filtering
 # ------------------------------
 #
-# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to
-# those available in ``sox`` to Tensor objects and file object audio sources.
-#
-# There are two functions for this:
-#
-# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects
-#   to Tensor.
-# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to
-#   other audio sources.
-#
-# Both functions accept effect definitions in the form
-# ``List[List[str]]``.
-# This is mostly consistent with how ``sox`` command works, but one caveat is
-# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s
-# implementation does not.
-#
-# For the list of available effects, please refer to `the sox
-# documentation <http://sox.sourceforge.net/sox.html>`__.
-#
-# **Tip** If you need to load and resample your audio data on the fly,
-# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file`
-# with effect ``"rate"``.
-#
-# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a
-# file-like object or path-like object.
-# Similar to :py:func:`torchaudio.load`, when the audio format cannot be
-# inferred from either the file extension or header, you can provide
-# argument ``format`` to specify the format of the audio source.
-#
-# **Note** This process is not differentiable.
+# :py:class:`torchaudio.io.AudioEffector` allows for directly applying
+# filters and codecs to Tensor objects, in a similar way as the ``ffmpeg``
+# command.
+#
+# `AudioEffector Usages <./effector_tutorial.html>`__ explains how to use
+# this class, so for the detail, please refer to the tutorial.
 #

 # Load the data
-waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV)
+waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False)

 # Define effects
-effects = [
-    ["lowpass", "-1", "300"],  # apply single-pole lowpass filter
-    ["speed", "0.8"],  # reduce the speed
-    # This only changes sample rate, so it is necessary to
-    # add `rate` effect with original sample rate after this.
-    ["rate", f"{sample_rate1}"],
-    ["reverb", "-w"],  # Reverbration gives some dramatic feeling
-]
+effect = ",".join(
+    [
+        "lowpass=frequency=300:poles=1",  # apply single-pole lowpass filter
+        "atempo=0.8",  # reduce the speed
+        "aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3",
+        # Applying echo gives some dramatic feeling
+    ],
+)

 # Apply effects
-waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects)
+def apply_effect(waveform, sample_rate, effect):
+    effector = torchaudio.io.AudioEffector(effect=effect)
+    return effector.apply(waveform, sample_rate)
+
+
+waveform2 = apply_effect(waveform1, sample_rate, effect)

-print(waveform1.shape, sample_rate1)
-print(waveform2.shape, sample_rate2)
+print(waveform1.shape, sample_rate)
+print(waveform2.shape, sample_rate)
@@ -101,6 +82,7 @@
 # audio.
 #

+
 def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
     waveform = waveform.numpy()
@@ -118,11 +100,12 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
         if xlim:
             axes[c].set_xlim(xlim)
     figure.suptitle(title)
-    plt.show(block=False)


 ######################################################################
 #

+
 def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
     waveform = waveform.numpy()
@@ -138,29 +121,26 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
         if xlim:
             axes[c].set_xlim(xlim)
     figure.suptitle(title)
-    plt.show(block=False)


 ######################################################################
-# Original:
-# ~~~~~~~~~
+# Original
+# ~~~~~~~~
 #

-plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2))
-plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04))
-Audio(waveform1, rate=sample_rate1)
+plot_waveform(waveform1.T, sample_rate, title="Original", xlim=(-0.1, 3.2))
+plot_specgram(waveform1.T, sample_rate, title="Original", xlim=(0, 3.04))
+Audio(waveform1.T, rate=sample_rate)

 ######################################################################
-# Effects applied:
-# ~~~~~~~~~~~~~~~~
+# Effects applied
+# ~~~~~~~~~~~~~~~
 #

-plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2))
-plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04))
-Audio(waveform2, rate=sample_rate2)
+plot_waveform(waveform2.T, sample_rate, title="Effects Applied", xlim=(-0.1, 3.2))
+plot_specgram(waveform2.T, sample_rate, title="Effects Applied", xlim=(0, 3.04))
+Audio(waveform2.T, rate=sample_rate)

 ######################################################################
 # Doesn’t it sound more dramatic?
 #
@@ -185,28 +165,26 @@ plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
 Audio(rir_raw, rate=sample_rate)

 ######################################################################
-# First, we need to clean up the RIR. We extract the main impulse, normalize
-# the signal power, then flip along the time axis.
+# First, we need to clean up the RIR. We extract the main impulse and normalize
+# it by its power.
 #

 rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)]
-rir = rir / torch.norm(rir, p=2)
-RIR = torch.flip(rir, [1])
+rir = rir / torch.linalg.vector_norm(rir, ord=2)

 plot_waveform(rir, sample_rate, title="Room Impulse Response")

 ######################################################################
-# Then, we convolve the speech signal with the RIR filter.
+# Then, using :py:func:`torchaudio.functional.fftconvolve`,
+# we convolve the speech signal with the RIR.
 #

 speech, _ = torchaudio.load(SAMPLE_SPEECH)
-speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0))
-augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
+augmented = F.fftconvolve(speech, rir)

 ######################################################################
-# Original:
-# ~~~~~~~~~
+# Original
+# ~~~~~~~~
 #

 plot_waveform(speech, sample_rate, title="Original")
@@ -214,8 +192,8 @@ plot_specgram(speech, sample_rate, title="Original")
 Audio(speech, rate=sample_rate)

 ######################################################################
-# RIR applied:
-# ~~~~~~~~~~~~
+# RIR applied
+# ~~~~~~~~~~~
 #

 plot_waveform(augmented, sample_rate, title="RIR Applied")
@@ -227,33 +205,31 @@ Audio(augmented, rate=sample_rate)
 # Adding background noise
 # -----------------------
 #
-# To add background noise to audio data, you can simply add a noise Tensor to
-# the Tensor representing the audio data. A common method to adjust the
-# intensity of noise is changing the Signal-to-Noise Ratio (SNR).
-# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__]
+# To introduce background noise to audio data, we can add a noise Tensor to
+# the Tensor representing the audio data according to some desired
+# signal-to-noise ratio (SNR)
+# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__],
+# which determines the intensity of the audio data relative to that of the noise
+# in the output.
 #
 # $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
 #
 # $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
 #
+# To add noise to audio data per SNRs, we
+# use :py:func:`torchaudio.functional.add_noise`.

 speech, _ = torchaudio.load(SAMPLE_SPEECH)
 noise, _ = torchaudio.load(SAMPLE_NOISE)
 noise = noise[:, : speech.shape[1]]

-speech_rms = speech.norm(p=2)
-noise_rms = noise.norm(p=2)
-
-snr_dbs = [20, 10, 3]
-noisy_speeches = []
-for snr_db in snr_dbs:
-    snr = 10 ** (snr_db / 20)
-    scale = snr * noise_rms / speech_rms
-    noisy_speeches.append((scale * speech + noise) / 2)
+snr_dbs = torch.tensor([20, 10, 3])
+noisy_speeches = F.add_noise(speech, noise, snr_dbs)

 ######################################################################
-# Background noise:
-# ~~~~~~~~~~~~~~~~~
+# Background noise
+# ~~~~~~~~~~~~~~~~
 #

 plot_waveform(noise, sample_rate, title="Background noise")
@@ -261,31 +237,31 @@ plot_specgram(noise, sample_rate, title="Background noise")
 Audio(noise, rate=sample_rate)

 ######################################################################
-# SNR 20 dB:
-# ~~~~~~~~~~
+# SNR 20 dB
+# ~~~~~~~~~
 #

-snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0]
+snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)

 ######################################################################
-# SNR 10 dB:
-# ~~~~~~~~~~
+# SNR 10 dB
+# ~~~~~~~~~
 #

-snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1]
+snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)

 ######################################################################
-# SNR 3 dB:
-# ~~~~~~~~~
+# SNR 3 dB
+# ~~~~~~~~
 #

-snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2]
+snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3]
 plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
 Audio(noisy_speech, rate=sample_rate)
@@ -295,60 +271,56 @@ Audio(noisy_speech, rate=sample_rate)
 # Applying codec to Tensor object
 # -------------------------------
 #
-# :py:func:`torchaudio.functional.apply_codec` can apply codecs to
+# :py:class:`torchaudio.io.AudioEffector` can also apply codecs to
 # a Tensor object.
 #
 # **Note** This process is not differentiable.
 #

-waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH)
+waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False)

-configs = [
-    {"format": "wav", "encoding": "ULAW", "bits_per_sample": 8},
-    {"format": "gsm"},
-    {"format": "vorbis", "compression": -1},
-]
-waveforms = []
-for param in configs:
-    augmented = F.apply_codec(waveform, sample_rate, **param)
-    waveforms.append(augmented)
+
+def apply_codec(waveform, sample_rate, format, encoder=None):
+    encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder)
+    return encoder.apply(waveform, sample_rate)

 ######################################################################
-# Original:
-# ~~~~~~~~~
+# Original
+# ~~~~~~~~
 #

-plot_waveform(waveform, sample_rate, title="Original")
-plot_specgram(waveform, sample_rate, title="Original")
-Audio(waveform, rate=sample_rate)
+plot_waveform(waveform.T, sample_rate, title="Original")
+plot_specgram(waveform.T, sample_rate, title="Original")
+Audio(waveform.T, rate=sample_rate)

 ######################################################################
-# 8 bit mu-law:
-# ~~~~~~~~~~~~~
+# 8 bit mu-law
+# ~~~~~~~~~~~~
 #

-plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law")
-plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law")
-Audio(waveforms[0], rate=sample_rate)
+mulaw = apply_codec(waveform, sample_rate, "wav", encoder="pcm_mulaw")
+plot_waveform(mulaw.T, sample_rate, title="8 bit mu-law")
+plot_specgram(mulaw.T, sample_rate, title="8 bit mu-law")
+Audio(mulaw.T, rate=sample_rate)

 ######################################################################
-# GSM-FR:
-# ~~~~~~~
+# G.722
+# ~~~~~
 #

-plot_waveform(waveforms[1], sample_rate, title="GSM-FR")
-plot_specgram(waveforms[1], sample_rate, title="GSM-FR")
-Audio(waveforms[1], rate=sample_rate)
+g722 = apply_codec(waveform, sample_rate, "g722")
+plot_waveform(g722.T, sample_rate, title="G.722")
+plot_specgram(g722.T, sample_rate, title="G.722")
+Audio(g722.T, rate=sample_rate)

 ######################################################################
-# Vorbis:
-# ~~~~~~~
+# Vorbis
+# ~~~~~~
 #

-plot_waveform(waveforms[2], sample_rate, title="Vorbis")
-plot_specgram(waveforms[2], sample_rate, title="Vorbis")
-Audio(waveforms[2], rate=sample_rate)
+vorbis = apply_codec(waveform, sample_rate, "ogg", encoder="vorbis")
+plot_waveform(vorbis.T, sample_rate, title="Vorbis")
+plot_specgram(vorbis.T, sample_rate, title="Vorbis")
+Audio(vorbis.T, rate=sample_rate)
@@ -365,8 +337,7 @@ original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
 plot_specgram(original_speech, sample_rate, title="Original")

 # Apply RIR
-speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0))
-rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
+rir_applied = F.fftconvolve(speech, rir)

 plot_specgram(rir_applied, sample_rate, title="RIR Applied")
@@ -377,69 +348,60 @@ plot_specgram(rir_applied, sample_rate, title="RIR Applied")
 noise, _ = torchaudio.load(SAMPLE_NOISE)
 noise = noise[:, : rir_applied.shape[1]]

-snr_db = 8
-scale = (10 ** (snr_db / 20)) * noise.norm(p=2) / rir_applied.norm(p=2)
-bg_added = (scale * rir_applied + noise) / 2
+snr_db = torch.tensor([8])
+bg_added = F.add_noise(rir_applied, noise, snr_db)

 plot_specgram(bg_added, sample_rate, title="BG noise added")

 # Apply filtering and change sample rate
-filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(
-    bg_added,
-    sample_rate,
-    effects=[
-        ["lowpass", "4000"],
-        [
-            "compand",
-            "0.02,0.05",
-            "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8",
-            "-8",
-            "-7",
-            "0.05",
-        ],
-        ["rate", "8000"],
-    ],
-)
-
-plot_specgram(filtered, sample_rate2, title="Filtered")
+effect = ",".join(
+    [
+        "lowpass=frequency=4000:poles=1",
+        "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05",
+    ]
+)
+
+filtered = apply_effect(bg_added.T, sample_rate, effect)
+sample_rate2 = 8000

-# Apply telephony codec
-codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm")
-plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied")
+plot_specgram(filtered.T, sample_rate2, title="Filtered")
+
+# Apply telephony codec
+codec_applied = apply_codec(filtered, sample_rate2, "g722")
+plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied")

 ######################################################################
-# Original speech:
-# ~~~~~~~~~~~~~~~~
+# Original speech
+# ~~~~~~~~~~~~~~~
 #

 Audio(original_speech, rate=sample_rate)

 ######################################################################
-# RIR applied:
-# ~~~~~~~~~~~~
+# RIR applied
+# ~~~~~~~~~~~
 #

 Audio(rir_applied, rate=sample_rate)

 ######################################################################
-# Background noise added:
-# ~~~~~~~~~~~~~~~~~~~~~~~
+# Background noise added
+# ~~~~~~~~~~~~~~~~~~~~~~
 #

 Audio(bg_added, rate=sample_rate)

 ######################################################################
-# Filtered:
-# ~~~~~~~~~
+# Filtered
+# ~~~~~~~~
 #

-Audio(filtered, rate=sample_rate2)
+Audio(filtered.T, rate=sample_rate2)

 ######################################################################
-# Codec applied:
-# ~~~~~~~~~~~~~~
+# Codec applied
+# ~~~~~~~~~~~~~
 #

-Audio(codec_applied, rate=sample_rate2)
+Audio(codec_applied.T, rate=sample_rate2)
```
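To tie the SNR formulas in the hunk above to ``F.add_noise``, the following sketch checks that the returned mixtures hit the requested SNRs; the waveforms are random and for illustration only.

```python
import torch
import torchaudio.functional as F

speech = torch.randn(1, 16000)
noise = torch.randn(1, 16000)
snr_dbs = torch.tensor([20, 10, 3])

# add_noise scales the noise so that each output row has the requested SNR.
noisy = F.add_noise(speech, noise, snr_dbs)  # shape: (3, 16000), one mixture per SNR

residual = noisy - speech  # recover the scaled noise component
measured = 10 * torch.log10(speech.pow(2).sum(-1) / residual.pow(2).sum(-1))
print(measured)  # approximately tensor([20., 10., 3.])
```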
examples/tutorials/audio_datasets_tutorial.py (modified)

```diff
-# -*- coding: utf-8 -*-
 """
 Audio Datasets
 ==============
@@ -10,10 +9,6 @@ datasets. Please refer to the official documentation for the list of
 available datasets.
 """

-# When running this tutorial in Google Colab, install the required packages
-# with the following.
-#
-# !pip install torchaudio

 import torch
 import torchaudio
@@ -21,22 +16,13 @@ print(torch.__version__)
 print(torchaudio.__version__)

 ######################################################################
-# Preparing data and utility functions (skip this section)
-# --------------------------------------------------------
-#
-# @title Prepare data and utility functions. {display-mode: "form"}
-# @markdown
-# @markdown You do not need to look into this cell.
-# @markdown Just execute once and you are good to go.
+# -------------------------------------------------------------------------------
+# Preparation of data and helper functions.
+# -------------------------------------------------------------------------------

 import os

+import IPython
 import matplotlib.pyplot as plt
-from IPython.display import Audio, display

 _SAMPLE_DIR = "_assets"
@@ -44,34 +30,13 @@ YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no")
 os.makedirs(YESNO_DATASET_PATH, exist_ok=True)


-def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
+def plot_specgram(waveform, sample_rate, title="Spectrogram"):
     waveform = waveform.numpy()

-    num_channels, _ = waveform.shape
-
-    figure, axes = plt.subplots(num_channels, 1)
-    if num_channels == 1:
-        axes = [axes]
-    for c in range(num_channels):
-        axes[c].specgram(waveform[c], Fs=sample_rate)
-        if num_channels > 1:
-            axes[c].set_ylabel(f"Channel {c+1}")
-        if xlim:
-            axes[c].set_xlim(xlim)
+    figure, ax = plt.subplots()
+    ax.specgram(waveform[0], Fs=sample_rate)
     figure.suptitle(title)
-    plt.show(block=False)
-
-
-def play_audio(waveform, sample_rate):
-    waveform = waveform.numpy()
-
-    num_channels, _ = waveform.shape
-    if num_channels == 1:
-        display(Audio(waveform[0], rate=sample_rate))
-    elif num_channels == 2:
-        display(Audio((waveform[0], waveform[1]), rate=sample_rate))
-    else:
-        raise ValueError("Waveform with more than 2 channels are not supported.")
+    figure.tight_layout()
@@ -79,10 +44,25 @@ def play_audio(waveform, sample_rate):
 # :py:class:`torchaudio.datasets.YESNO` dataset.
 #

 dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True)

-for i in [1, 3, 5]:
-    waveform, sample_rate, label = dataset[i]
-    plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
-    play_audio(waveform, sample_rate)
+i = 1
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
+
+######################################################################
+#
+
+i = 3
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
+
+######################################################################
+#
+
+i = 5
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
```
examples/tutorials/audio_feature_augmentation_tutorial.py
View file @
ffeba11a
...
@@ -19,25 +19,20 @@ print(torch.__version__)
print(torchaudio.__version__)

######################################################################
-# Preparing data and utility functions (skip this section)
-# --------------------------------------------------------
+# Preparation
+# -----------
#
-# @title Prepare data and utility functions. {display-mode: "form"}
-# @markdown
-# @markdown You do not need to look into this cell.
-# @markdown Just execute once and you are good to go.
-# @markdown
-# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/),
-# @markdown which is licensed under Creative Commons BY 4.0.
-# -------------------------------------------------------------------------------
-# Preparation of data and helper functions.
-# -------------------------------------------------------------------------------
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio
from torchaudio.utils import download_asset

+######################################################################
+# In this tutorial, we will use speech data from the
+# `VOiCES dataset <https://iqtlabs.github.io/voices/>`__,
+# which is licensed under Creative Commons BY 4.0.

SAMPLE_WAV_SPEECH_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
...
@@ -75,18 +70,6 @@ def get_spectrogram(
     return spectrogram(waveform)

-def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None):
-    fig, axs = plt.subplots(1, 1)
-    axs.set_title(title or "Spectrogram (db)")
-    axs.set_ylabel(ylabel)
-    axs.set_xlabel("frame")
-    im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect)
-    if xmax:
-        axs.set_xlim((0, xmax))
-    fig.colorbar(im, ax=axs)
-    plt.show(block=False)

######################################################################
# SpecAugment
# -----------
...
@@ -108,43 +91,79 @@ def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=No
spec = get_spectrogram(power=None)
stretch = T.TimeStretch()

-rate = 1.2
-spec_ = stretch(spec, rate)
-plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304)
+spec_12 = stretch(spec, overriding_rate=1.2)
+spec_09 = stretch(spec, overriding_rate=0.9)
+
+######################################################################
+# Visualization
+# ~~~~~~~~~~~~~
+def plot():
+    def plot_spec(ax, spec, title):
+        ax.set_title(title)
+        ax.imshow(librosa.amplitude_to_db(spec), origin="lower", aspect="auto")
+
+    fig, axes = plt.subplots(3, 1, sharex=True, sharey=True)
+    plot_spec(axes[0], torch.abs(spec_12[0]), title="Stretched x1.2")
+    plot_spec(axes[1], torch.abs(spec[0]), title="Original")
+    plot_spec(axes[2], torch.abs(spec_09[0]), title="Stretched x0.9")
+    fig.tight_layout()

-plot_spectrogram(torch.abs(spec[0]), title="Original", aspect="equal", xmax=304)
-
-rate = 0.9
-spec_ = stretch(spec, rate)
-plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304)
+plot()
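As a quick aside, here is a minimal sketch (not part of the commit) of how the two ways of specifying the stretch rate relate; it assumes the complex-valued spectrogram that ``T.TimeStretch`` requires, which is why the example above builds one with ``power=None``.

import torch
import torchaudio.transforms as T

# TimeStretch operates on a complex spectrogram (power=None).
complex_spec = T.Spectrogram(power=None)(torch.randn(1, 16000))

# The rate can be fixed when the transform is constructed ...
fast = T.TimeStretch(fixed_rate=1.2)(complex_spec)

# ... or supplied per call, as the new ``overriding_rate`` argument in this diff does.
slow = T.TimeStretch()(complex_spec, overriding_rate=0.9)

print(complex_spec.shape, fast.shape, slow.shape)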
######################################################################
-# TimeMasking
-# -----------
+# Audio Samples
+# ~~~~~~~~~~~~~
#
+def preview(spec, rate=16000):
+    ispec = T.InverseSpectrogram()
+    waveform = ispec(spec)
+    return Audio(waveform[0].numpy().T, rate=rate)

-torch.random.manual_seed(4)
-spec = get_spectrogram()
-plot_spectrogram(spec[0], title="Original")
-masking = T.TimeMasking(time_mask_param=80)
-spec = masking(spec)
-plot_spectrogram(spec[0], title="Masked along time axis")
+preview(spec)

######################################################################
-# FrequencyMasking
-# ----------------
#
+preview(spec_12)

+######################################################################
+#
+preview(spec_09)

+######################################################################
+# Time and Frequency Masking
+# --------------------------
+#
+torch.random.manual_seed(4)
+time_masking = T.TimeMasking(time_mask_param=80)
+freq_masking = T.FrequencyMasking(freq_mask_param=80)
+
+spec = get_spectrogram()
+time_masked = time_masking(spec)
+freq_masked = freq_masking(spec)

-torch.random.manual_seed(4)
-masking = T.FrequencyMasking(freq_mask_param=80)
-spec = masking(spec)
-plot_spectrogram(spec[0], title="Masked along frequency axis")
+######################################################################
+#
+def plot():
+    def plot_spec(ax, spec, title):
+        ax.set_title(title)
+        ax.imshow(librosa.power_to_db(spec), origin="lower", aspect="auto")
+
+    fig, axes = plt.subplots(3, 1, sharex=True, sharey=True)
+    plot_spec(axes[0], spec[0], title="Original")
+    plot_spec(axes[1], time_masked[0], title="Masked along time axis")
+    plot_spec(axes[2], freq_masked[0], title="Masked along frequency axis")
+    fig.tight_layout()
+
+plot()
examples/tutorials/audio_feature_extractions_tutorial.py
View file @ ffeba11a
...
@@ -25,6 +25,23 @@ import torchaudio.transforms as T
print(torch.__version__)
print(torchaudio.__version__)

+import librosa
+import matplotlib.pyplot as plt
+
+######################################################################
+# Overview of audio features
+# --------------------------
+#
+# The following diagram shows the relationship between common audio features
+# and torchaudio APIs to generate them.
+#
+# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png
+#
+# For the complete list of available features, please refer to the
+# documentation.
+#

######################################################################
# Preparation
# -----------
...
@@ -38,8 +55,7 @@ print(torchaudio.__version__)
# !pip install librosa
#
from IPython.display import Audio
-import librosa
-import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
from torchaudio.utils import download_asset

torch.random.manual_seed(0)
...
@@ -47,27 +63,27 @@ torch.random.manual_seed(0)
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")

-def plot_waveform(waveform, sr, title="Waveform"):
+def plot_waveform(waveform, sr, title="Waveform", ax=None):
     waveform = waveform.numpy()

     num_channels, num_frames = waveform.shape
     time_axis = torch.arange(0, num_frames) / sr

-    figure, axes = plt.subplots(num_channels, 1)
-    axes.plot(time_axis, waveform[0], linewidth=1)
-    axes.grid(True)
-    figure.suptitle(title)
-    plt.show(block=False)
+    if ax is None:
+        _, ax = plt.subplots(num_channels, 1)
+    ax.plot(time_axis, waveform[0], linewidth=1)
+    ax.grid(True)
+    ax.set_xlim([0, time_axis[-1]])
+    ax.set_title(title)

-def plot_spectrogram(specgram, title=None, ylabel="freq_bin"):
-    fig, axs = plt.subplots(1, 1)
-    axs.set_title(title or "Spectrogram (db)")
-    axs.set_ylabel(ylabel)
-    axs.set_xlabel("frame")
-    im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto")
-    fig.colorbar(im, ax=axs)
-    plt.show(block=False)
+def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
+    if ax is None:
+        _, ax = plt.subplots(1, 1)
+    if title is not None:
+        ax.set_title(title)
+    ax.set_ylabel(ylabel)
+    ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest")

def plot_fbank(fbank, title=None):
...
@@ -76,21 +92,6 @@ def plot_fbank(fbank, title=None):
     axs.imshow(fbank, aspect="auto")
     axs.set_ylabel("frequency bin")
     axs.set_xlabel("mel bin")
-    plt.show(block=False)

-######################################################################
-# Overview of audio features
-# --------------------------
-#
-# The following diagram shows the relationship between common audio features
-# and torchaudio APIs to generate them.
-#
-# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png
-#
-# For the complete list of available features, please refer to the
-# documentation.
-#

######################################################################
...
@@ -101,77 +102,157 @@ def plot_fbank(fbank, title=None):
# you can use :py:func:`torchaudio.transforms.Spectrogram`.
#
# Load audio
SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH)

-plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform")
-Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)
-
-######################################################################
-#
-n_fft = 1024
-win_length = None
-hop_length = 512
-
# Define transform
-spectrogram = T.Spectrogram(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-    center=True,
-    pad_mode="reflect",
-    power=2.0,
-)
+spectrogram = T.Spectrogram(n_fft=512)

# Perform transform
spec = spectrogram(SPEECH_WAVEFORM)

######################################################################
#
+fig, axs = plt.subplots(2, 1)
+plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform", ax=axs[0])
+plot_spectrogram(spec[0], title="spectrogram", ax=axs[1])
+fig.tight_layout()

+######################################################################
+#
+Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)

+######################################################################
+# The effect of ``n_fft`` parameter
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The core of spectrogram computation is the (short-term) Fourier transform,
+# and the ``n_fft`` parameter corresponds to :math:`N` in the following
+# definition of the discrete Fourier transform.
+#
+# $$ X_k = \\sum_{n=0}^{N-1} x_n e^{-\\frac{2\\pi i}{N} nk} $$
+#
+# (For the detail of the Fourier transform, please refer to
+# `Wikipedia <https://en.wikipedia.org/wiki/Fast_Fourier_transform>`__.)
+#
+# The value of ``n_fft`` determines the resolution of the frequency axis.
+# However, with a higher ``n_fft`` value, the energy will be distributed
+# among more bins, so when you visualize it, it might look more blurry,
+# even though the resolution is higher.
+#
+# The following illustrates this.
+#
+# .. note::
+#
+#    ``hop_length`` determines the time axis resolution.
+#    By default (i.e. ``hop_length=None`` and ``win_length=None``),
+#    the value of ``n_fft // 4`` is used.
+#    Here we use the same ``hop_length`` value across different ``n_fft``
+#    so that they have the same number of elements in the time axis.
+#
+n_ffts = [32, 128, 512, 2048]
+hop_length = 64
+
+specs = []
+for n_fft in n_ffts:
+    spectrogram = T.Spectrogram(n_fft=n_fft, hop_length=hop_length)
+    spec = spectrogram(SPEECH_WAVEFORM)
+    specs.append(spec)

######################################################################
#
-plot_spectrogram(spec[0], title="torchaudio")
+fig, axs = plt.subplots(len(specs), 1, sharex=True)
+for i, (spec, n_fft) in enumerate(zip(specs, n_ffts)):
+    plot_spectrogram(spec[0], ylabel=f"n_fft={n_fft}", ax=axs[i])
+    axs[i].set_xlabel(None)
+fig.tight_layout()
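To put numbers on the resolution trade-off, here is a quick back-of-the-envelope check (a sketch, assuming the 16 kHz sample rate of the clips used in this tutorial): a onesided spectrogram has ``n_fft // 2 + 1`` frequency bins, and each bin spans ``sample_rate / n_fft`` Hz, so a larger ``n_fft`` means more, narrower bins over the same 0-to-Nyquist range.

sample_rate = 16000

for n_fft in [32, 128, 512, 2048]:
    num_bins = n_fft // 2 + 1        # onesided output has n_fft // 2 + 1 bins
    hz_per_bin = sample_rate / n_fft  # frequency span of each bin
    print(f"n_fft={n_fft:4d}: {num_bins:4d} bins, {hz_per_bin:7.1f} Hz per bin")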
######################################################################
-# GriffinLim
-# ----------
-#
-# To recover a waveform from a spectrogram, you can use ``GriffinLim``.
+# When comparing signals, it is desirable to use the same sampling rate,
+# however if you must use a different sampling rate, care must be
+# taken when interpreting the meaning of ``n_fft``.
+# Recall that ``n_fft`` determines the resolution of the frequency
+# axis for a given sampling rate. In other words, what each bin on
+# the frequency axis represents is subject to the sampling rate.
+#
+# As we have seen above, changing the value of ``n_fft`` does not change
+# the coverage of the frequency range for the same input signal.

-torch.random.manual_seed(0)
-
-n_fft = 1024
-win_length = None
-hop_length = 512
-
-spec = T.Spectrogram(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-)(SPEECH_WAVEFORM)

######################################################################
#
+# Let's downsample the audio and apply spectrogram with the same ``n_fft``
+# value.
+
+# Downsample to half of the original sample rate
+speech2 = torchaudio.functional.resample(SPEECH_WAVEFORM, SAMPLE_RATE, SAMPLE_RATE // 2)
+
+# Upsample to the original sample rate
+speech3 = torchaudio.functional.resample(speech2, SAMPLE_RATE // 2, SAMPLE_RATE)

######################################################################
#
-griffin_lim = T.GriffinLim(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-)
+# Apply the same spectrogram
+spectrogram = T.Spectrogram(n_fft=512)
+
+spec0 = spectrogram(SPEECH_WAVEFORM)
+spec2 = spectrogram(speech2)
+spec3 = spectrogram(speech3)
+
+# Visualize it
+fig, axs = plt.subplots(3, 1)
+plot_spectrogram(spec0[0], ylabel="Original", ax=axs[0])
+axs[0].add_patch(Rectangle((0, 3), 212, 128, edgecolor="r", facecolor="none"))
+plot_spectrogram(spec2[0], ylabel="Downsampled", ax=axs[1])
+plot_spectrogram(spec3[0], ylabel="Upsampled", ax=axs[2])
+fig.tight_layout()

+######################################################################
+#
+# In the above visualization, the second plot ("Downsampled") might
+# give the impression that the spectrogram is stretched.
+# This is because the meaning of the frequency bins is different from
+# the original one.
+# Even though they have the same number of bins, in the second plot
+# the frequency axis only covers up to half of the original sampling
+# rate.
+# This becomes clearer if we resample the downsampled signal again
+# so that it has the same sample rate as the original.

+######################################################################
+# GriffinLim
+# ----------
+#
+# To recover a waveform from a spectrogram, you can use
+# :py:class:`torchaudio.transforms.GriffinLim`.
+#
+# The same set of parameters used for spectrogram must be used.
+
+# Define transforms
+n_fft = 1024
+spectrogram = T.Spectrogram(n_fft=n_fft)
+griffin_lim = T.GriffinLim(n_fft=n_fft)
+
+# Apply the transforms
+spec = spectrogram(SPEECH_WAVEFORM)
+reconstructed_waveform = griffin_lim(spec)

######################################################################
#
-plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed")
+_, axes = plt.subplots(2, 1, sharex=True, sharey=True)
+plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original", ax=axes[0])
+plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed", ax=axes[1])
Audio(reconstructed_waveform, rate=SAMPLE_RATE)

######################################################################
...
@@ -253,7 +334,6 @@ mel_spectrogram = T.MelSpectrogram(
     pad_mode="reflect",
     power=2.0,
     norm="slaney",
-    onesided=True,
     n_mels=n_mels,
     mel_scale="htk",
)
...
@@ -322,7 +402,7 @@ mfcc = mfcc_transform(SPEECH_WAVEFORM)
######################################################################
#
-plot_spectrogram(mfcc[0])
+plot_spectrogram(mfcc[0], title="MFCC")

######################################################################
# Comparison against librosa
...
@@ -350,7 +430,7 @@ mfcc_librosa = librosa.feature.mfcc(
######################################################################
#
-plot_spectrogram(mfcc_librosa)
+plot_spectrogram(mfcc_librosa, title="MFCC (librosa)")

mse = torch.square(mfcc - mfcc_librosa).mean().item()
print("Mean Square Difference: ", mse)
...
@@ -376,7 +456,7 @@ lfcc_transform = T.LFCC(
)

lfcc = lfcc_transform(SPEECH_WAVEFORM)
-plot_spectrogram(lfcc[0])
+plot_spectrogram(lfcc[0], title="LFCC")

######################################################################
# Pitch
...
@@ -388,6 +468,7 @@ pitch = F.detect_pitch_frequency(SPEECH_WAVEFORM, SAMPLE_RATE)
######################################################################
#
+
def plot_pitch(waveform, sr, pitch):
    figure, axis = plt.subplots(1, 1)
    axis.set_title("Pitch Feature")
...
@@ -402,58 +483,6 @@ def plot_pitch(waveform, sr, pitch):
    axis2.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
    axis2.legend(loc=0)
-    plt.show(block=False)

plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch)

-######################################################################
-# Kaldi Pitch (beta)
-# ------------------
-#
-# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic
-# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``,
-# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`.
-#
-# 1. A pitch extraction algorithm tuned for automatic speech recognition
-#
-#    Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S.
-#    Khudanpur
-#
-#    2014 IEEE International Conference on Acoustics, Speech and Signal
-#    Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
-#    10.1109/ICASSP.2014.6854049.
-#    [`abstract <https://ieeexplore.ieee.org/document/6854049>`__],
-#    [`paper <https://danielpovey.com/files/2014_icassp_pitch.pdf>`__]
-#
-pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE)
-pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1]
-
-######################################################################
-#
-def plot_kaldi_pitch(waveform, sr, pitch, nfcc):
-    _, axis = plt.subplots(1, 1)
-    axis.set_title("Kaldi Pitch Feature")
-    axis.grid(True)
-
-    end_time = waveform.shape[1] / sr
-    time_axis = torch.linspace(0, end_time, waveform.shape[1])
-    axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3)
-
-    time_axis = torch.linspace(0, end_time, pitch.shape[1])
-    ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
-    axis.set_ylim((-1.3, 1.3))
-
-    axis2 = axis.twinx()
-    time_axis = torch.linspace(0, end_time, nfcc.shape[1])
-    ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--")
-
-    lns = ln1 + ln2
-    labels = [l.get_label() for l in lns]
-    axis.legend(lns, labels, loc=0)
-
-    plt.show(block=False)
-
-plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc)
examples/tutorials/audio_io_tutorial.py
View file @ ffeba11a
...
@@ -5,8 +5,15 @@ Audio I/O
**Author**: `Moto Hira <moto@meta.com>`__

-This tutorial shows how to use TorchAudio's basic I/O API to load audio files
-into PyTorch's Tensor object, and save Tensor objects to audio files.
+This tutorial shows how to use TorchAudio's basic I/O API to inspect audio data,
+load them into PyTorch Tensors and save PyTorch Tensors.
+
+.. warning::
+
+   There are multiple changes planned/made to audio I/O in recent releases.
+   For the detail of these changes please refer to
+   :ref:`Introduction of Dispatcher <dispatcher_migration>`.
"""

import torch
...
@@ -47,6 +54,16 @@ SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch12753
SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav")

+def _hide_seek(obj):
+    # Expose only ``read`` of the underlying file-like object, so that
+    # torchaudio treats it as an unseekable stream and decodes it
+    # sequentially.
+    class _wrapper:
+        def __init__(self, obj):
+            self.obj = obj
+
+        def read(self, n):
+            return self.obj.read(n)
+
+    return _wrapper(obj)

######################################################################
# Querying audio metadata
...
@@ -113,7 +130,7 @@ print(metadata)
url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav"
with requests.get(url, stream=True) as response:
-    metadata = torchaudio.info(response.raw)
+    metadata = torchaudio.info(_hide_seek(response.raw))
print(metadata)

######################################################################
...
@@ -164,7 +181,6 @@ def plot_waveform(waveform, sample_rate):
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
    figure.suptitle("waveform")
-    plt.show(block=False)

######################################################################
...
@@ -187,7 +203,6 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
    figure.suptitle(title)
-    plt.show(block=False)

######################################################################
...
@@ -215,7 +230,7 @@ Audio(waveform.numpy()[0], rate=sample_rate)
# Load audio data as HTTP request
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
with requests.get(url, stream=True) as response:
-    waveform, sample_rate = torchaudio.load(response.raw)
+    waveform, sample_rate = torchaudio.load(_hide_seek(response.raw))
plot_specgram(waveform, sample_rate, title="HTTP datasource")

######################################################################
...
@@ -237,7 +252,7 @@ bucket = "pytorch-tutorial-assets"
key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
response = client.get_object(Bucket=bucket, Key=key)
-waveform, sample_rate = torchaudio.load(response["Body"])
+waveform, sample_rate = torchaudio.load(_hide_seek(response["Body"]))
plot_specgram(waveform, sample_rate, title="From S3")
...
@@ -271,13 +286,15 @@ frame_offset, num_frames = 16000, 16000  # Fetch and decode the 1 - 2 seconds
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"

print("Fetching all the data...")
with requests.get(url, stream=True) as response:
-    waveform1, sample_rate1 = torchaudio.load(response.raw)
+    waveform1, sample_rate1 = torchaudio.load(_hide_seek(response.raw))
    waveform1 = waveform1[:, frame_offset : frame_offset + num_frames]
    print(f" - Fetched {response.raw.tell()} bytes")

print("Fetching until the requested frames are available...")
with requests.get(url, stream=True) as response:
-    waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames)
+    waveform2, sample_rate2 = torchaudio.load(
+        _hide_seek(response.raw), frame_offset=frame_offset, num_frames=num_frames
+    )
    print(f" - Fetched {response.raw.tell()} bytes")

print("Checking the resulting waveform ... ", end="")
...
@@ -316,6 +333,7 @@ waveform, sample_rate = torchaudio.load(SAMPLE_WAV)
######################################################################
#
+
def inspect_file(path):
    print("-" * 10)
    print("Source:", path)
...
@@ -324,6 +342,7 @@ def inspect_file(path):
    print(f" - {torchaudio.info(path)}")
    print()
+
######################################################################
#
# Save without any encoding option.
...
@@ -351,11 +370,11 @@ with tempfile.TemporaryDirectory() as tempdir:
formats = [
    "flac",
-    "vorbis",
-    "sph",
-    "amb",
-    "amr-nb",
-    "gsm",
+    # "vorbis",
+    # "sph",
+    # "amb",
+    # "amr-nb",
+    # "gsm",
]

######################################################################
...
examples/tutorials/audio_resampling_tutorial.py
View file @ ffeba11a
...
@@ -27,14 +27,14 @@ import math
import timeit

import librosa
-import resampy
-import matplotlib.pyplot as plt
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
import pandas as pd
-from IPython.display import Audio, display
+import resampy
+from IPython.display import Audio

-pd.set_option('display.max_rows', None)
-pd.set_option('display.max_columns', None)
+pd.set_option("display.max_rows", None)
+pd.set_option("display.max_columns", None)

DEFAULT_OFFSET = 201
...
@@ -105,7 +105,6 @@ def plot_sweep(
    axis.yaxis.grid(True, alpha=0.67)
    figure.suptitle(f"{title} (sample rate: {sample_rate} Hz)")
    plt.colorbar(cax)
-    plt.show(block=True)

######################################################################
...
@@ -240,13 +239,13 @@ plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.8")
sample_rate = 48000
resample_rate = 32000

-resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interpolation")
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interp_hann")
plot_sweep(resampled_waveform, resample_rate, title="Hann Window Default")

######################################################################
#
-resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="kaiser_window")
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interp_kaiser")
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Default")
...
@@ -271,7 +270,7 @@ resampled_waveform = F.resample(
    resample_rate,
    lowpass_filter_width=64,
    rolloff=0.9475937167399596,
-    resampling_method="kaiser_window",
+    resampling_method="sinc_interp_kaiser",
    beta=14.769656459379492,
)
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Best (torchaudio)")
...
@@ -300,7 +299,7 @@ resampled_waveform = F.resample(
    resample_rate,
    lowpass_filter_width=16,
    rolloff=0.85,
-    resampling_method="kaiser_window",
+    resampling_method="sinc_interp_kaiser",
    beta=8.555504641634386,
)
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)")
...
@@ -325,7 +324,7 @@ print("torchaudio and librosa kaiser fast MSE:", mse)
#
# Below are benchmarks for downsampling and upsampling waveforms between
# two pairs of sampling rates. We demonstrate the performance implications
-# that the ``lowpass_filter_wdith``, window type, and sample rates can
+# that the ``lowpass_filter_width``, window type, and sample rates can
# have. Additionally, we provide a comparison against ``librosa``\ ’s
# ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters
# in ``torchaudio``.
...
@@ -338,18 +337,20 @@ print(f"resampy: {resampy.__version__}")
######################################################################
#
def benchmark_resample_functional(
    waveform,
    sample_rate,
    resample_rate,
    lowpass_filter_width=6,
    rolloff=0.99,
-    resampling_method="sinc_interpolation",
+    resampling_method="sinc_interp_hann",
    beta=None,
    iters=5,
):
-    return timeit.timeit(
-        stmt='''
+    return (
+        timeit.timeit(
+            stmt="""
torchaudio.functional.resample(
    waveform,
    sample_rate,
...
@@ -359,29 +360,34 @@ torchaudio.functional.resample(
    resampling_method=resampling_method,
    beta=beta,
)
-''',
-        setup='import torchaudio',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            setup="import torchaudio",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

######################################################################
#
def benchmark_resample_transforms(
    waveform,
    sample_rate,
    resample_rate,
    lowpass_filter_width=6,
    rolloff=0.99,
-    resampling_method="sinc_interpolation",
+    resampling_method="sinc_interp_hann",
    beta=None,
    iters=5,
):
-    return timeit.timeit(
-        stmt='resampler(waveform)',
-        setup='''
+    return (
+        timeit.timeit(
+            stmt="resampler(waveform)",
+            setup="""
import torchaudio

resampler = torchaudio.transforms.Resample(
...
@@ -394,15 +400,19 @@ resampler = torchaudio.transforms.Resample(
    beta=beta,
)
resampler.to(waveform.device)
-''',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

######################################################################
#
def benchmark_resample_librosa(
    waveform,
    sample_rate,
...
@@ -411,24 +421,29 @@ def benchmark_resample_librosa(
    iters=5,
):
    waveform_np = waveform.squeeze().numpy()
-    return timeit.timeit(
-        stmt='''
+    return (
+        timeit.timeit(
+            stmt="""
librosa.resample(
    waveform_np,
    orig_sr=sample_rate,
    target_sr=resample_rate,
    res_type=res_type,
)
-''',
-        setup='import librosa',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            setup="import librosa",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

######################################################################
#
def benchmark(sample_rate, resample_rate):
    times, rows = [], []
    waveform = get_sine_sweep(sample_rate).to(torch.float32)
...
@@ -451,7 +466,7 @@ def benchmark(sample_rate, resample_rate):
    kwargs = {
        "lowpass_filter_width": 64,
        "rolloff": 0.9475937167399596,
-        "resampling_method": "kaiser_window",
+        "resampling_method": "sinc_interp_kaiser",
        "beta": 14.769656459379492,
    }
    lib_time = benchmark_resample_librosa(*args, res_type="kaiser_best")
...
@@ -464,7 +479,7 @@ def benchmark(sample_rate, resample_rate):
    kwargs = {
        "lowpass_filter_width": 16,
        "rolloff": 0.85,
-        "resampling_method": "kaiser_window",
+        "resampling_method": "sinc_interp_kaiser",
        "beta": 8.555504641634386,
    }
    lib_time = benchmark_resample_librosa(*args, res_type="kaiser_fast")
...
@@ -483,7 +498,7 @@ def plot(df):
    print(df.round(2))
    ax = df.plot(kind="bar")
    plt.ylabel("Time Elapsed [ms]")
    plt.xticks(rotation=0, fontsize=10)
    for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS):
        label = ["N/A" if v != v else str(v) for v in df[col].round(2)]
        ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small")
...
@@ -531,8 +546,8 @@ plot(df)
#  - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
#    and therefore increases computation time for both the kernel computation
#    and convolution
-#  - using ``kaiser_window`` results in longer computation times than the default
-#    ``sinc_interpolation`` because it is more complex to compute the intermediate
+#  - using ``sinc_interp_kaiser`` results in longer computation times than the default
+#    ``sinc_interp_hann`` because it is more complex to compute the intermediate
#    window values
#  - a large GCD between the sample and resample rate will result
#    in a simplification that allows for a smaller kernel and faster kernel computation.
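To put concrete numbers on the GCD point above (a sketch, not part of the commit): the resampling kernel is built on the two rates reduced by their greatest common divisor, so rate pairs with a large GCD reduce to a small ratio and a small kernel.

import math

for orig_freq, new_freq in [(48000, 32000), (48000, 44100)]:
    g = math.gcd(orig_freq, new_freq)
    print(f"{orig_freq} -> {new_freq}: gcd={g}, reduced ratio {orig_freq // g}:{new_freq // g}")

# 48000 -> 32000: gcd=16000, reduced ratio 3:2
# 48000 -> 44100: gcd=300,   reduced ratio 160:147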
...
examples/tutorials/ctc_forced_alignment_api_tutorial.py 0 → 100644
View file @ ffeba11a
"""
CTC forced alignment API tutorial
=================================
**Author**: `Xiaohui Zhang <xiaohuizhang@meta.com>`__, `Moto Hira <moto@meta.com>`__
The forced alignment is a process to align transcript with speech.
This tutorial shows how to align transcripts to speech using
:py:func:`torchaudio.functional.forced_align` which was developed along the work of
`Scaling Speech Technology to 1,000+ Languages
<https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/>`__.
:py:func:`~torchaudio.functional.forced_align` has custom CPU and CUDA
implementations which are more performant than the vanilla Python
implementation above, and are more accurate.
It can also handle missing transcript with special ``<star>`` token.
There is also a high-level API, :py:class:`torchaudio.pipelines.Wav2Vec2FABundle`,
which wraps the pre/post-processing explained in this tutorial and makes it easy
to run forced-alignments.
`Forced alignment for multilingual data
<./forced_alignment_for_multilingual_data_tutorial.html>`__ uses this API to
illustrate how to align non-English transcripts.
"""
######################################################################
# Preparation
# -----------
import
torch
import
torchaudio
print
(
torch
.
__version__
)
print
(
torchaudio
.
__version__
)
######################################################################
#
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
print
(
device
)
######################################################################
#
import
IPython
import
matplotlib.pyplot
as
plt
import
torchaudio.functional
as
F
######################################################################
# First we prepare the speech data and the transcript we area going
# to use.
#
SPEECH_FILE
=
torchaudio
.
utils
.
download_asset
(
"tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
)
waveform
,
_
=
torchaudio
.
load
(
SPEECH_FILE
)
TRANSCRIPT
=
"i had that curiosity beside me at this moment"
.
split
()
######################################################################
# Generating emissions
# ~~~~~~~~~~~~~~~~~~~~
#
# :py:func:`~torchaudio.functional.forced_align` takes emission and
# token sequences and outputs timestaps of the tokens and their scores.
#
# Emission reperesents the frame-wise probability distribution over
# tokens, and it can be obtained by passing waveform to an acoustic
# model.
#
# Tokens are numerical expression of transcripts. There are many ways to
# tokenize transcripts, but here, we simply map alphabets into integer,
# which is how labels were constructed when the acoustice model we are
# going to use was trained.
#
# We will use a pre-trained Wav2Vec2 model,
# :py:data:`torchaudio.pipelines.MMS_FA`, to obtain emission and tokenize
# the transcript.
#
bundle
=
torchaudio
.
pipelines
.
MMS_FA
model
=
bundle
.
get_model
(
with_star
=
False
).
to
(
device
)
with
torch
.
inference_mode
():
emission
,
_
=
model
(
waveform
.
to
(
device
))
######################################################################
#
def
plot_emission
(
emission
):
fig
,
ax
=
plt
.
subplots
()
ax
.
imshow
(
emission
.
cpu
().
T
)
ax
.
set_title
(
"Frame-wise class probabilities"
)
ax
.
set_xlabel
(
"Time"
)
ax
.
set_ylabel
(
"Labels"
)
fig
.
tight_layout
()
plot_emission
(
emission
[
0
])
######################################################################
# Tokenize the transcript
# ~~~~~~~~~~~~~~~~~~~~~~~
#
# We create a dictionary, which maps each label into token.
LABELS
=
bundle
.
get_labels
(
star
=
None
)
DICTIONARY
=
bundle
.
get_dict
(
star
=
None
)
for
k
,
v
in
DICTIONARY
.
items
():
print
(
f
"
{
k
}
:
{
v
}
"
)
######################################################################
# converting transcript to tokens is as simple as
tokenized_transcript
=
[
DICTIONARY
[
c
]
for
word
in
TRANSCRIPT
for
c
in
word
]
for
t
in
tokenized_transcript
:
print
(
t
,
end
=
" "
)
print
()
######################################################################
# Computing alignments
# --------------------
#
# Frame-level alignments
# ~~~~~~~~~~~~~~~~~~~~~~
#
# Now we call TorchAudio’s forced alignment API to compute the
# frame-level alignment. For the detail of function signature, please
# refer to :py:func:`~torchaudio.functional.forced_align`.
#
def
align
(
emission
,
tokens
):
targets
=
torch
.
tensor
([
tokens
],
dtype
=
torch
.
int32
,
device
=
device
)
alignments
,
scores
=
F
.
forced_align
(
emission
,
targets
,
blank
=
0
)
alignments
,
scores
=
alignments
[
0
],
scores
[
0
]
# remove batch dimension for simplicity
scores
=
scores
.
exp
()
# convert back to probability
return
alignments
,
scores
aligned_tokens
,
alignment_scores
=
align
(
emission
,
tokenized_transcript
)
######################################################################
# Now let's look at the output.
for
i
,
(
ali
,
score
)
in
enumerate
(
zip
(
aligned_tokens
,
alignment_scores
)):
print
(
f
"
{
i
:
3
d
}
:
\t
{
ali
:
2
d
}
[
{
LABELS
[
ali
]
}
],
{
score
:.
2
f
}
"
)
######################################################################
#
# .. note::
#
# The alignment is expressed in the frame cordinate of the emission,
# which is different from the original waveform.
#
# It contains blank tokens and repeated tokens. The following is the
# interpretation of the non-blank tokens.
#
# .. code-block::
#
# 31: 0 [-], 1.00
# 32: 2 [i], 1.00 "i" starts and ends
# 33: 0 [-], 1.00
# 34: 0 [-], 1.00
# 35: 15 [h], 1.00 "h" starts
# 36: 15 [h], 0.93 "h" ends
# 37: 1 [a], 1.00 "a" starts and ends
# 38: 0 [-], 0.96
# 39: 0 [-], 1.00
# 40: 0 [-], 1.00
# 41: 13 [d], 1.00 "d" starts and ends
# 42: 0 [-], 1.00
#
# .. note::
#
# When same token occured after blank tokens, it is not treated as
# a repeat, but as a new occurrence.
#
# .. code-block::
#
# a a a b -> a b
# a - - b -> a b
# a a - b -> a b
# a - a b -> a a b
# ^^^ ^^^
#
######################################################################
# Token-level alignments
# ~~~~~~~~~~~~~~~~~~~~~~
#
# The next step is to resolve the repetition, so that each alignment does
# not depend on previous alignments.
# :py:func:`torchaudio.functional.merge_tokens` computes the
# :py:class:`~torchaudio.functional.TokenSpan` object, which represents
# which token from the transcript is present at what time span.

######################################################################
#
token_spans = F.merge_tokens(aligned_tokens, alignment_scores)

print("Token\tTime\tScore")
for s in token_spans:
    print(f"{LABELS[s.token]}\t[{s.start:3d}, {s.end:3d})\t{s.score:.2f}")

######################################################################
# Word-level alignments
# ~~~~~~~~~~~~~~~~~~~~~
#
# Now let’s group the token-level alignments into word-level alignments.
def unflatten(list_, lengths):
    assert len(list_) == sum(lengths)
    i = 0
    ret = []
    for l in lengths:
        ret.append(list_[i : i + l])
        i += l
    return ret

word_spans = unflatten(token_spans, [len(word) for word in TRANSCRIPT])

######################################################################
# Audio previews
# ~~~~~~~~~~~~~~
#

# Compute average score weighted by the span length
def _score(spans):
    return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)

def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / num_frames
    x0 = int(ratio * spans[0].start)
    x1 = int(ratio * spans[-1].end)
    print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
    segment = waveform[:, x0:x1]
    return IPython.display.Audio(segment.numpy(), rate=sample_rate)

num_frames = emission.size(1)

######################################################################
# Generate the audio for each segment
print(TRANSCRIPT)
IPython.display.Audio(SPEECH_FILE)

######################################################################
#
preview_word(waveform, word_spans[0], num_frames, TRANSCRIPT[0])

######################################################################
#
preview_word(waveform, word_spans[1], num_frames, TRANSCRIPT[1])

######################################################################
#
preview_word(waveform, word_spans[2], num_frames, TRANSCRIPT[2])

######################################################################
#
preview_word(waveform, word_spans[3], num_frames, TRANSCRIPT[3])

######################################################################
#
preview_word(waveform, word_spans[4], num_frames, TRANSCRIPT[4])

######################################################################
#
preview_word(waveform, word_spans[5], num_frames, TRANSCRIPT[5])

######################################################################
#
preview_word(waveform, word_spans[6], num_frames, TRANSCRIPT[6])

######################################################################
#
preview_word(waveform, word_spans[7], num_frames, TRANSCRIPT[7])

######################################################################
#
preview_word(waveform, word_spans[8], num_frames, TRANSCRIPT[8])

######################################################################
# Visualization
# ~~~~~~~~~~~~~
#
# Now let's look at the alignment result and segment the original
# speech into words.
def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / emission.size(1) / sample_rate

    fig, axes = plt.subplots(2, 1)
    axes[0].imshow(emission[0].detach().cpu().T, aspect="auto")
    axes[0].set_title("Emission")
    axes[0].set_xticks([])

    axes[1].specgram(waveform[0], Fs=sample_rate)
    for t_spans, chars in zip(token_spans, transcript):
        t0, t1 = t_spans[0].start + 0.1, t_spans[-1].end - 0.1
        axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white")
        axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white")
        axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False)

        for span, char in zip(t_spans, chars):
            t0 = span.start * ratio
            axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False)

    axes[1].set_xlabel("time [second]")
    axes[1].set_xlim([0, None])
    fig.tight_layout()

######################################################################
#
plot_alignments(waveform, word_spans, emission, TRANSCRIPT)

######################################################################
#
# Inconsistent treatment of ``blank`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# When splitting the token-level alignments into words, you will
# notice that some blank tokens are treated differently, and this makes
# the interpretation of the result somewhat ambiguous.
#
# This is easy to see when we plot the scores. The following figure
# shows word regions and non-word regions, with the frame-level scores
# of non-blank tokens.
def plot_scores(word_spans, scores):
    fig, ax = plt.subplots()
    span_xs, span_hs = [], []
    ax.axvspan(word_spans[0][0].start - 0.05, word_spans[-1][-1].end + 0.05, facecolor="paleturquoise", edgecolor="none", zorder=-1)
    for t_span in word_spans:
        for span in t_span:
            for t in range(span.start, span.end):
                span_xs.append(t + 0.5)
                span_hs.append(scores[t].item())
            ax.annotate(LABELS[span.token], (span.start, -0.07))
        ax.axvspan(t_span[0].start - 0.05, t_span[-1].end + 0.05, facecolor="mistyrose", edgecolor="none", zorder=-1)
    ax.bar(span_xs, span_hs, color="lightsalmon", edgecolor="coral")
    ax.set_title("Frame-level scores and word segments")
    ax.set_ylim(-0.1, None)
    ax.grid(True, axis="y")
    ax.axhline(0, color="black")
    fig.tight_layout()

plot_scores(word_spans, alignment_scores)

######################################################################
# In this plot, the blank tokens are the highlighted areas without
# vertical bars.
# You can see that there are blank tokens which are interpreted as
# part of a word (highlighted red), while the others (highlighted blue)
# are not.
#
# One reason for this is that the model was trained without a
# label for word boundaries. The blank tokens are treated not just
# as repetition but also as silence between words.
#
# But then, a question arises. Should frames immediately after or
# near the end of a word be silence or a repeat?
#
# In the above example, if you go back to the previous plot of the
# spectrogram and word regions, you see that after "y" in "curiosity",
# there is still some activity in multiple frequency buckets.
#
# Would it be more accurate if that frame was included in the word?
#
# Unfortunately, CTC does not provide a comprehensive solution to this.
# Models trained with CTC are known to exhibit a "peaky" response,
# that is, they tend to spike for an occurrence of a label, but the
# spike does not last for the duration of the label.
# (Note: Pre-trained Wav2Vec2 models tend to spike at the beginning of
# label occurrences, but this is not always the case.)
#
# :cite:`zeyer2021does` has an in-depth analysis of the peaky behavior of
# CTC.
# We encourage those who are interested in understanding it further to refer
# to the paper.
# The following is a quote from the paper, which is the exact issue we
# are facing here.
#
#    *Peaky behavior can be problematic in certain cases,*
#    *e.g. when an application requires to not use the blank label,*
#    *e.g. to get meaningful time accurate alignments of phonemes*
#    *to a transcription.*

######################################################################
# Advanced: Handling transcripts with ``<star>`` token
# ----------------------------------------------------
#
# Now let’s look at when the transcript is partially missing, and how we can
# improve the alignment quality using the ``<star>`` token, which is capable
# of modeling any token.
#
# Here we use the same English example as above, but we remove the
# beginning text ``“i had that curiosity beside me at”`` from the transcript.
# Aligning audio with such a transcript results in wrong alignments of the
# existing word “this”. However, this issue can be mitigated by using the
# ``<star>`` token to model the missing text.
#

######################################################################
# First, we extend the dictionary to include the ``<star>`` token.
DICTIONARY["*"] = len(DICTIONARY)

######################################################################
# Next, we extend the emission tensor with the extra dimension
# corresponding to the ``<star>`` token.
#
star_dim = torch.zeros((1, emission.size(1), 1), device=emission.device, dtype=emission.dtype)
emission = torch.cat((emission, star_dim), 2)

assert len(DICTIONARY) == emission.shape[2]

plot_emission(emission[0])

######################################################################
# The following function combines all the processes, and computes
# word segments from emission in one go.
def compute_alignments(emission, transcript, dictionary):
    tokens = [dictionary[char] for word in transcript for char in word]
    alignment, scores = align(emission, tokens)
    token_spans = F.merge_tokens(alignment, scores)
    word_spans = unflatten(token_spans, [len(word) for word in transcript])
    return word_spans

######################################################################
# Full Transcript
# ~~~~~~~~~~~~~~~
word_spans = compute_alignments(emission, TRANSCRIPT, DICTIONARY)
plot_alignments(waveform, word_spans, emission, TRANSCRIPT)

######################################################################
# Partial Transcript with ``<star>`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Now we replace the first part of the transcript with the ``<star>`` token.
transcript = "* this moment".split()
word_spans = compute_alignments(emission, transcript, DICTIONARY)
plot_alignments(waveform, word_spans, emission, transcript)

######################################################################
#
preview_word(waveform, word_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, word_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, word_spans[2], num_frames, transcript[2])

######################################################################
# Partial Transcript without ``<star>`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# As a comparison, the following aligns the partial transcript
# without using the ``<star>`` token.
# It demonstrates the effect of the ``<star>`` token for dealing with deletion errors.
transcript = "this moment".split()
word_spans = compute_alignments(emission, transcript, DICTIONARY)
plot_alignments(waveform, word_spans, emission, transcript)

######################################################################
# Conclusion
# ----------
#
# In this tutorial, we looked at how to use torchaudio’s forced alignment
# API to align and segment speech files, and demonstrated one advanced usage:
# how introducing a ``<star>`` token can improve alignment accuracy when
# transcription errors exist.
#

######################################################################
# Acknowledgement
# ---------------
#
# Thanks to `Vineel Pratap <vineelkpratap@meta.com>`__ and `Zhaoheng
# Ni <zni@meta.com>`__ for developing and open-sourcing the
# forced aligner API.
examples/tutorials/device_asr.py
View file @ ffeba11a
...
@@ -7,26 +7,23 @@ Device ASR with Emformer RNN-T
This tutorial shows how to use Emformer RNN-T and streaming API
to perform speech recognition on a streaming device input, i.e. microphone
on laptop.

-.. note::
-
-   This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece.
-
-   There are multiple ways to install FFmpeg libraries.
-   If you are using Anaconda Python distribution,
-   ``conda install 'ffmpeg<4.4'`` will install
-   the required FFmpeg libraries.
-
-   You can install SentencePiece by running ``pip install sentencepiece``.
-
-.. note::
-
-   This tutorial was tested on MacBook Pro and Dynabook with Windows 10.
-
-   This tutorial does NOT work on Google Colab because the server running
-   this tutorial does not have a microphone that you can talk to.
"""

+######################################################################
+#
+# .. note::
+#
+#    This tutorial requires FFmpeg libraries.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
+#
+# .. note::
+#
+#    This tutorial was tested on MacBook Pro and Dynabook with Windows 10.
+#
+#    This tutorial does NOT work on Google Colab because the server running
+#    this tutorial does not have a microphone that you can talk to.

######################################################################
# 1. Overview
# -----------
...
examples/tutorials/effector_tutorial.py 0 → 100644
View file @ ffeba11a
"""
AudioEffector Usages
====================
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use :py:class:`torchaudio.io.AudioEffector` to
apply various effects and codecs to waveform tensor.
"""
######################################################################
#
# .. note::
#
# This tutorial requires FFmpeg libraries.
# Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
# the detail.
#
######################################################################
# Overview
# --------
#
# :py:class:`~torchaudio.io.AudioEffector` combines in-memory encoding,
# decoding and filtering that are provided by
# :py:class:`~torchaudio.io.StreamWriter` and
# :py:class:`~torchaudio.io.StreamReader`.
#
# The following figure illustrates the process.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/AudioEffector.png
#
import
torch
import
torchaudio
print
(
torch
.
__version__
)
print
(
torchaudio
.
__version__
)
######################################################################
#
from
torchaudio.io
import
AudioEffector
,
CodecConfig
import
matplotlib.pyplot
as
plt
from
IPython.display
import
Audio
######################################################################
#
for
k
,
v
in
torchaudio
.
utils
.
ffmpeg_utils
.
get_versions
().
items
():
print
(
k
,
v
)
######################################################################
# Usage
# -----
#
# To use ``AudioEffector``, instantiate it with ``effect`` and
# ``format``, then either pass the waveform to
# :py:meth:`~torchaudio.io.AudioEffector.apply` or
# :py:meth:`~torchaudio.io.AudioEffector.stream` method.
#
# .. code:: python
#
# effector = AudioEffector(effect=..., format=...,)
#
# # Apply at once
# applied = effector.apply(waveform, sample_rate)
#
# ``apply`` method applies effect and codec to the entire waveform at
# once. So if the input waveform is long, and memory consumption is an
# issue, one can use ``stream`` method to process chunk by chunk.
#
# .. code:: python
#
# # Apply chunk by chunk
# for applied_chunk = effector.stream(waveform, sample_rate):
# ...
#
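For a concrete picture of the streaming path, here is a minimal sketch (not part of the tutorial; the silence input and the reassembly are illustrative assumptions): the streamed chunks keep the same (time, channel) layout as the input, so they can be concatenated along the frame dimension, though chunking and padding details mean the result may differ slightly in length from a single ``apply`` call.

import torch
from torchaudio.io import AudioEffector

waveform = torch.zeros((16000, 1))  # 1 second of silence, (time, channel)
effector = AudioEffector(effect="atempo=2.0")

# Process chunk by chunk, then reassemble along the frame dimension.
chunks = list(effector.stream(waveform, 16000))
applied = torch.cat(chunks, dim=0)
print(applied.shape)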
######################################################################
# Example
# -------
#
src
=
torchaudio
.
utils
.
download_asset
(
"tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
)
waveform
,
sr
=
torchaudio
.
load
(
src
,
channels_first
=
False
)
######################################################################
# Gallery
# -------
#
def
show
(
effect
,
*
,
stereo
=
False
):
wf
=
torch
.
cat
([
waveform
]
*
2
,
dim
=
1
)
if
stereo
else
waveform
figsize
=
(
6.4
,
2.1
if
stereo
else
1.2
)
effector
=
AudioEffector
(
effect
=
effect
,
pad_end
=
False
)
result
=
effector
.
apply
(
wf
,
int
(
sr
))
num_channels
=
result
.
size
(
1
)
f
,
ax
=
plt
.
subplots
(
num_channels
,
1
,
squeeze
=
False
,
figsize
=
figsize
,
sharex
=
True
)
for
i
in
range
(
num_channels
):
ax
[
i
][
0
].
specgram
(
result
[:,
i
],
Fs
=
sr
)
f
.
set_tight_layout
(
True
)
return
Audio
(
result
.
numpy
().
T
,
rate
=
sr
)
######################################################################
# Original
# --------
#
show
(
effect
=
None
)
######################################################################
# Effects
# -------
#
######################################################################
# tempo
# ~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#atempo
show
(
"atempo=0.7"
)
######################################################################
#
show
(
"atempo=1.8"
)
######################################################################
# highpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#highpass
show
(
"highpass=frequency=1500"
)
######################################################################
# lowpass
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#lowpass
show
(
"lowpass=frequency=1000"
)
######################################################################
# allpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#allpass
show
(
"allpass"
)
######################################################################
# bandpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#bandpass
show
(
"bandpass=frequency=3000"
)
######################################################################
# bandreject
# ~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#bandreject
show
(
"bandreject=frequency=3000"
)
######################################################################
# echo
# ~~~~
# https://ffmpeg.org/ffmpeg-filters.html#aecho
show
(
"aecho=in_gain=0.8:out_gain=0.88:delays=6:decays=0.4"
)
######################################################################
#
show
(
"aecho=in_gain=0.8:out_gain=0.88:delays=60:decays=0.4"
)
######################################################################
#
show
(
"aecho=in_gain=0.8:out_gain=0.9:delays=1000:decays=0.3"
)
######################################################################
# chorus
# ~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#chorus
show
(
"chorus=0.5:0.9:50|60|40:0.4|0.32|0.3:0.25|0.4|0.3:2|2.3|1.3"
)
######################################################################
# fft filter
# ~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#afftfilt
# fmt: off
show
(
"afftfilt="
"real='re * (1-clip(b * (b/nb), 0, 1))':"
"imag='im * (1-clip(b * (b/nb), 0, 1))'"
)
######################################################################
#
show
(
"afftfilt="
"real='hypot(re,im) * sin(0)':"
"imag='hypot(re,im) * cos(0)':"
"win_size=512:"
"overlap=0.75"
)
######################################################################
#
show
(
"afftfilt="
"real='hypot(re,im) * cos(2 * 3.14 * (random(0) * 2-1))':"
"imag='hypot(re,im) * sin(2 * 3.14 * (random(1) * 2-1))':"
"win_size=128:"
"overlap=0.8"
)
# fmt: on
######################################################################
# vibrato
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#vibrato
show
(
"vibrato=f=10:d=0.8"
)
######################################################################
# tremolo
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#tremolo
show
(
"tremolo=f=8:d=0.8"
)
######################################################################
# crystalizer
# ~~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#crystalizer
show
(
"crystalizer"
)
######################################################################
# flanger
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#flanger
show
(
"flanger"
)
######################################################################
# phaser
# ~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#aphaser
show
(
"aphaser"
)
######################################################################
# pulsator
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#apulsator
show
(
"apulsator"
,
stereo
=
True
)
######################################################################
# haas
# ~~~~
# https://ffmpeg.org/ffmpeg-filters.html#haas
show
(
"haas"
)
######################################################################
# Codecs
# ------
#
def show_multi(configs):
    # Apply each codec configuration to the waveform and collect the results.
    results = []
    for config in configs:
        effector = AudioEffector(**config)
        results.append(effector.apply(waveform, int(sr)))

    # Plot the spectrogram of each encoded/decoded result for comparison.
    num_configs = len(configs)
    figsize = (6.4, 0.3 + num_configs * 0.9)
    f, axes = plt.subplots(num_configs, 1, figsize=figsize, sharex=True)
    for result, ax in zip(results, axes):
        ax.specgram(result[:, 0], Fs=sr)
    f.set_tight_layout(True)

    return [Audio(r.numpy().T, rate=sr) for r in results]
######################################################################
# ogg
# ~~~
#
results = show_multi(
    [
        {"format": "ogg"},
        {"format": "ogg", "encoder": "vorbis"},
        {"format": "ogg", "encoder": "opus"},
    ]
)
######################################################################
# ogg - default encoder (flac)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
results[0]
######################################################################
# ogg - vorbis
# ^^^^^^^^^^^^
#
results[1]
######################################################################
# ogg - opus
# ^^^^^^^^^^
#
results[2]
######################################################################
# mp3
# ~~~
# https://trac.ffmpeg.org/wiki/Encode/MP3
results = show_multi(
    [
        {"format": "mp3"},
        {"format": "mp3", "codec_config": CodecConfig(compression_level=1)},
        {"format": "mp3", "codec_config": CodecConfig(compression_level=9)},
        {"format": "mp3", "codec_config": CodecConfig(bit_rate=192_000)},
        {"format": "mp3", "codec_config": CodecConfig(bit_rate=8_000)},
        {"format": "mp3", "codec_config": CodecConfig(qscale=9)},
        {"format": "mp3", "codec_config": CodecConfig(qscale=1)},
    ]
)
######################################################################
# default
# ^^^^^^^
results[0]
######################################################################
# compression_level=1
# ^^^^^^^^^^^^^^^^^^^
results[1]
######################################################################
# compression_level=9
# ^^^^^^^^^^^^^^^^^^^
results[2]
######################################################################
# bit_rate=192k
# ^^^^^^^^^^^^^
results[3]
######################################################################
# bit_rate=8k
# ^^^^^^^^^^^^^
results[4]
######################################################################
# qscale=9
# ^^^^^^^^
results[5]
######################################################################
# qscale=1
# ^^^^^^^^
results[6]
######################################################################
#
# Tag: :obj:`torchaudio.io`
examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
0 → 100644
"""
Forced alignment for multilingual data
======================================
**Authors**: `Xiaohui Zhang <xiaohuizhang@meta.com>`__, `Moto Hira <moto@meta.com>`__.
This tutorial shows how to align transcripts to speech for non-English languages.
The process of aligning a non-English (normalized) transcript is identical to
that for an English (normalized) transcript, and the English process is covered
in detail in the `CTC forced alignment tutorial <./ctc_forced_alignment_api_tutorial.html>`__.
In this tutorial, we use TorchAudio's high-level API,
:py:class:`torchaudio.pipelines.Wav2Vec2FABundle`, which packages the pre-trained
model, tokenizer and aligner, to perform the forced alignment with less code.
"""
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
######################################################################
#
from typing import List

import IPython
import matplotlib.pyplot as plt
######################################################################
# Creating the pipeline
# ---------------------
#
# First, we instantiate the model and pre/post-processing pipelines.
#
# The following diagram illustrates the process of alignment.
#
# .. image:: https://download.pytorch.org/torchaudio/doc-assets/pipelines-wav2vec2fabundle.png
#
# The waveform is passed to an acoustic model, which produces a sequence of
# probability distributions over tokens.
# The transcript is passed to a tokenizer, which converts it into a sequence
# of tokens.
# The aligner takes the results from the acoustic model and the tokenizer and
# generates timestamps for each token.
#
# .. note::
#
#    This process expects that the input transcript is already normalized.
#    The process of normalization, which involves romanization of non-English
#    languages, is language-dependent, so it is not covered in this tutorial,
#    but we will briefly look into it.
#
# The acoustic model and the tokenizer must use the same set of tokens.
# To facilitate the creation of matching processors,
# :py:class:`~torchaudio.pipelines.Wav2Vec2FABundle` associates a
# pre-trained acoustic model and a tokenizer.
# :py:data:`torchaudio.pipelines.MMS_FA` is one such instance.
#
# The following code instantiates a pre-trained acoustic model, a tokenizer
# which uses the same set of tokens as the model, and an aligner.
#
from torchaudio.pipelines import MMS_FA as bundle

model = bundle.get_model()
model.to(device)

tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()
######################################################################
# .. note::
#
# The model instantiated by :py:data:`~torchaudio.pipelines.MMS_FA`'s
# :py:meth:`~torchaudio.pipelines.Wav2Vec2FABundle.get_model`
# method by default includes the feature dimension for ``<star>`` token.
# You can disable this by passing ``with_star=False``.
#
######################################################################
# The acoustic model of :py:data:`~torchaudio.pipelines.MMS_FA` was
# created and open-sourced as part of the research project,
# `Scaling Speech Technology to 1,000+ Languages
# <https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/>`__.
# It was trained with 23,000 hours of audio from 1100+ languages.
#
# The tokenizer simply maps the normalized characters to integers.
# You can check the mapping as follows:
print(bundle.get_dict())
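######################################################################
# As a quick illustration, we can tokenize a single word and inspect the
# integer IDs it maps to (the word "aber" is taken from the German sample
# used later in this tutorial):

print(tokenizer(["aber"]))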
######################################################################
#
# The aligner internally uses :py:func:`torchaudio.functional.forced_align`
# and :py:func:`torchaudio.functional.merge_tokens` to infer the
# timestamps of the input tokens.
#
# The detail of the underlying mechanism is covered in
# `CTC forced alignment API tutorial <./ctc_forced_alignment_api_tutorial.html>`__,
# so please refer to it.
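######################################################################
# The following is a rough sketch of what the aligner does under the hood,
# shown only for illustration (the linked tutorial is the authoritative
# reference). It assumes ``emission`` has shape
# ``(1, num_frames, num_tokens)`` and ``targets`` is a
# ``(1, num_target_tokens)`` tensor of token IDs.
#
# .. code-block:: python
#
#    import torchaudio.functional as F
#
#    # Frame-level alignment: a token index and score for every frame.
#    aligned_tokens, scores = F.forced_align(emission, targets, blank=0)
#    # Collapse repeats and blanks into per-token spans with start/end frames.
#    token_spans = F.merge_tokens(aligned_tokens[0], scores[0])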
######################################################################
# We define a utility function that performs the forced alignment with
# the above model, the tokenizer and the aligner.
#
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    with torch.inference_mode():
        emission, _ = model(waveform.to(device))
        token_spans = aligner(emission[0], tokenizer(transcript))
    return emission, token_spans
######################################################################
# We also define utility functions for plotting the result and previewing
# the audio segments.
# Compute average score weighted by the span length
def _score(spans):
    return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)
def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate):
    # Conversion factor from emission frames to seconds.
    ratio = waveform.size(1) / emission.size(1) / sample_rate

    fig, axes = plt.subplots(2, 1)
    axes[0].imshow(emission[0].detach().cpu().T, aspect="auto")
    axes[0].set_title("Emission")
    axes[0].set_xticks([])

    axes[1].specgram(waveform[0], Fs=sample_rate)
    for t_spans, chars in zip(token_spans, transcript):
        t0, t1 = t_spans[0].start, t_spans[-1].end
        axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white")
        axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white")
        axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False)

        # Annotate each character at the start of its span.
        for span, char in zip(t_spans, chars):
            t0 = span.start * ratio
            axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False)

    axes[1].set_xlabel("time [second]")
    fig.tight_layout()
######################################################################
#
def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / num_frames
    x0 = int(ratio * spans[0].start)
    x1 = int(ratio * spans[-1].end)
    print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
    segment = waveform[:, x0:x1]
    return IPython.display.Audio(segment.numpy(), rate=sample_rate)
######################################################################
# Normalizing the transcript
# --------------------------
#
# The transcripts passed to the pipeline must be normalized beforehand.
# The exact process of normalization depends on the language.
#
# Languages that do not have explicit word boundaries
# (such as Chinese, Japanese and Korean) require segmentation first.
# There are dedicated tools for this, but here we assume the transcript
# has already been segmented.
#
# The first step of normalization is romanization.
# `uroman <https://github.com/isi-nlp/uroman>`__ is a tool that
# supports many languages.
#
# Here are Bash commands that romanize an input text file and write
# the output to another text file using ``uroman``.
#
# .. code-block:: bash
#
#    $ echo "des événements d'actualité qui se sont produits durant l'année 1882" > text.txt
#    $ uroman/bin/uroman.pl < text.txt > text_romanized.txt
#    $ cat text_romanized.txt
#
# .. code-block:: text
#
#    des evenements d'actualite qui se sont produits durant l'annee 1882
#
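# If you prefer to drive ``uroman`` from Python instead of the shell, a
# minimal sketch (assuming the uroman repository is cloned in the current
# directory) could look like this:
#
# .. code-block:: python
#
#    import subprocess
#
#    # Pipe text.txt through uroman.pl and capture the romanized output.
#    with open("text.txt", "r") as fin, open("text_romanized.txt", "w") as fout:
#        subprocess.run(["perl", "uroman/bin/uroman.pl"], stdin=fin, stdout=fout, check=True)
#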
# The next step is to remove non-alphabetic characters and punctuation.
# The following snippet normalizes the romanized transcript.
#
# .. code-block:: python
#
#    import re
#
#
#    def normalize_uroman(text):
#        text = text.lower()
#        text = text.replace("’", "'")
#        text = re.sub("([^a-z' ])", " ", text)
#        text = re.sub(" +", " ", text)
#        return text.strip()
#
#
#    with open("text_romanized.txt", "r") as f:
#        for line in f:
#            text_normalized = normalize_uroman(line)
#            print(text_normalized)
#
# Running the script on the above example produces the following.
#
# .. code-block:: text
#
#    des evenements d'actualite qui se sont produits durant l'annee
#
# Note that, in this example, since "1882" was not romanized by ``uroman``,
# it was removed in the normalization step.
# To avoid this, one needs to romanize numbers, but this is known to be a non-trivial task.
#
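# One possible workaround, sketched below, is to spell numbers out before
# romanization using the third-party ``num2words`` package (an assumption;
# it is not part of torchaudio, and its output still needs normalization):
#
# .. code-block:: python
#
#    from num2words import num2words
#
#    # Hypothetical pre-processing step: "1882" -> French words.
#    print(num2words(1882, lang="fr"))
#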
######################################################################
# Aligning transcripts to speech
# ------------------------------
#
# Now we perform the forced alignment for multiple languages.
#
#
# German
# ~~~~~~
text_raw = "aber seit ich bei ihnen das brot hole"
text_normalized = "aber seit ich bei ihnen das brot hole"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/10349_8674_000087.flac"
waveform, sample_rate = torchaudio.load(
    url, frame_offset=int(0.5 * bundle.sample_rate), num_frames=int(2.5 * bundle.sample_rate)
)
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
tokens = tokenizer(transcript)

emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)

IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
# Chinese
# ~~~~~~~
#
# Chinese is a character-based language, and its raw written form has no
# explicit word-level tokenization (i.e. words separated by spaces). In order
# to obtain word-level alignments, you need to first tokenize the transcripts
# at the word level using a word tokenizer like the `“Stanford
# Tokenizer” <https://michelleful.github.io/code-blog/2015/09/10/parsing-chinese-with-stanford/>`__,
# as sketched below. However, this is not needed if you only want character-level alignments.
#
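# As an illustration, the following sketch segments a raw transcript with
# the third-party ``jieba`` package (an assumption; any Chinese word
# tokenizer, such as the Stanford Tokenizer above, can be used instead):
#
# .. code-block:: python
#
#    import jieba
#
#    # Hypothetical segmentation step: insert spaces between words.
#    print(" ".join(jieba.cut("关服务高端产品仍处于供不应求的局面")))
#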
text_raw = "关 服务 高端 产品 仍 处于 供不应求 的 局面"
text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian"

######################################################################
#
url = "https://download.pytorch.org/torchaudio/tutorial-assets/mvdr/clean_speech.wav"
waveform, sample_rate = torchaudio.load(url)
waveform = waveform[0:1]
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)

IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])

######################################################################
#
preview_word(waveform, token_spans[8], num_frames, transcript[8])
######################################################################
# Polish
# ~~~~~~
text_raw = "wtedy ujrzałem na jego brzuchu okrągłą czarną ranę"
text_normalized = "wtedy ujrzalem na jego brzuchu okragla czarna rane"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/5090_1447_000088.flac"
waveform, sample_rate = torchaudio.load(url, num_frames=int(4.5 * bundle.sample_rate))
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)

IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
# Portuguese
# ~~~~~~~~~~
text_raw = "na imensa extensão onde se esconde o inconsciente imortal"
text_normalized = "na imensa extensao onde se esconde o inconsciente imortal"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/6566_5323_000027.flac"
waveform, sample_rate = torchaudio.load(
    url, frame_offset=int(bundle.sample_rate), num_frames=int(4.6 * bundle.sample_rate)
)
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)

IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])

######################################################################
#
preview_word(waveform, token_spans[8], num_frames, transcript[8])
######################################################################
# Italian
# ~~~~~~~
text_raw = "elle giacean per terra tutte quante"
text_normalized = "elle giacean per terra tutte quante"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/642_529_000025.flac"
waveform, sample_rate = torchaudio.load(url, num_frames=int(4 * bundle.sample_rate))
######################################################################
#
assert sample_rate == bundle.sample_rate
######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)

IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])
######################################################################
# Conclusion
# ----------
#
# In this tutorial, we looked at how to use torchaudio’s forced alignment
# API and a Wav2Vec2 pre-trained multilingual acoustic model to align
# speech data to transcripts in five languages.
#
######################################################################
# Acknowledgement
# ---------------
#
# Thanks to `Vineel Pratap <vineelkpratap@meta.com>`__ and `Zhaoheng
# Ni <zni@meta.com>`__ for developing and open-sourcing the
# forced aligner API.
#