OpenDAS / Torchaudio / Commits / ffeba11a

Commit ffeba11a authored Sep 02, 2024 by mayp777

UPDATE

Parent: 29deb085
Changes: 337 files
Showing 20 changed files with 2825 additions and 471 deletions (+2825, -471)
Changed files shown in this commit (additions / deletions):

examples/self_supervised_learning/losses/__init__.py                    +6    -0
examples/self_supervised_learning/losses/_hubert_loss.py                +47   -0
examples/self_supervised_learning/losses/_wav2vec2_loss.py              +80   -0
examples/self_supervised_learning/lr_schedulers/__init__.py             +5    -0
examples/self_supervised_learning/lr_schedulers/_linear_decay.py        +27   -0
examples/self_supervised_learning/train_hubert.py                       +316  -0
examples/source_separation/eval.py                                      +2    -2
examples/source_separation/lightning_train.py                           +3    -2
examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py           +75   -18
examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py      +311  -0
examples/tutorials/audio_data_augmentation_tutorial.py                  +125  -163
examples/tutorials/audio_datasets_tutorial.py                           +26   -46
examples/tutorials/audio_feature_augmentation_tutorial.py               +66   -47
examples/tutorials/audio_feature_extractions_tutorial.py                +151  -122
examples/tutorials/audio_io_tutorial.py                                 +33   -14
examples/tutorials/audio_resampling_tutorial.py                         +54   -39
examples/tutorials/ctc_forced_alignment_api_tutorial.py                 +517  -0
examples/tutorials/device_asr.py                                        +15   -18
examples/tutorials/effector_tutorial.py                                 +366  -0
examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py   +600  -0
examples/self_supervised_learning/losses/__init__.py (new file, mode 100644)

from ._hubert_loss import hubert_loss

__all__ = [
    "hubert_loss",
    "wav2vec2_loss",
]
examples/self_supervised_learning/losses/_hubert_loss.py (new file, mode 100644)

from typing import Optional, Tuple

import torch
import torch.nn.functional as F
from torch import Tensor


def hubert_loss(
    logit_m: Optional[Tensor],
    logit_u: Optional[Tensor],
    feature_penalty: Tensor,
    label: Optional[Tensor] = None,
    masked_weight: float = 1.0,
    unmasked_weight: float = 0.0,
    feature_weight: float = 10.0,
    reduction: str = "sum",
) -> Tuple[Tensor, float]:
    """Compute the cross-entropy loss on HuBERT masked and non-masked logits.

    Args:
        logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`.
        logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`.
        feature_penalty (Tensor): The feature mean value for additional penalty loss.
        masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``).
        unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``).
        feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``).
        reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``).

    Returns:
        (Tensor, float)
        Tensor: The desired loss Tensor.
        float: Number of frames used in loss computation.
    """
    num_frame = 0.0
    loss = 0.0
    if logit_m is not None:
        target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device)
        loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction)
        loss += loss_m * masked_weight
        num_frame += logit_m.shape[0]
    if logit_u is not None:
        target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_u.device)
        loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction)
        loss += loss_u * unmasked_weight
        num_frame += logit_u.shape[0]
    loss += feature_penalty * feature_weight * num_frame
    return loss, num_frame
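A minimal usage sketch (not part of the commit): exercising hubert_loss from the file above with random logits. The shapes and the dummy feature penalty below are illustrative assumptions only.

    # Illustrative only: dummy masked/unmasked logits and a dummy feature penalty.
    import torch

    logit_m = torch.randn(120, 504)       # (masked_frames, final_dim)
    logit_u = torch.randn(380, 504)       # (unmasked_frames, final_dim)
    feature_penalty = torch.tensor(0.01)  # mean feature value used as penalty

    loss, num_frame = hubert_loss(logit_m, logit_u, feature_penalty)
    print(loss.item(), num_frame)         # scalar loss, 500 frames in this example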
examples/self_supervised_learning/losses/_wav2vec2_loss.py (new file, mode 100644)

from typing import Tuple

import torch
import torch.nn.functional as F
from torch import Tensor


def compute_contrastive_loss(
    x: Tensor,
    mask_indices: Tensor,
    targets: Tensor,
    neg_is_pos: Tensor,
    reduction: str = "none",
    logit_temp: float = 0.1,
):
    """Computes the contrastive loss used in the Wav2Vec2 loss function.

    Args:
        x (Tensor): Input embeddings of shape `(batch_size, sequence_length, hidden_size)`.
        mask_indices (Tensor): Indices to mask negative samples of shape `(batch_size, sequence_length)`.
        targets (Tensor): Labels indicating positive samples.
            Tensor of shape `(num_negative + 1, batch, sequence_length, hidden_size)`.
        neg_is_pos (Tensor): Boolean tensor indicating whether negative samples should be treated as positives.
            Tensor of shape `(batch, sequence_length)`.
        reduction (str): Reduction type ("sum" or "none").
        logit_temp (float, optional): Temperature scaling factor for logits, defaults to 0.1.

    Returns:
        The computed contrastive loss and sample size.
    """
    x = x[mask_indices].view(x.size(0), -1, x.size(-1)).unsqueeze(0).expand(targets.shape)
    logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).float()
    logits /= logit_temp
    if neg_is_pos.any():
        logits[1:][neg_is_pos] = float("-inf")
    target = logits.new_zeros(logits.size(1) * logits.size(2), dtype=torch.long, device=logits.device)
    logits = logits.transpose(0, 2)
    logits = logits.reshape(-1, logits.size(-1))
    loss = F.cross_entropy(
        logits,
        target,
        reduction=reduction,
    )
    sample_size = target.numel()
    return loss, sample_size


def wav2vec2_loss(
    x: Tensor,
    mask_indices: Tensor,
    positives: Tensor,
    negatives: Tensor,
    reduction: str = "none",
) -> Tuple[Tensor, float]:
    """Compute Wav2Vec2 loss.

    Args:
        x (Tensor): The masked sequences of the Wav2Vec 2.0 model.
            Tensor of shape `(batch_size, sequence_length, hidden_size)`.
        mask_indices (Tensor): The mask indices. Tensor of shape `(batch_size, sequence_length)`.
        positives (Tensor): The positives, prior to negative sampling.
            Tensor of shape `(batch_size, masked_sequence_length, hidden_size)`.
        negatives (Tensor): The negative samples.
            Tensor of shape `(num_negative, batch_size, masked_sequence_length, hidden_size)`.
        reduction (str): Use "sum" as reduction for cross-entropy loss (Default: ``"none"``).

    Returns:
        (Tensor, float)
        Tensor: The desired loss Tensor.
        float: Sample size according to mask_indices.
    """
    assert positives is not None
    assert mask_indices is not None
    assert mask_indices.sum() == positives.shape[0] * positives.shape[1]

    neg_is_pos = (positives == negatives).all(-1)
    positives = positives.unsqueeze(0)
    targets = torch.cat([positives, negatives], dim=0)
    loss, sample_size = compute_contrastive_loss(x, mask_indices, targets, neg_is_pos, reduction)
    return loss, sample_size
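A minimal usage sketch (not part of the commit) for wav2vec2_loss, with random embeddings and the same number of masked frames per sequence, as the assertion in the function requires. All shapes below are illustrative assumptions.

    # Illustrative only: random embeddings with 4 masked frames per sequence.
    import torch

    batch, seq, hidden, num_neg, masked = 2, 10, 16, 5, 4
    x = torch.randn(batch, seq, hidden)
    mask_indices = torch.zeros(batch, seq, dtype=torch.bool)
    mask_indices[:, :masked] = True                 # same number of masked frames per sequence
    positives = torch.randn(batch, masked, hidden)
    negatives = torch.randn(num_neg, batch, masked, hidden)

    loss, sample_size = wav2vec2_loss(x, mask_indices, positives, negatives, reduction="sum")
    print(loss.item(), sample_size)                 # sample_size == batch * masked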
examples/self_supervised_learning/lr_schedulers/__init__.py (new file, mode 100644)

from ._linear_decay import LinearDecayLRScheduler

__all__ = [
    "LinearDecayLRScheduler",
]
examples/self_supervised_learning/lr_schedulers/_linear_decay.py (new file, mode 100644)

import torch
from torch.optim.optimizer import Optimizer


class LinearDecayLRScheduler(torch.optim.lr_scheduler._LRScheduler):
    """Linear learning rate scheduler with warm up."""

    def __init__(
        self,
        optimizer: Optimizer,
        warmup_updates: int,
        max_updates: int,
        last_epoch: int = -1,
        verbose: bool = False,
    ):
        self.warmup_updates = warmup_updates
        self.max_updates = max_updates
        super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose)

    def get_lr(self):
        if self._step_count <= self.warmup_updates:
            return [self._step_count / self.warmup_updates * base_lr for base_lr in self.base_lrs]
        elif self._step_count >= self.max_updates:
            return [0.0 for _ in self.base_lrs]
        else:
            pct_remaining = (self.max_updates - self._step_count) / (self.max_updates - self.warmup_updates)
            return [base_lr * pct_remaining for base_lr in self.base_lrs]
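A minimal usage sketch (not part of the commit) showing the warm-up and decay behavior of LinearDecayLRScheduler on a throwaway optimizer; the tiny step counts are illustrative assumptions.

    # Illustrative only: warm up for 3 steps, decay to zero by step 10.
    import torch

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=1.0)
    scheduler = LinearDecayLRScheduler(optimizer, warmup_updates=3, max_updates=10)

    for step in range(12):
        optimizer.step()
        scheduler.step()
        print(step, scheduler.get_last_lr())   # ramps to 1.0, then decays linearly to 0.0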
examples/self_supervised_learning/train_hubert.py (new file, mode 100644)

import logging
import pathlib
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, RawDescriptionHelpFormatter
from functools import partial
from typing import Dict, Tuple

import torch
import torchaudio.models
from lightning.pytorch import seed_everything, Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

from .data_modules import HuBERTDataModule
from .lightning_modules import SSLPretrainModule
from .losses import hubert_loss
from .lr_schedulers import LinearDecayLRScheduler


class _Formatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
    # To use ArgumentDefaultsHelpFormatter as the formatter_class and
    # RawDescriptionHelpFormatter to add custom formatting to description or epilog.
    # Check: https://stackoverflow.com/a/18462760
    pass


def _compute_accuracy(logits: torch.Tensor):
    with torch.no_grad():
        max = logits.argmax(-1) == 0
        min = logits.argmin(-1) == 0
        both = max & min
        corr = max.long().sum().item() - both.long().sum().item()
        count = max.numel()
    return corr / count


class HuBERTModule(SSLPretrainModule):
    def configure_optimizers(self):
        return (
            [self.optimizer],
            [
                {
                    "scheduler": self.lr_scheduler,
                    "interval": "step",
                },
            ],
        )

    def log_metric(self, batch: Dict, output: Tuple, loss: torch.Tensor, step_type: str):
        logit_m, logit_u, _ = output
        self.log(
            f"{step_type}_loss",
            loss.item(),
            on_step=True,
            on_epoch=True,
        )
        acc_m = _compute_accuracy(logit_m)
        acc_u = _compute_accuracy(logit_u)
        self.log(
            f"{step_type}_acc_m",
            acc_m,
            on_step=True,
            on_epoch=True,
            sync_dist=True,
            prog_bar=step_type == "train",
        )
        self.log(
            f"{step_type}_acc_u",
            acc_u,
            on_step=True,
            on_epoch=True,
            sync_dist=True,
            prog_bar=step_type == "train",
        )


def run_train(args):
    seed_everything(1337)
    checkpoint_dir = args.exp_dir / f"checkpoints_{args.dataset}_{args.model_name}"
    checkpoint = ModelCheckpoint(
        checkpoint_dir,
        monitor="val_loss",
        mode="min",
        save_top_k=5,
        save_weights_only=False,
        verbose=True,
    )
    train_checkpoint = ModelCheckpoint(
        checkpoint_dir,
        monitor="train_loss",
        mode="min",
        save_top_k=5,
        save_weights_only=False,
        verbose=True,
    )
    callbacks = [
        checkpoint,
        train_checkpoint,
    ]
    trainer = Trainer(
        default_root_dir=args.exp_dir,
        max_steps=args.max_updates,
        num_nodes=args.num_nodes,
        devices=args.gpus,
        accelerator="gpu",
        strategy="ddp_find_unused_parameters_true",
        precision=args.precision,
        accumulate_grad_batches=args.accumulate_grad_batches,
        gradient_clip_val=args.clip_norm,
        use_distributed_sampler=False,
        callbacks=callbacks,
        reload_dataloaders_every_n_epochs=1,
    )
    if args.model_name not in ["hubert_pretrain_base", "hubert_pretrain_large", "hubert_pretrain_xlarge"]:
        raise ValueError(
            "Expect model_name to be one of 'hubert_pretrain_base', 'hubert_pretrain_large', 'hubert_pretrain_xlarge'."
            f" Found {args.model_name}."
        )
    model = getattr(torchaudio.models, args.model_name)()
    loss_fn = partial(
        hubert_loss,
        masked_weight=args.masked_weight,
        unmasked_weight=args.unmasked_weight,
        feature_weight=args.feature_weight,
    )
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
        betas=args.betas,
        eps=args.eps,
        weight_decay=args.weight_decay,
    )
    lr_scheduler = LinearDecayLRScheduler(optimizer, args.warmup_updates, args.max_updates)
    lightning_module = HuBERTModule(
        model,
        loss_fn,
        optimizer,
        lr_scheduler,
    )
    data_module = HuBERTDataModule(
        dataset_path=args.dataset_path,
        dataset="librispeech",
        feature_type="mfcc",
        seconds_per_batch=args.seconds_per_batch,
        train_shuffle=True,
        num_workers=10,
    )
    trainer.fit(lightning_module, datamodule=data_module)


def _parse_args():
    parser = ArgumentParser(
        description=__doc__,
        formatter_class=_Formatter,
    )
    parser.add_argument(
        "--dataset-path",
        type=pathlib.Path,
        required=True,
        help="Path to the feature and label directories.",
    )
    parser.add_argument(
        "--resume-checkpoint",
        type=pathlib.Path,
        default=None,
        help="Path to the checkpoint to resume training from. (Default: None)",
    )
    parser.add_argument(
        "--feature-type",
        choices=["mfcc", "hubert"],
        type=str,
        required=True,
    )
    parser.add_argument(
        "--feature-grad-mult",
        default=0.1,
        type=float,
        help="The scaling factor to multiply the feature extractor gradient. (Default: 0.1)",
    )
    parser.add_argument(
        "--num-classes",
        choices=[100, 500],
        type=int,
        required=True,
        help="The ``num_class`` when building the hubert_pretrain_base model.",
    )
    parser.add_argument(
        "--model-name",
        default="hubert_pretrain_base",
        choices=[
            "hubert_pretrain_base",
            "hubert_pretrain_large",
            "hubert_pretrain_xlarge",
        ],
        type=str,
        help="The HuBERT model to train. (Default: 'hubert_pretrain_base')",
    )
    parser.add_argument(
        "--exp-dir",
        default=pathlib.Path("./exp"),
        type=pathlib.Path,
        help="Directory to save checkpoints and logs to. (Default: './exp')",
    )
    parser.add_argument(
        "--dataset",
        default="librispeech",
        choices=["librispeech", "librilight"],
        type=str,
        help="The dataset for training. (Default: 'librispeech')",
    )
    parser.add_argument(
        "--learning-rate",
        default=0.0005,
        type=float,
        help="The peak learning rate. (Default: 0.0005)",
    )
    parser.add_argument(
        "--betas",
        default=(0.9, 0.98),
        type=Tuple,
        help="The coefficients for computing running averages of gradient and its square. (Default: (0.9, 0.98))",
    )
    parser.add_argument(
        "--eps",
        default=1e-6,
        type=float,
        help="Epsilon value in Adam optimizer. (Default: 1e-6)",
    )
    parser.add_argument(
        "--weight-decay",
        default=0.01,
        type=float,
        help="Weight decay (L2 penalty). (Default: 0.01)",
    )
    parser.add_argument(
        "--precision",
        default=16,
        choices=[16, 32, 64, "bf16"],
        help="Precision of model training. (Default: 16)",
    )
    parser.add_argument(
        "--accumulate-grad-batches",
        default=1,
        type=int,
        help="Number of steps for accumulating gradients. (Default: 1)",
    )
    parser.add_argument(
        "--clip-norm",
        default=10.0,
        type=float,
        help="The gradient norm value to clip. (Default: 10.0)",
    )
    parser.add_argument(
        "--num-nodes",
        default=4,
        type=int,
        help="Number of nodes to use for training. (Default: 4)",
    )
    parser.add_argument(
        "--gpus",
        default=8,
        type=int,
        help="Number of GPUs per node to use for training. (Default: 8)",
    )
    parser.add_argument(
        "--warmup-updates",
        default=32000,
        type=int,
        help="Number of steps for warming up the learning rate. (Default: 32000)",
    )
    parser.add_argument(
        "--max-updates",
        default=250000,
        type=int,
        help="Total number of training steps. (Default: 250000)",
    )
    parser.add_argument(
        "--seconds-per-batch",
        default=87.5,
        type=float,
        help="Number of seconds of audio in a mini-batch. (Default: 87.5)",
    )
    parser.add_argument(
        "--masked-weight",
        default=1.0,
        type=float,
        help="The weight for cross-entropy loss of masked frames. (Default: 1.0)",
    )
    parser.add_argument(
        "--unmasked-weight",
        default=0.0,
        type=float,
        help="The weight for cross-entropy loss of unmasked frames. (Default: 0.0)",
    )
    parser.add_argument(
        "--feature-weight",
        default=10.0,
        type=float,
        help="The weight for feature penalty loss. (Default: 10.0)",
    )
    parser.add_argument("--debug", action="store_true", help="Whether to use debug level for logging.")
    return parser.parse_args()


def _init_logger(debug):
    fmt = "%(asctime)s %(message)s" if debug else "%(message)s"
    level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(format=fmt, level=level, datefmt="%Y-%m-%d %H:%M:%S")


def cli_main():
    args = _parse_args()
    _init_logger(args.debug)
    run_train(args)


if __name__ == "__main__":
    cli_main()
examples/source_separation/eval.py

@@ -31,7 +31,7 @@ def _eval(model, data_loader, device):
 def cli_main():
     parser = ArgumentParser()
-    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0-mix", "librimix"])
+    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0mix", "librimix"])
     parser.add_argument(
         "--root-dir",
         type=Path,
...
@@ -79,7 +79,7 @@ def cli_main():
     _, _, eval_loader = _get_dataloader(
         args.dataset,
-        args.data_dir,
+        args.root_dir,
         args.num_speakers,
         args.sample_rate,
         1,  # batch size is set to 1 to avoid masking
...
examples/source_separation/lightning_train.py

@@ -308,7 +308,7 @@ def _get_dataloader(
 def cli_main():
     parser = ArgumentParser()
     parser.add_argument("--batch-size", default=6, type=int)
-    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0-mix", "librimix"])
+    parser.add_argument("--dataset", default="librimix", type=str, choices=["wsj0mix", "librimix"])
     parser.add_argument(
         "--root-dir",
         type=Path,
...
@@ -412,9 +412,10 @@ def cli_main():
     trainer = Trainer(
         default_root_dir=args.exp_dir,
         max_epochs=args.epochs,
-        gpus=args.num_gpu,
         num_nodes=args.num_node,
+        accelerator="gpu",
         strategy="ddp_find_unused_parameters_false",
+        devices=args.num_gpu,
         limit_train_batches=1.0,  # Useful for fast experiment
         gradient_clip_val=5.0,
         callbacks=callbacks,
...
examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py

@@ -207,6 +207,7 @@ from torchaudio.models.decoder import CTCDecoderLM, CTCDecoderLMState
 class CustomLM(CTCDecoderLM):
     """Create a Python wrapper around `language_model` to feed to the decoder."""
+
     def __init__(self, language_model: torch.nn.Module):
         CTCDecoderLM.__init__(self)
         self.language_model = language_model
...
@@ -386,6 +387,47 @@ print(f"WER: {beam_search_wer}")
 # and “shoktd”.
 #
+
+######################################################################
+# Incremental decoding
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# If the input speech is long, one can decode the emission in
+# an incremental manner.
+#
+# You need to first initialize the internal state of the decoder with
+# :py:meth:`~torchaudio.models.decoder.CTCDecoder.decode_begin`.
+
+beam_search_decoder.decode_begin()
+
+######################################################################
+# Then, you can pass emissions to
+# :py:meth:`~torchaudio.models.decoder.CTCDecoder.decode_step`.
+# Here we use the same emission but pass it to the decoder one frame
+# at a time.
+
+for t in range(emission.size(1)):
+    beam_search_decoder.decode_step(emission[0, t : t + 1, :])
+
+######################################################################
+# Finally, finalize the internal state of the decoder, and retrieve the
+# result.
+
+beam_search_decoder.decode_end()
+beam_search_result_inc = beam_search_decoder.get_final_hypothesis()
+
+######################################################################
+# The result of incremental decoding is identical to batch decoding.
+#
+
+beam_search_transcript_inc = " ".join(beam_search_result_inc[0].words).strip()
+beam_search_wer_inc = torchaudio.functional.edit_distance(
+    actual_transcript, beam_search_result_inc[0].words
+) / len(actual_transcript)
+print(f"Transcript: {beam_search_transcript_inc}")
+print(f"WER: {beam_search_wer_inc}")
+
+assert beam_search_result[0][0].words == beam_search_result_inc[0].words
+assert beam_search_result[0][0].score == beam_search_result_inc[0].score
+torch.testing.assert_close(beam_search_result[0][0].timesteps, beam_search_result_inc[0].timesteps)

 ######################################################################
 # Timestep Alignments
...
@@ -406,30 +448,45 @@ print(timesteps, timesteps.shape[0])
 #
-def plot_alignments(waveform, emission, tokens, timesteps):
-    fig, ax = plt.subplots(figsize=(32, 10))
-
-    ax.plot(waveform)
-
-    ratio = waveform.shape[0] / emission.shape[1]
-    word_start = 0
-
-    for i in range(len(tokens)):
-        if i != 0 and tokens[i - 1] == "|":
-            word_start = timesteps[i]
-        if tokens[i] != "|":
-            plt.annotate(tokens[i].upper(), (timesteps[i] * ratio, waveform.max() * 1.02), size=14)
-        elif i != 0:
-            word_end = timesteps[i]
-            ax.axvspan(word_start * ratio, word_end * ratio, alpha=0.1, color="red")
-
-    xticks = ax.get_xticks()
-    plt.xticks(xticks, xticks / bundle.sample_rate)
-    ax.set_xlabel("time (sec)")
-    ax.set_xlim(0, waveform.shape[0])
-
-
-plot_alignments(waveform[0], emission, predicted_tokens, timesteps)
+def plot_alignments(waveform, emission, tokens, timesteps, sample_rate):
+    t = torch.arange(waveform.size(0)) / sample_rate
+    ratio = waveform.size(0) / emission.size(1) / sample_rate
+
+    chars = []
+    words = []
+    word_start = None
+    for token, timestep in zip(tokens, timesteps * ratio):
+        if token == "|":
+            if word_start is not None:
+                words.append((word_start, timestep))
+            word_start = None
+        else:
+            chars.append((token, timestep))
+            if word_start is None:
+                word_start = timestep
+
+    fig, axes = plt.subplots(3, 1)
+
+    def _plot(ax, xlim):
+        ax.plot(t, waveform)
+        for token, timestep in chars:
+            ax.annotate(token.upper(), (timestep, 0.5))
+        for word_start, word_end in words:
+            ax.axvspan(word_start, word_end, alpha=0.1, color="red")
+        ax.set_ylim(-0.6, 0.7)
+        ax.set_yticks([0])
+        ax.grid(True, axis="y")
+        ax.set_xlim(xlim)
+
+    _plot(axes[0], (0.3, 2.5))
+    _plot(axes[1], (2.5, 4.7))
+    _plot(axes[2], (4.7, 6.9))
+    axes[2].set_xlabel("time (sec)")
+    fig.tight_layout()
+
+
+plot_alignments(waveform[0], emission, predicted_tokens, timesteps, bundle.sample_rate)

 ######################################################################
...
examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py (new file, mode 100644)

"""
ASR Inference with CUDA CTC Decoder
====================================

**Author**: `Yuekai Zhang <yuekaiz@nvidia.com>`__

This tutorial shows how to perform speech recognition inference using a
CUDA-based CTC beam search decoder.
We demonstrate this on a pretrained
`Zipformer <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc>`__
model from `Next-gen Kaldi <https://nadirapovey.com/next-gen-kaldi-what-is-it>`__ project.
"""

######################################################################
# Overview
# --------
#
# Beam search decoding works by iteratively expanding text hypotheses (beams)
# with next possible characters, and maintaining only the hypotheses with the
# highest scores at each time step.
#
# The underlying implementation uses CUDA to accelerate the whole decoding process.
# A mathematical formula for the decoder can be
# found in the `paper <https://arxiv.org/pdf/1408.2873.pdf>`__, and
# a more detailed algorithm can be found in this `blog
# <https://distill.pub/2017/ctc/>`__.
#
# Running ASR inference using a CUDA CTC Beam Search decoder
# requires the following components
#
# -  Acoustic Model: model predicting modeling units (BPE in this tutorial) from acoustic features
# -  BPE Model: the byte-pair encoding (BPE) tokenizer file
#

######################################################################
# Acoustic Model and Set Up
# -------------------------
#
# First we import the necessary utilities and fetch the data that we are
# working with
#

import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#

import time
from pathlib import Path

import IPython
import sentencepiece as spm
from torchaudio.models.decoder import cuda_ctc_decoder
from torchaudio.utils import download_asset

######################################################################
#
# We use the pretrained
# `Zipformer <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01>`__
# model that is trained on the `LibriSpeech
# dataset <http://www.openslr.org/12>`__. The model is jointly trained with CTC and Transducer loss functions.
# In this tutorial, we only use the CTC head of the model.


def download_asset_external(url, key):
    path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key)
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        torch.hub.download_url_to_file(url, path)
    return str(path)


url_prefix = "https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01"
model_link = f"{url_prefix}/resolve/main/exp/cpu_jit.pt"
model_path = download_asset_external(model_link, "cuda_ctc_decoder/cpu_jit.pt")

######################################################################
# We will load a sample from the LibriSpeech test-other dataset.
#

speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
waveform, sample_rate = torchaudio.load(speech_file)
assert sample_rate == 16000
IPython.display.Audio(speech_file)

######################################################################
# The transcript corresponding to this audio file is
#
# .. code-block::
#
#    i really was very much afraid of showing him how much shocked i was at some parts of what he said
#

######################################################################
# Files and Data for Decoder
# --------------------------
#
# Next, we load in our tokens from the BPE model, which is the tokenizer for decoding.
#

######################################################################
# Tokens
# ~~~~~~
#
# The tokens are the possible symbols that the acoustic model can predict,
# including the blank symbol in CTC. In this tutorial, it includes 500 BPE tokens.
# It can either be passed in as a
# file, where each line consists of the tokens corresponding to the same
# index, or as a list of tokens, each mapping to a unique index.
#
# .. code-block::
#
#    # tokens
#    <blk>
#    <sos/eos>
#    <unk>
#    S
#    _THE
#    _A
#    T
#    _AND
#    ...
#

bpe_link = f"{url_prefix}/resolve/main/data/lang_bpe_500/bpe.model"
bpe_path = download_asset_external(bpe_link, "cuda_ctc_decoder/bpe.model")

bpe_model = spm.SentencePieceProcessor()
bpe_model.load(bpe_path)
tokens = [bpe_model.id_to_piece(id) for id in range(bpe_model.get_piece_size())]
print(tokens)

######################################################################
# Construct CUDA Decoder
# ----------------------
# In this tutorial, we will construct a CUDA beam search decoder.
# The decoder can be constructed using the factory function
# :py:func:`~torchaudio.models.decoder.cuda_ctc_decoder`.
#

cuda_decoder = cuda_ctc_decoder(tokens, nbest=10, beam_size=10, blank_skip_threshold=0.95)

######################################################################
# Run Inference
# -------------
#
# Now that we have the data, acoustic model, and decoder, we can perform
# inference. The output of the beam search decoder is of type
# :py:class:`~torchaudio.models.decoder.CUCTCHypothesis`, consisting of the
# predicted token IDs, words (symbols corresponding to the token IDs), and hypothesis scores.
# Recall the transcript corresponding to the
# waveform is
#
# .. code-block::
#
#    i really was very much afraid of showing him how much shocked i was at some parts of what he said
#

actual_transcript = "i really was very much afraid of showing him how much shocked i was at some parts of what he said"
actual_transcript = actual_transcript.split()

device = torch.device("cuda", 0)
acoustic_model = torch.jit.load(model_path)
acoustic_model.to(device)
acoustic_model.eval()

waveform = waveform.to(device)

feat = torchaudio.compliance.kaldi.fbank(waveform, num_mel_bins=80, snip_edges=False)
feat = feat.unsqueeze(0)
feat_lens = torch.tensor(feat.size(1), device=device).unsqueeze(0)

encoder_out, encoder_out_lens = acoustic_model.encoder(feat, feat_lens)
nnet_output = acoustic_model.ctc_output(encoder_out)
log_prob = torch.nn.functional.log_softmax(nnet_output, -1)

print(f"The shape of log_prob: {log_prob.shape}, the shape of encoder_out_lens: {encoder_out_lens.shape}")

######################################################################
# The CUDA CTC decoder gives the following result.
#

results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
beam_search_transcript = bpe_model.decode(results[0][0].tokens).lower()
beam_search_wer = torchaudio.functional.edit_distance(actual_transcript, beam_search_transcript.split()) / len(
    actual_transcript
)

print(f"Transcript: {beam_search_transcript}")
print(f"WER: {beam_search_wer}")

######################################################################
# Beam Search Decoder Parameters
# ------------------------------
#
# In this section, we go a little bit more in depth about some different
# parameters and tradeoffs. For the full list of customizable parameters,
# please refer to the
# :py:func:`documentation <torchaudio.models.decoder.cuda_ctc_decoder>`.
#

######################################################################
# Helper Function
# ~~~~~~~~~~~~~~~
#


def print_decoded(cuda_decoder, bpe_model, log_prob, encoder_out_lens, param, param_value):
    start_time = time.monotonic()
    results = cuda_decoder(log_prob, encoder_out_lens.to(torch.int32))
    decode_time = time.monotonic() - start_time
    transcript = bpe_model.decode(results[0][0].tokens).lower()
    score = results[0][0].score
    print(f"{param} {param_value:<3}: {transcript} (score: {score:.2f}; {decode_time:.4f} secs)")


######################################################################
# nbest
# ~~~~~
#
# This parameter indicates the number of best hypotheses to return. For
# instance, by setting ``nbest=10`` when constructing the beam search
# decoder earlier, we can now access the hypotheses with the top 10 scores.
#

for i in range(10):
    transcript = bpe_model.decode(results[0][i].tokens).lower()
    score = results[0][i].score
    print(f"{transcript} (score: {score})")

######################################################################
# beam size
# ~~~~~~~~~
#
# The ``beam_size`` parameter determines the maximum number of best
# hypotheses to hold after each decoding step. Using larger beam sizes
# allows for exploring a larger range of possible hypotheses which can
# produce hypotheses with higher scores, but it does not provide additional gains beyond a certain point.
# We recommend setting ``beam_size=10`` for the CUDA beam search decoder.
#
# In the example below, we see improvement in decoding quality as we
# increase the beam size from 1 to 3, but notice how using a beam size
# of 3 provides the same output as beam size 10.
#

beam_sizes = [1, 2, 3, 10]

for beam_size in beam_sizes:
    beam_search_decoder = cuda_ctc_decoder(
        tokens,
        nbest=1,
        beam_size=beam_size,
        blank_skip_threshold=0.95,
    )
    print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "beam size", beam_size)

######################################################################
# blank skip threshold
# ~~~~~~~~~~~~~~~~~~~~
#
# The ``blank_skip_threshold`` parameter is used to prune frames that have a large blank probability.
# Pruning these frames with a good ``blank_skip_threshold`` can speed up the decoding
# process considerably with no accuracy drop.
# Per the rules of CTC, at least one blank frame is kept between two non-blank frames
# to avoid mistakenly merging two consecutive identical symbols.
# We recommend setting ``blank_skip_threshold=0.95`` for the CUDA beam search decoder.
#

blank_skip_probs = [0.25, 0.95, 1.0]

for blank_skip_prob in blank_skip_probs:
    beam_search_decoder = cuda_ctc_decoder(
        tokens,
        nbest=10,
        beam_size=10,
        blank_skip_threshold=blank_skip_prob,
    )
    print_decoded(beam_search_decoder, bpe_model, log_prob, encoder_out_lens, "blank_skip_threshold", blank_skip_prob)

del cuda_decoder

######################################################################
# Benchmark with flashlight CPU decoder
# -------------------------------------
# We benchmark the throughput and accuracy of the CUDA decoder against the CPU decoder using the LibriSpeech test_other set.
# To reproduce the benchmark results below, you may refer `here <https://github.com/pytorch/audio/tree/main/examples/asr/librispeech_cuda_ctc_decoder>`__.
#
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | Decoder      | Setting                                  | WER (%) | N-Best Oracle WER (%) | Decoder Cost Time (seconds) |
# +==============+==========================================+=========+=======================+=============================+
# | CUDA decoder | blank_skip_threshold 0.95                | 5.81    | 4.11                  | 2.57                        |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CUDA decoder | blank_skip_threshold 1.0 (no frame-skip) | 5.81    | 4.09                  | 6.24                        |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder  | beam_size_token 10                       | 5.86    | 4.30                  | 28.61                       |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
# | CPU decoder  | beam_size_token 500                      | 5.86    | 4.30                  | 791.80                      |
# +--------------+------------------------------------------+---------+-----------------------+-----------------------------+
#
# From the above table, the CUDA decoder gives a slight improvement in WER and a significant increase in throughput.
examples/tutorials/audio_data_augmentation_tutorial.py
View file @
ffeba11a
...
@@ -20,6 +20,8 @@ import torchaudio.functional as F
...
@@ -20,6 +20,8 @@ import torchaudio.functional as F
print
(
torch
.
__version__
)
print
(
torch
.
__version__
)
print
(
torchaudio
.
__version__
)
print
(
torchaudio
.
__version__
)
import
matplotlib.pyplot
as
plt
######################################################################
######################################################################
# Preparation
# Preparation
# -----------
# -----------
...
@@ -27,10 +29,7 @@ print(torchaudio.__version__)
...
@@ -27,10 +29,7 @@ print(torchaudio.__version__)
# First, we import the modules and download the audio assets we use in this tutorial.
# First, we import the modules and download the audio assets we use in this tutorial.
#
#
import
math
from
IPython.display
import
Audio
from
IPython.display
import
Audio
import
matplotlib.pyplot
as
plt
from
torchaudio.utils
import
download_asset
from
torchaudio.utils
import
download_asset
...
@@ -44,56 +43,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st
...
@@ -44,56 +43,38 @@ SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-st
# Applying effects and filtering
# Applying effects and filtering
# ------------------------------
# ------------------------------
#
#
# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to
# :py:class:`torchaudio.io.AudioEffector` allows for directly applying
# those available in ``sox`` to Tensor objects and file object audio sources.
# filters and codecs to Tensor objects, in a similar way as ``ffmpeg``
#
# command
# There are two functions for this:
#
# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects
# to Tensor.
# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to
# other audio sources.
#
#
# Both functions accept effect definitions in the form
# `AudioEffector Usages <./effector_tutorial.html>` explains how to use
# ``List[List[str]]``.
# this class, so for the detail, please refer to the tutorial.
# This is mostly consistent with how ``sox`` command works, but one caveat is
# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s
# implementation does not.
#
# For the list of available effects, please refer to `the sox
# documentation <http://sox.sourceforge.net/sox.html>`__.
#
# **Tip** If you need to load and resample your audio data on the fly,
# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file`
# with effect ``"rate"``.
#
# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a
# file-like object or path-like object.
# Similar to :py:func:`torchaudio.load`, when the audio format cannot be
# inferred from either the file extension or header, you can provide
# argument ``format`` to specify the format of the audio source.
#
# **Note** This process is not differentiable.
#
#
# Load the data
# Load the data
waveform1
,
sample_rate
1
=
torchaudio
.
load
(
SAMPLE_WAV
)
waveform1
,
sample_rate
=
torchaudio
.
load
(
SAMPLE_WAV
,
channels_first
=
False
)
# Define effects
# Define effects
effects
=
[
effect
=
","
.
join
(
[
"lowpass"
,
"-1"
,
"300"
],
# apply single-pole lowpass filter
[
[
"speed"
,
"0.8"
],
# reduce the speed
"lowpass=frequency=300:poles=1"
,
# apply single-pole lowpass filter
# This only changes sample rate, so it is necessary to
"atempo=0.8"
,
# reduce the speed
# add `rate` effect with original sample rate after this.
"aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3"
[
"rate"
,
f
"
{
sample_rate1
}
"
],
# Applying echo gives some dramatic feeling
[
"reverb"
,
"-w"
],
# Reverbration gives some dramatic feeling
],
]
)
# Apply effects
# Apply effects
waveform2
,
sample_rate2
=
torchaudio
.
sox_effects
.
apply_effects_tensor
(
waveform1
,
sample_rate1
,
effects
)
def
apply_effect
(
waveform
,
sample_rate
,
effect
):
effector
=
torchaudio
.
io
.
AudioEffector
(
effect
=
effect
)
return
effector
.
apply
(
waveform
,
sample_rate
)
print
(
waveform1
.
shape
,
sample_rate1
)
waveform2
=
apply_effect
(
waveform1
,
sample_rate
,
effect
)
print
(
waveform2
.
shape
,
sample_rate2
)
print
(
waveform1
.
shape
,
sample_rate
)
print
(
waveform2
.
shape
,
sample_rate
)
######################################################################
######################################################################
# Note that the number of frames and number of channels are different from
# Note that the number of frames and number of channels are different from
...
@@ -101,6 +82,7 @@ print(waveform2.shape, sample_rate2)
...
@@ -101,6 +82,7 @@ print(waveform2.shape, sample_rate2)
# audio.
# audio.
#
#
def
plot_waveform
(
waveform
,
sample_rate
,
title
=
"Waveform"
,
xlim
=
None
):
def
plot_waveform
(
waveform
,
sample_rate
,
title
=
"Waveform"
,
xlim
=
None
):
waveform
=
waveform
.
numpy
()
waveform
=
waveform
.
numpy
()
...
@@ -118,11 +100,12 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
...
@@ -118,11 +100,12 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
if
xlim
:
if
xlim
:
axes
[
c
].
set_xlim
(
xlim
)
axes
[
c
].
set_xlim
(
xlim
)
figure
.
suptitle
(
title
)
figure
.
suptitle
(
title
)
plt
.
show
(
block
=
False
)
######################################################################
######################################################################
#
#
def
plot_specgram
(
waveform
,
sample_rate
,
title
=
"Spectrogram"
,
xlim
=
None
):
def
plot_specgram
(
waveform
,
sample_rate
,
title
=
"Spectrogram"
,
xlim
=
None
):
waveform
=
waveform
.
numpy
()
waveform
=
waveform
.
numpy
()
...
@@ -138,29 +121,26 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
...
@@ -138,29 +121,26 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
if
xlim
:
if
xlim
:
axes
[
c
].
set_xlim
(
xlim
)
axes
[
c
].
set_xlim
(
xlim
)
figure
.
suptitle
(
title
)
figure
.
suptitle
(
title
)
plt
.
show
(
block
=
False
)
######################################################################
######################################################################
# Original
:
# Original
# ~~~~~~~~
~
# ~~~~~~~~
#
#
plot_waveform
(
waveform1
,
sample_rate
1
,
title
=
"Original"
,
xlim
=
(
-
0.1
,
3.2
))
plot_waveform
(
waveform1
.
T
,
sample_rate
,
title
=
"Original"
,
xlim
=
(
-
0.1
,
3.2
))
plot_specgram
(
waveform1
,
sample_rate
1
,
title
=
"Original"
,
xlim
=
(
0
,
3.04
))
plot_specgram
(
waveform1
.
T
,
sample_rate
,
title
=
"Original"
,
xlim
=
(
0
,
3.04
))
Audio
(
waveform1
,
rate
=
sample_rate
1
)
Audio
(
waveform1
.
T
,
rate
=
sample_rate
)
######################################################################
######################################################################
# Effects applied
:
# Effects applied
# ~~~~~~~~~~~~~~~
~
# ~~~~~~~~~~~~~~~
#
#
plot_waveform
(
waveform2
,
sample_rate
2
,
title
=
"Effects Applied"
,
xlim
=
(
-
0.1
,
3.2
))
plot_waveform
(
waveform2
.
T
,
sample_rate
,
title
=
"Effects Applied"
,
xlim
=
(
-
0.1
,
3.2
))
plot_specgram
(
waveform2
,
sample_rate
2
,
title
=
"Effects Applied"
,
xlim
=
(
0
,
3.04
))
plot_specgram
(
waveform2
.
T
,
sample_rate
,
title
=
"Effects Applied"
,
xlim
=
(
0
,
3.04
))
Audio
(
waveform2
,
rate
=
sample_rate
2
)
Audio
(
waveform2
.
T
,
rate
=
sample_rate
)
######################################################################
# Doesn’t it sound more dramatic?
#
######################################################################
######################################################################
# Simulating room reverberation
# Simulating room reverberation
...
@@ -185,28 +165,26 @@ plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
...
@@ -185,28 +165,26 @@ plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
Audio
(
rir_raw
,
rate
=
sample_rate
)
Audio
(
rir_raw
,
rate
=
sample_rate
)
######################################################################
######################################################################
# First, we need to clean up the RIR. We extract the main impulse
,
normalize
# First, we need to clean up the RIR. We extract the main impulse
and
normalize
#
the signal power, then flip along the time axis
.
#
it by its power
.
#
#
rir
=
rir_raw
[:,
int
(
sample_rate
*
1.01
)
:
int
(
sample_rate
*
1.3
)]
rir
=
rir_raw
[:,
int
(
sample_rate
*
1.01
)
:
int
(
sample_rate
*
1.3
)]
rir
=
rir
/
torch
.
norm
(
rir
,
p
=
2
)
rir
=
rir
/
torch
.
linalg
.
vector_norm
(
rir
,
ord
=
2
)
RIR
=
torch
.
flip
(
rir
,
[
1
])
plot_waveform
(
rir
,
sample_rate
,
title
=
"Room Impulse Response"
)
plot_waveform
(
rir
,
sample_rate
,
title
=
"Room Impulse Response"
)
######################################################################
######################################################################
# Then, we convolve the speech signal with the RIR filter.
# Then, using :py:func:`torchaudio.functional.fftconvolve`,
# we convolve the speech signal with the RIR.
#
#
speech
,
_
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
speech
,
_
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
augmented
=
F
.
fftconvolve
(
speech
,
rir
)
speech_
=
torch
.
nn
.
functional
.
pad
(
speech
,
(
RIR
.
shape
[
1
]
-
1
,
0
))
augmented
=
torch
.
nn
.
functional
.
conv1d
(
speech_
[
None
,
...],
RIR
[
None
,
...])[
0
]
######################################################################
######################################################################
# Original
:
# Original
# ~~~~~~~~
~
# ~~~~~~~~
#
#
plot_waveform
(
speech
,
sample_rate
,
title
=
"Original"
)
plot_waveform
(
speech
,
sample_rate
,
title
=
"Original"
)
...
@@ -214,8 +192,8 @@ plot_specgram(speech, sample_rate, title="Original")
...
@@ -214,8 +192,8 @@ plot_specgram(speech, sample_rate, title="Original")
Audio
(
speech
,
rate
=
sample_rate
)
Audio
(
speech
,
rate
=
sample_rate
)
######################################################################
######################################################################
# RIR applied
:
# RIR applied
# ~~~~~~~~~~~
~
# ~~~~~~~~~~~
#
#
plot_waveform
(
augmented
,
sample_rate
,
title
=
"RIR Applied"
)
plot_waveform
(
augmented
,
sample_rate
,
title
=
"RIR Applied"
)
...
@@ -227,33 +205,31 @@ Audio(augmented, rate=sample_rate)
...
@@ -227,33 +205,31 @@ Audio(augmented, rate=sample_rate)
# Adding background noise
# Adding background noise
# -----------------------
# -----------------------
#
#
# To add background noise to audio data, you can simply add a noise Tensor to
# To introduce background noise to audio data, we can add a noise Tensor to
# the Tensor representing the audio data. A common method to adjust the
# the Tensor representing the audio data according to some desired
# intensity of noise is changing the Signal-to-Noise Ratio (SNR).
# signal-to-noise ratio (SNR)
# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__]
# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__],
# which determines the intensity of the audio data relative to that of the noise
# in the output.
#
#
# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
#
#
# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$
#
#
# To add noise to audio data per SNRs, we
# use :py:func:`torchaudio.functional.add_noise`.
speech
,
_
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
speech
,
_
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
noise
,
_
=
torchaudio
.
load
(
SAMPLE_NOISE
)
noise
,
_
=
torchaudio
.
load
(
SAMPLE_NOISE
)
noise
=
noise
[:,
:
speech
.
shape
[
1
]]
noise
=
noise
[:,
:
speech
.
shape
[
1
]]
s
peech_rms
=
speech
.
norm
(
p
=
2
)
s
nr_dbs
=
torch
.
tensor
([
20
,
10
,
3
]
)
nois
e_rms
=
noise
.
norm
(
p
=
2
)
nois
y_speeches
=
F
.
add_noise
(
speech
,
noise
,
snr_dbs
)
snr_dbs
=
[
20
,
10
,
3
]
noisy_speeches
=
[]
for
snr_db
in
snr_dbs
:
snr
=
10
**
(
snr_db
/
20
)
scale
=
snr
*
noise_rms
/
speech_rms
noisy_speeches
.
append
((
scale
*
speech
+
noise
)
/
2
)
######################################################################
######################################################################
# Background noise
:
# Background noise
# ~~~~~~~~~~~~~~~~
~
# ~~~~~~~~~~~~~~~~
#
#
plot_waveform
(
noise
,
sample_rate
,
title
=
"Background noise"
)
plot_waveform
(
noise
,
sample_rate
,
title
=
"Background noise"
)
...
@@ -261,31 +237,31 @@ plot_specgram(noise, sample_rate, title="Background noise")
...
@@ -261,31 +237,31 @@ plot_specgram(noise, sample_rate, title="Background noise")
Audio
(
noise
,
rate
=
sample_rate
)
Audio
(
noise
,
rate
=
sample_rate
)
######################################################################
######################################################################
# SNR 20 dB
:
# SNR 20 dB
# ~~~~~~~~~
~
# ~~~~~~~~~
#
#
snr_db
,
noisy_speech
=
snr_dbs
[
0
],
noisy_speeches
[
0
]
snr_db
,
noisy_speech
=
snr_dbs
[
0
],
noisy_speeches
[
0
:
1
]
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
######################################################################
######################################################################
# SNR 10 dB
:
# SNR 10 dB
# ~~~~~~~~~
~
# ~~~~~~~~~
#
#
snr_db
,
noisy_speech
=
snr_dbs
[
1
],
noisy_speeches
[
1
]
snr_db
,
noisy_speech
=
snr_dbs
[
1
],
noisy_speeches
[
1
:
2
]
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
######################################################################
######################################################################
# SNR 3 dB
:
# SNR 3 dB
# ~~~~~~~~
~
# ~~~~~~~~
#
#
snr_db
,
noisy_speech
=
snr_dbs
[
2
],
noisy_speeches
[
2
]
snr_db
,
noisy_speech
=
snr_dbs
[
2
],
noisy_speeches
[
2
:
3
]
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_waveform
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
plot_specgram
(
noisy_speech
,
sample_rate
,
title
=
f
"SNR:
{
snr_db
}
[dB]"
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
Audio
(
noisy_speech
,
rate
=
sample_rate
)
...
@@ -295,60 +271,56 @@ Audio(noisy_speech, rate=sample_rate)
...
@@ -295,60 +271,56 @@ Audio(noisy_speech, rate=sample_rate)
# Applying codec to Tensor object
# Applying codec to Tensor object
# -------------------------------
# -------------------------------
#
#
# :py:
func
:`torchaudio.
functional.apply_codec` can
apply codecs to
# :py:
class
:`torchaudio.
io.AudioEffector` can also
apply codecs to
# a Tensor object.
# a Tensor object.
#
#
# **Note** This process is not differentiable.
#
waveform
,
sample_rate
=
torchaudio
.
load
(
SAMPLE_SPEECH
,
channels_first
=
False
)
waveform
,
sample_rate
=
torchaudio
.
load
(
SAMPLE_SPEECH
)
def
apply_codec
(
waveform
,
sample_rate
,
format
,
encoder
=
None
):
encoder
=
torchaudio
.
io
.
AudioEffector
(
format
=
format
,
encoder
=
encoder
)
return
encoder
.
apply
(
waveform
,
sample_rate
)
configs
=
[
{
"format"
:
"wav"
,
"encoding"
:
"ULAW"
,
"bits_per_sample"
:
8
},
{
"format"
:
"gsm"
},
{
"format"
:
"vorbis"
,
"compression"
:
-
1
},
]
waveforms
=
[]
for
param
in
configs
:
augmented
=
F
.
apply_codec
(
waveform
,
sample_rate
,
**
param
)
waveforms
.
append
(
augmented
)
######################################################################
######################################################################
# Original
:
# Original
# ~~~~~~~~
~
# ~~~~~~~~
#
#
plot_waveform
(
waveform
,
sample_rate
,
title
=
"Original"
)
plot_waveform
(
waveform
.
T
,
sample_rate
,
title
=
"Original"
)
plot_specgram
(
waveform
,
sample_rate
,
title
=
"Original"
)
plot_specgram
(
waveform
.
T
,
sample_rate
,
title
=
"Original"
)
Audio
(
waveform
,
rate
=
sample_rate
)
Audio
(
waveform
.
T
,
rate
=
sample_rate
)
######################################################################
######################################################################
# 8 bit mu-law
:
# 8 bit mu-law
# ~~~~~~~~~~~~
~
# ~~~~~~~~~~~~
#
#
plot_waveform
(
waveforms
[
0
],
sample_rate
,
title
=
"8 bit mu-law"
)
mulaw
=
apply_codec
(
waveform
,
sample_rate
,
"wav"
,
encoder
=
"pcm_mulaw"
)
plot_specgram
(
waveforms
[
0
],
sample_rate
,
title
=
"8 bit mu-law"
)
plot_waveform
(
mulaw
.
T
,
sample_rate
,
title
=
"8 bit mu-law"
)
Audio
(
waveforms
[
0
],
rate
=
sample_rate
)
plot_specgram
(
mulaw
.
T
,
sample_rate
,
title
=
"8 bit mu-law"
)
Audio
(
mulaw
.
T
,
rate
=
sample_rate
)
######################################################################
######################################################################
# G
SM-FR:
# G
.722
# ~~~~~
~~
# ~~~~~
#
#
plot_waveform
(
waveforms
[
1
],
sample_rate
,
title
=
"GSM-FR"
)
g722
=
apply_codec
(
waveform
,
sample_rate
,
"g722"
)
plot_specgram
(
waveforms
[
1
],
sample_rate
,
title
=
"GSM-FR"
)
plot_waveform
(
g722
.
T
,
sample_rate
,
title
=
"G.722"
)
Audio
(
waveforms
[
1
],
rate
=
sample_rate
)
plot_specgram
(
g722
.
T
,
sample_rate
,
title
=
"G.722"
)
Audio
(
g722
.
T
,
rate
=
sample_rate
)
######################################################################
######################################################################
# Vorbis
:
# Vorbis
# ~~~~~~
~
# ~~~~~~
#
#
plot_waveform
(
waveforms
[
2
],
sample_rate
,
title
=
"Vorbis"
)
vorbis
=
apply_codec
(
waveform
,
sample_rate
,
"ogg"
,
encoder
=
"vorbis"
)
plot_specgram
(
waveforms
[
2
],
sample_rate
,
title
=
"Vorbis"
)
plot_waveform
(
vorbis
.
T
,
sample_rate
,
title
=
"Vorbis"
)
Audio
(
waveforms
[
2
],
rate
=
sample_rate
)
plot_specgram
(
vorbis
.
T
,
sample_rate
,
title
=
"Vorbis"
)
Audio
(
vorbis
.
T
,
rate
=
sample_rate
)
 ######################################################################
 # Simulating a phone recording
...
@@ -365,8 +337,7 @@ original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
 plot_specgram(original_speech, sample_rate, title="Original")

 # Apply RIR
-speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0))
-rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0]
+rir_applied = F.fftconvolve(speech, rir)

 plot_specgram(rir_applied, sample_rate, title="RIR Applied")
...
@@ -377,69 +348,60 @@ plot_specgram(rir_applied, sample_rate, title="RIR Applied")
 noise, _ = torchaudio.load(SAMPLE_NOISE)
 noise = noise[:, : rir_applied.shape[1]]

-snr_db = 8
-scale = (10 ** (snr_db / 20)) * noise.norm(p=2) / rir_applied.norm(p=2)
-bg_added = (scale * rir_applied + noise) / 2
+snr_db = torch.tensor([8])
+bg_added = F.add_noise(rir_applied, noise, snr_db)

 plot_specgram(bg_added, sample_rate, title="BG noise added")
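
The removed arithmetic and the new `F.add_noise` call express the same signal-to-noise relationship: an SNR given in dB corresponds to an amplitude ratio of 10^(snr_db / 20). A small sketch of that scaling, for illustration only (the exact internals of `torchaudio.functional.add_noise` may differ):

import torch

def mix_at_snr(speech: torch.Tensor, noise: torch.Tensor, snr_db: float) -> torch.Tensor:
    # Scale the noise so the speech-to-noise energy ratio matches snr_db.
    speech_rms = speech.norm(p=2)
    noise_rms = noise.norm(p=2)
    snr = 10 ** (snr_db / 20)  # dB -> amplitude ratio
    scaled_noise = noise * (speech_rms / (snr * noise_rms))
    return speech + scaled_noise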
 # Apply filtering and change sample rate
-filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(
-    bg_added,
-    sample_rate,
-    effects=[
-        ["lowpass", "4000"],
-        [
-            "compand",
-            "0.02,0.05",
-            "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8",
-            "-8",
-            "-7",
-            "0.05",
-        ],
-        ["rate", "8000"],
-    ],
-)
-plot_specgram(filtered, sample_rate2, title="Filtered")
+effect = ",".join(
+    [
+        "lowpass=frequency=4000:poles=1",
+        "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05",
+    ]
+)
+filtered = apply_effect(bg_added.T, sample_rate, effect)
+sample_rate2 = 8000
+plot_specgram(filtered.T, sample_rate2, title="Filtered")

 # Apply telephony codec
-codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm")
-plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied")
+codec_applied = apply_codec(filtered, sample_rate2, "g722")
+plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied")

 ######################################################################
-# Original speech:
-# ~~~~~~~~~~~~~~~~
+# Original speech
+# ~~~~~~~~~~~~~~~
 #
 Audio(original_speech, rate=sample_rate)

 ######################################################################
-# RIR applied:
-# ~~~~~~~~~~~~
+# RIR applied
+# ~~~~~~~~~~~
 #
 Audio(rir_applied, rate=sample_rate)

 ######################################################################
-# Background noise added:
-# ~~~~~~~~~~~~~~~~~~~~~~~
+# Background noise added
+# ~~~~~~~~~~~~~~~~~~~~~~
 #
 Audio(bg_added, rate=sample_rate)

 ######################################################################
-# Filtered:
-# ~~~~~~~~~
+# Filtered
+# ~~~~~~~~
 #
-Audio(filtered, rate=sample_rate2)
+Audio(filtered.T, rate=sample_rate2)

 ######################################################################
-# Codec applied:
-# ~~~~~~~~~~~~~~
+# Codec applied
+# ~~~~~~~~~~~~~
 #
-Audio(codec_applied, rate=sample_rate2)
+Audio(codec_applied.T, rate=sample_rate2)
examples/tutorials/audio_datasets_tutorial.py  View file @ ffeba11a

-# -*- coding: utf-8 -*-
 """
 Audio Datasets
 ==============
...
@@ -10,10 +9,6 @@ datasets. Please refer to the official documentation for the list of
 available datasets.
 """

-# When running this tutorial in Google Colab, install the required packages
-# with the following.
-# !pip install torchaudio

 import torch
 import torchaudio
...
@@ -21,22 +16,13 @@ print(torch.__version__)
 print(torchaudio.__version__)

 ######################################################################
-# Preparing data and utility functions (skip this section)
-# --------------------------------------------------------
 #
-# @title Prepare data and utility functions. {display-mode: "form"}
-# @markdown
-# @markdown You do not need to look into this cell.
-# @markdown Just execute once and you are good to go.
-# -------------------------------------------------------------------------------
-# Preparation of data and helper functions.
-# -------------------------------------------------------------------------------

 import os
+import IPython
 import matplotlib.pyplot as plt
-from IPython.display import Audio, display

 _SAMPLE_DIR = "_assets"
...
@@ -44,34 +30,13 @@ YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no")
 os.makedirs(YESNO_DATASET_PATH, exist_ok=True)

-def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
+def plot_specgram(waveform, sample_rate, title="Spectrogram"):
     waveform = waveform.numpy()

-    num_channels, _ = waveform.shape
-    figure, axes = plt.subplots(num_channels, 1)
-    if num_channels == 1:
-        axes = [axes]
-    for c in range(num_channels):
-        axes[c].specgram(waveform[c], Fs=sample_rate)
-        if num_channels > 1:
-            axes[c].set_ylabel(f"Channel {c+1}")
-        if xlim:
-            axes[c].set_xlim(xlim)
+    figure, ax = plt.subplots()
+    ax.specgram(waveform[0], Fs=sample_rate)
     figure.suptitle(title)
-    plt.show(block=False)
+    figure.tight_layout()

-def play_audio(waveform, sample_rate):
-    waveform = waveform.numpy()
-    num_channels, _ = waveform.shape
-    if num_channels == 1:
-        display(Audio(waveform[0], rate=sample_rate))
-    elif num_channels == 2:
-        display(Audio((waveform[0], waveform[1]), rate=sample_rate))
-    else:
-        raise ValueError("Waveform with more than 2 channels are not supported.")

 ######################################################################
...
@@ -79,10 +44,25 @@ def play_audio(waveform, sample_rate):
 # :py:class:`torchaudio.datasets.YESNO` dataset.
 #
 dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True)

-for i in [1, 3, 5]:
-    waveform, sample_rate, label = dataset[i]
-    plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
-    play_audio(waveform, sample_rate)
+######################################################################
+#
+i = 1
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
+
+######################################################################
+#
+i = 3
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
+
+######################################################################
+#
+i = 5
+waveform, sample_rate, label = dataset[i]
+plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
+IPython.display.Audio(waveform, rate=sample_rate)
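
The cells above pull individual samples out of the YESNO dataset; batching them for training is a common next step. A minimal sketch, assuming a `torch.utils.data.DataLoader` with a padding collate function (the collate function and batch size are illustrative and not part of this commit):

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Each YESNO item is (waveform, sample_rate, labels); pad waveforms to a common length.
    waveforms = [waveform.t() for waveform, _, _ in batch]               # (time, channel)
    padded = pad_sequence(waveforms, batch_first=True).transpose(1, 2)   # (batch, channel, time)
    labels = [labels for _, _, labels in batch]
    return padded, labels

loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)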
examples/tutorials/audio_feature_augmentation_tutorial.py  View file @ ffeba11a

...
@@ -19,25 +19,20 @@ print(torch.__version__)
 print(torchaudio.__version__)

 ######################################################################
-# Preparing data and utility functions (skip this section)
-# --------------------------------------------------------
+# Preparation
+# -----------
 #
-# @title Prepare data and utility functions. {display-mode: "form"}
-# @markdown
-# @markdown You do not need to look into this cell.
-# @markdown Just execute once and you are good to go.
-# @markdown
-# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/),
-# @markdown which is licensed under Creative Commons BY 4.0.
-# -------------------------------------------------------------------------------
-# Preparation of data and helper functions.
-# -------------------------------------------------------------------------------

 import librosa
 import matplotlib.pyplot as plt
+from IPython.display import Audio
 from torchaudio.utils import download_asset

+######################################################################
+# In this tutorial, we will use a speech data from
+# `VOiCES dataset <https://iqtlabs.github.io/voices/>`__,
+# which is licensed under Creative Commons BY 4.0.

 SAMPLE_WAV_SPEECH_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
...
@@ -75,18 +70,6 @@ def get_spectrogram(
     return spectrogram(waveform)

-def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None):
-    fig, axs = plt.subplots(1, 1)
-    axs.set_title(title or "Spectrogram (db)")
-    axs.set_ylabel(ylabel)
-    axs.set_xlabel("frame")
-    im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect)
-    if xmax:
-        axs.set_xlim((0, xmax))
-    fig.colorbar(im, ax=axs)
-    plt.show(block=False)

 ######################################################################
 # SpecAugment
 # -----------
...
@@ -108,43 +91,79 @@ def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=No
 spec = get_spectrogram(power=None)
 stretch = T.TimeStretch()

-rate = 1.2
-spec_ = stretch(spec, rate)
-plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304)
-
-######################################################################
-#
-plot_spectrogram(torch.abs(spec[0]), title="Original", aspect="equal", xmax=304)
-
-######################################################################
-#
-rate = 0.9
-spec_ = stretch(spec, rate)
-plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304)
+spec_12 = stretch(spec, overriding_rate=1.2)
+spec_09 = stretch(spec, overriding_rate=0.9)
+
+######################################################################
+# Visualization
+# ~~~~~~~~~~~~~
+def plot():
+    def plot_spec(ax, spec, title):
+        ax.set_title(title)
+        ax.imshow(librosa.amplitude_to_db(spec), origin="lower", aspect="auto")
+
+    fig, axes = plt.subplots(3, 1, sharex=True, sharey=True)
+    plot_spec(axes[0], torch.abs(spec_12[0]), title="Stretched x1.2")
+    plot_spec(axes[1], torch.abs(spec[0]), title="Original")
+    plot_spec(axes[2], torch.abs(spec_09[0]), title="Stretched x0.9")
+    fig.tight_layout()
+
+
+plot()
+
+######################################################################
+# Audio Samples
+# ~~~~~~~~~~~~~
+def preview(spec, rate=16000):
+    ispec = T.InverseSpectrogram()
+    waveform = ispec(spec)
+    return Audio(waveform[0].numpy().T, rate=rate)
+
+
+preview(spec)
+
+######################################################################
+#
+preview(spec_12)
+
+######################################################################
+#
+preview(spec_09)

 ######################################################################
-# TimeMasking
-# -----------
+# Time and Frequency Masking
+# --------------------------
 #
 torch.random.manual_seed(4)

-spec = get_spectrogram()
-plot_spectrogram(spec[0], title="Original")
-
-masking = T.TimeMasking(time_mask_param=80)
-spec = masking(spec)
-
-plot_spectrogram(spec[0], title="Masked along time axis")
-
-######################################################################
-# FrequencyMasking
-# ----------------
-#
-masking = T.FrequencyMasking(freq_mask_param=80)
-spec = masking(spec)
-
-plot_spectrogram(spec[0], title="Masked along frequency axis")
+time_masking = T.TimeMasking(time_mask_param=80)
+freq_masking = T.FrequencyMasking(freq_mask_param=80)
+
+spec = get_spectrogram()
+time_masked = time_masking(spec)
+freq_masked = freq_masking(spec)
+
+
+def plot():
+    def plot_spec(ax, spec, title):
+        ax.set_title(title)
+        ax.imshow(librosa.power_to_db(spec), origin="lower", aspect="auto")
+
+    fig, axes = plt.subplots(3, 1, sharex=True, sharey=True)
+    plot_spec(axes[0], spec[0], title="Original")
+    plot_spec(axes[1], time_masked[0], title="Masked along time axis")
+    plot_spec(axes[2], freq_masked[0], title="Masked along frequency axis")
+    fig.tight_layout()
+
+
+plot()
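
The masking transforms above are typically applied on the fly during training. A minimal sketch of how they could be composed into a single augmentation module, assuming spectrogram inputs of shape (batch, freq, time); this composition is illustrative and not part of the commit:

import torch
import torchaudio.transforms as T

# SpecAugment-style augmentation: mask a band of frequency bins and a span of time steps.
augment = torch.nn.Sequential(
    T.FrequencyMasking(freq_mask_param=30),
    T.TimeMasking(time_mask_param=80),
)

specgram = torch.randn(8, 128, 400)   # dummy batch of spectrograms
augmented = augment(specgram)         # same shape, with random regions zeroed out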
examples/tutorials/audio_feature_extractions_tutorial.py  View file @ ffeba11a

...
@@ -25,6 +25,23 @@ import torchaudio.transforms as T
 print(torch.__version__)
 print(torchaudio.__version__)

+import librosa
+import matplotlib.pyplot as plt
+
+######################################################################
+# Overview of audio features
+# --------------------------
+#
+# The following diagram shows the relationship between common audio features
+# and torchaudio APIs to generate them.
+#
+# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png
+#
+# For the complete list of available features, please refer to the
+# documentation.
+#

 ######################################################################
 # Preparation
 # -----------
...
@@ -38,8 +55,7 @@ print(torchaudio.__version__)
 # !pip install librosa
 #
 from IPython.display import Audio
-import librosa
-import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
 from torchaudio.utils import download_asset

 torch.random.manual_seed(0)
...
@@ -47,27 +63,27 @@ torch.random.manual_seed(0)
 SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")

-def plot_waveform(waveform, sr, title="Waveform"):
+def plot_waveform(waveform, sr, title="Waveform", ax=None):
     waveform = waveform.numpy()

     num_channels, num_frames = waveform.shape
     time_axis = torch.arange(0, num_frames) / sr

-    figure, axes = plt.subplots(num_channels, 1)
-    axes.plot(time_axis, waveform[0], linewidth=1)
-    axes.grid(True)
-    figure.suptitle(title)
-    plt.show(block=False)
+    if ax is None:
+        _, ax = plt.subplots(num_channels, 1)
+    ax.plot(time_axis, waveform[0], linewidth=1)
+    ax.grid(True)
+    ax.set_xlim([0, time_axis[-1]])
+    ax.set_title(title)

-def plot_spectrogram(specgram, title=None, ylabel="freq_bin"):
-    fig, axs = plt.subplots(1, 1)
-    axs.set_title(title or "Spectrogram (db)")
-    axs.set_ylabel(ylabel)
-    axs.set_xlabel("frame")
-    im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto")
-    fig.colorbar(im, ax=axs)
-    plt.show(block=False)
+def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
+    if ax is None:
+        _, ax = plt.subplots(1, 1)
+    if title is not None:
+        ax.set_title(title)
+    ax.set_ylabel(ylabel)
+    ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest")

 def plot_fbank(fbank, title=None):
...
@@ -76,21 +92,6 @@ def plot_fbank(fbank, title=None):
     axs.imshow(fbank, aspect="auto")
     axs.set_ylabel("frequency bin")
     axs.set_xlabel("mel bin")
-    plt.show(block=False)

-######################################################################
-# Overview of audio features
-# --------------------------
-#
-# The following diagram shows the relationship between common audio features
-# and torchaudio APIs to generate them.
-#
-# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png
-#
-# For the complete list of available features, please refer to the
-# documentation.
-#

 ######################################################################
...
@@ -101,77 +102,157 @@
 # you can use :py:func:`torchaudio.transforms.Spectrogram`.
 #
+# Load audio
 SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH)

-plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform")
-Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)
-
-######################################################################
-#
-n_fft = 1024
-win_length = None
-hop_length = 512
-
-# Define transform
-spectrogram = T.Spectrogram(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-    center=True,
-    pad_mode="reflect",
-    power=2.0,
-)
-
-######################################################################
-#
-# Perform transform
-spec = spectrogram(SPEECH_WAVEFORM)
-
-######################################################################
-#
-plot_spectrogram(spec[0], title="torchaudio")
+# Define transform
+spectrogram = T.Spectrogram(n_fft=512)
+
+# Perform transform
+spec = spectrogram(SPEECH_WAVEFORM)
+
+######################################################################
+#
+fig, axs = plt.subplots(2, 1)
+plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform", ax=axs[0])
+plot_spectrogram(spec[0], title="spectrogram", ax=axs[1])
+fig.tight_layout()
+
+######################################################################
+#
+Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE)
+
+######################################################################
+# The effect of ``n_fft`` parameter
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The core of spectrogram computation is the (short-term) Fourier transform,
+# and the ``n_fft`` parameter corresponds to the :math:`N` in the following
+# definition of the discrete Fourier transform.
+#
+# $$ X_k = \\sum_{n=0}^{N-1} x_n e^{-\\frac{2\\pi i}{N} nk} $$
+#
+# (For the detail of the Fourier transform, please refer to
+# `Wikipedia <https://en.wikipedia.org/wiki/Fast_Fourier_transform>`__.)
+#
+# The value of ``n_fft`` determines the resolution of the frequency axis.
+# However, with a higher ``n_fft`` value, the energy will be distributed
+# among more bins, so when you visualize it, it might look more blurry,
+# even though it has higher resolution.
+#
+# The following illustrates this.
+#
+# .. note::
+#
+#    ``hop_length`` determines the time axis resolution.
+#    By default (i.e. ``hop_length=None`` and ``win_length=None``),
+#    the value of ``n_fft // 4`` is used.
+#    Here we use the same ``hop_length`` value across different ``n_fft``
+#    so that they have the same number of elements in the time axis.
+#
+n_ffts = [32, 128, 512, 2048]
+hop_length = 64
+
+specs = []
+for n_fft in n_ffts:
+    spectrogram = T.Spectrogram(n_fft=n_fft, hop_length=hop_length)
+    spec = spectrogram(SPEECH_WAVEFORM)
+    specs.append(spec)
+
+######################################################################
+#
+fig, axs = plt.subplots(len(specs), 1, sharex=True)
+for i, (spec, n_fft) in enumerate(zip(specs, n_ffts)):
+    plot_spectrogram(spec[0], ylabel=f"n_fft={n_fft}", ax=axs[i])
+    axs[i].set_xlabel(None)
+fig.tight_layout()
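
Since ``n_fft`` fixes the number of frequency bins but not the range they cover, it can help to compute the physical frequency of each bin explicitly. A small illustrative check (not part of the tutorial), assuming a one-sided spectrogram:

import torch

sample_rate = 16000
for n_fft in (32, 512):
    # A one-sided spectrogram has n_fft // 2 + 1 bins covering 0 .. sample_rate / 2.
    freqs = torch.arange(n_fft // 2 + 1) * sample_rate / n_fft
    print(f"n_fft={n_fft}: {len(freqs)} bins, spacing {sample_rate / n_fft:.1f} Hz, max {freqs[-1]:.0f} Hz")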
 ######################################################################
-# GriffinLim
-# ----------
 #
-# To recover a waveform from a spectrogram, you can use ``GriffinLim``.
+# When comparing signals, it is desirable to use the same sampling rate,
+# however if you must use a different sampling rate, care must be
+# taken when interpreting the meaning of ``n_fft``.
+# Recall that ``n_fft`` determines the resolution of the frequency
+# axis for a given sampling rate. In other words, what each bin on
+# the frequency axis represents is subject to the sampling rate.
 #
+# As we have seen above, changing the value of ``n_fft`` does not change
+# the coverage of frequency range for the same input signal.

-torch.random.manual_seed(0)
+######################################################################
+#
+# Let's downsample the audio and apply spectrogram with the same ``n_fft``
+# value.

-n_fft = 1024
-win_length = None
-hop_length = 512
+# Downsample to half of the original sample rate
+speech2 = torchaudio.functional.resample(SPEECH_WAVEFORM, SAMPLE_RATE, SAMPLE_RATE // 2)
+# Upsample to the original sample rate
+speech3 = torchaudio.functional.resample(speech2, SAMPLE_RATE // 2, SAMPLE_RATE)

-spec = T.Spectrogram(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-)(SPEECH_WAVEFORM)
+######################################################################
+#
+# Apply the same spectrogram
+spectrogram = T.Spectrogram(n_fft=512)
+
+spec0 = spectrogram(SPEECH_WAVEFORM)
+spec2 = spectrogram(speech2)
+spec3 = spectrogram(speech3)

 ######################################################################
 #
-griffin_lim = T.GriffinLim(
-    n_fft=n_fft,
-    win_length=win_length,
-    hop_length=hop_length,
-)
+# Visualize it
+fig, axs = plt.subplots(3, 1)
+plot_spectrogram(spec0[0], ylabel="Original", ax=axs[0])
+axs[0].add_patch(Rectangle((0, 3), 212, 128, edgecolor="r", facecolor="none"))
+plot_spectrogram(spec2[0], ylabel="Downsampled", ax=axs[1])
+plot_spectrogram(spec3[0], ylabel="Upsampled", ax=axs[2])
+fig.tight_layout()

 ######################################################################
 #
+# In the above visualization, the second plot ("Downsampled") might
+# give the impression that the spectrogram is stretched.
+# This is because the meaning of the frequency bins is different from
+# the original one.
+# Even though they have the same number of bins, in the second plot
+# the frequency axis only covers half of the original sampling rate.
+# This becomes more clear if we resample the downsampled signal again
+# so that it has the same sample rate as the original.

+######################################################################
+# GriffinLim
+# ----------
+#
+# To recover a waveform from a spectrogram, you can use
+# :py:class:`torchaudio.transforms.GriffinLim`.
+#
+# The same set of parameters used for spectrogram must be used.
+
+# Define transforms
+n_fft = 1024
+spectrogram = T.Spectrogram(n_fft=n_fft)
+griffin_lim = T.GriffinLim(n_fft=n_fft)
+
+# Apply the transforms
+spec = spectrogram(SPEECH_WAVEFORM)
 reconstructed_waveform = griffin_lim(spec)

 ######################################################################
 #
-plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed")
+_, axes = plt.subplots(2, 1, sharex=True, sharey=True)
+plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original", ax=axes[0])
+plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed", ax=axes[1])
 Audio(reconstructed_waveform, rate=SAMPLE_RATE)

 ######################################################################
...
@@ -253,7 +334,6 @@ mel_spectrogram = T.MelSpectrogram(
     pad_mode="reflect",
     power=2.0,
     norm="slaney",
-    onesided=True,
     n_mels=n_mels,
     mel_scale="htk",
 )
...
@@ -322,7 +402,7 @@ mfcc = mfcc_transform(SPEECH_WAVEFORM)

 ######################################################################
 #
-plot_spectrogram(mfcc[0])
+plot_spectrogram(mfcc[0], title="MFCC")

 ######################################################################
 # Comparison against librosa
...
@@ -350,7 +430,7 @@ mfcc_librosa = librosa.feature.mfcc(

 ######################################################################
 #
-plot_spectrogram(mfcc_librosa)
+plot_spectrogram(mfcc_librosa, title="MFCC (librosa)")

 mse = torch.square(mfcc - mfcc_librosa).mean().item()
 print("Mean Square Difference: ", mse)
...
@@ -376,7 +456,7 @@ lfcc_transform = T.LFCC(
 )

 lfcc = lfcc_transform(SPEECH_WAVEFORM)
-plot_spectrogram(lfcc[0])
+plot_spectrogram(lfcc[0], title="LFCC")

 ######################################################################
 # Pitch
...
@@ -388,6 +468,7 @@ pitch = F.detect_pitch_frequency(SPEECH_WAVEFORM, SAMPLE_RATE)

 ######################################################################
 #
 def plot_pitch(waveform, sr, pitch):
     figure, axis = plt.subplots(1, 1)
     axis.set_title("Pitch Feature")
...
@@ -402,58 +483,6 @@ def plot_pitch(waveform, sr, pitch):
     axis2.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
     axis2.legend(loc=0)
-    plt.show(block=False)

 plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch)

-######################################################################
-# Kaldi Pitch (beta)
-# ------------------
-#
-# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic
-# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``,
-# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`.
-#
-# 1. A pitch extraction algorithm tuned for automatic speech recognition
-#
-#    Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. Khudanpur
-#
-#    2014 IEEE International Conference on Acoustics, Speech and Signal
-#    Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
-#    10.1109/ICASSP.2014.6854049.
-#    [`abstract <https://ieeexplore.ieee.org/document/6854049>`__],
-#    [`paper <https://danielpovey.com/files/2014_icassp_pitch.pdf>`__]
-#
-pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE)
-pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1]
-
-######################################################################
-#
-def plot_kaldi_pitch(waveform, sr, pitch, nfcc):
-    _, axis = plt.subplots(1, 1)
-    axis.set_title("Kaldi Pitch Feature")
-    axis.grid(True)
-
-    end_time = waveform.shape[1] / sr
-    time_axis = torch.linspace(0, end_time, waveform.shape[1])
-    axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3)
-
-    time_axis = torch.linspace(0, end_time, pitch.shape[1])
-    ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
-    axis.set_ylim((-1.3, 1.3))
-
-    axis2 = axis.twinx()
-    time_axis = torch.linspace(0, end_time, nfcc.shape[1])
-    ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--")
-
-    lns = ln1 + ln2
-    labels = [l.get_label() for l in lns]
-    axis.legend(lns, labels, loc=0)
-
-    plt.show(block=False)
-
-plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc)
examples/tutorials/audio_io_tutorial.py  View file @ ffeba11a

...
@@ -5,8 +5,15 @@ Audio I/O
 **Author**: `Moto Hira <moto@meta.com>`__

-This tutorial shows how to use TorchAudio's basic I/O API to load audio files
-into PyTorch's Tensor object, and save Tensor objects to audio files.
+This tutorial shows how to use TorchAudio's basic I/O API to inspect audio data,
+load them into PyTorch Tensors and save PyTorch Tensors.
+
+.. warning::
+
+   There are multiple changes planned/made to audio I/O in recent releases.
+   For the detail of these changes please refer to
+   :ref:`Introduction of Dispatcher <dispatcher_migration>`.
 """

 import torch
...
@@ -47,6 +54,16 @@ SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch12753
 SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav")

+def _hide_seek(obj):
+    class _wrapper:
+        def __init__(self, obj):
+            self.obj = obj
+
+        def read(self, n):
+            return self.obj.read(n)
+
+    return _wrapper(obj)
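
The `_hide_seek` wrapper added above exposes only `read`, so the HTTP/S3 response body is presented to torchaudio as a purely sequential stream (the loader otherwise probes `seek` and may consume the stream differently). A quick illustrative check with an in-memory buffer, not part of the commit:

import io

buffer = io.BytesIO(b"RIFF....WAVEfmt ")   # placeholder bytes, not a real WAV file
wrapped = _hide_seek(buffer)

print(hasattr(buffer, "seek"))   # True:  a BytesIO object is seekable
print(hasattr(wrapped, "seek"))  # False: the wrapper only forwards read()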
 ######################################################################
 # Querying audio metadata
...
@@ -113,7 +130,7 @@ print(metadata)
 url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav"
 with requests.get(url, stream=True) as response:
-    metadata = torchaudio.info(response.raw)
+    metadata = torchaudio.info(_hide_seek(response.raw))
 print(metadata)

 ######################################################################
...
@@ -164,7 +181,6 @@ def plot_waveform(waveform, sample_rate):
         if num_channels > 1:
             axes[c].set_ylabel(f"Channel {c+1}")
     figure.suptitle("waveform")
-    plt.show(block=False)

 ######################################################################
...
@@ -187,7 +203,6 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
         if num_channels > 1:
             axes[c].set_ylabel(f"Channel {c+1}")
     figure.suptitle(title)
-    plt.show(block=False)

 ######################################################################
...
@@ -215,7 +230,7 @@ Audio(waveform.numpy()[0], rate=sample_rate)
 # Load audio data as HTTP request
 url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
 with requests.get(url, stream=True) as response:
-    waveform, sample_rate = torchaudio.load(response.raw)
+    waveform, sample_rate = torchaudio.load(_hide_seek(response.raw))
 plot_specgram(waveform, sample_rate, title="HTTP datasource")

 ######################################################################
...
@@ -237,7 +252,7 @@ bucket = "pytorch-tutorial-assets"
 key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
 client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
 response = client.get_object(Bucket=bucket, Key=key)
-waveform, sample_rate = torchaudio.load(response["Body"])
+waveform, sample_rate = torchaudio.load(_hide_seek(response["Body"]))
 plot_specgram(waveform, sample_rate, title="From S3")
...
@@ -271,13 +286,15 @@ frame_offset, num_frames = 16000, 16000  # Fetch and decode the 1 - 2 seconds
 url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"

 print("Fetching all the data...")
 with requests.get(url, stream=True) as response:
-    waveform1, sample_rate1 = torchaudio.load(response.raw)
+    waveform1, sample_rate1 = torchaudio.load(_hide_seek(response.raw))
     waveform1 = waveform1[:, frame_offset : frame_offset + num_frames]
     print(f" - Fetched {response.raw.tell()} bytes")

 print("Fetching until the requested frames are available...")
 with requests.get(url, stream=True) as response:
-    waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames)
+    waveform2, sample_rate2 = torchaudio.load(
+        _hide_seek(response.raw), frame_offset=frame_offset, num_frames=num_frames
+    )
     print(f" - Fetched {response.raw.tell()} bytes")

 print("Checking the resulting waveform ... ", end="")
...
@@ -316,6 +333,7 @@ waveform, sample_rate = torchaudio.load(SAMPLE_WAV)

 ######################################################################
 #
 def inspect_file(path):
     print("-" * 10)
     print("Source:", path)
...
@@ -324,6 +342,7 @@ def inspect_file(path):
     print(f" - {torchaudio.info(path)}")
     print()

 ######################################################################
 #
 # Save without any encoding option.
...
@@ -351,11 +370,11 @@ with tempfile.TemporaryDirectory() as tempdir:
 formats = [
     "flac",
-    "vorbis",
-    "sph",
-    "amb",
-    "amr-nb",
-    "gsm",
+    # "vorbis",
+    # "sph",
+    # "amb",
+    # "amr-nb",
+    # "gsm",
 ]

 ######################################################################
...
examples/tutorials/audio_resampling_tutorial.py  View file @ ffeba11a

...
@@ -27,14 +27,14 @@ import math
 import timeit

 import librosa
-import resampy
-import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
 import pandas as pd
-from IPython.display import Audio, display
+import resampy
+from IPython.display import Audio

-pd.set_option('display.max_rows', None)
-pd.set_option('display.max_columns', None)
+pd.set_option("display.max_rows", None)
+pd.set_option("display.max_columns", None)

 DEFAULT_OFFSET = 201
...
@@ -105,7 +105,6 @@ def plot_sweep(
     axis.yaxis.grid(True, alpha=0.67)
     figure.suptitle(f"{title} (sample rate: {sample_rate} Hz)")
     plt.colorbar(cax)
-    plt.show(block=True)

 ######################################################################
...
@@ -240,13 +239,13 @@ plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.8")
 sample_rate = 48000
 resample_rate = 32000

-resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interpolation")
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interp_hann")
 plot_sweep(resampled_waveform, resample_rate, title="Hann Window Default")

 ######################################################################
 #
-resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="kaiser_window")
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interp_kaiser")
 plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Default")
...
@@ -271,7 +270,7 @@ resampled_waveform = F.resample(
     resample_rate,
     lowpass_filter_width=64,
     rolloff=0.9475937167399596,
-    resampling_method="kaiser_window",
+    resampling_method="sinc_interp_kaiser",
     beta=14.769656459379492,
 )
 plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Best (torchaudio)")
...
@@ -300,7 +299,7 @@ resampled_waveform = F.resample(
     resample_rate,
     lowpass_filter_width=16,
     rolloff=0.85,
-    resampling_method="kaiser_window",
+    resampling_method="sinc_interp_kaiser",
     beta=8.555504641634386,
 )
 plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)")
...
@@ -325,7 +324,7 @@ print("torchaudio and librosa kaiser fast MSE:", mse)
 #
 # Below are benchmarks for downsampling and upsampling waveforms between
 # two pairs of sampling rates. We demonstrate the performance implications
-# that the ``lowpass_filter_wdith``, window type, and sample rates can
+# that the ``lowpass_filter_width``, window type, and sample rates can
 # have. Additionally, we provide a comparison against ``librosa``\ ’s
 # ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters
 # in ``torchaudio``.
...
@@ -338,18 +337,20 @@ print(f"resampy: {resampy.__version__}")

 ######################################################################
 #
 def benchmark_resample_functional(
     waveform,
     sample_rate,
     resample_rate,
     lowpass_filter_width=6,
     rolloff=0.99,
-    resampling_method="sinc_interpolation",
+    resampling_method="sinc_interp_hann",
     beta=None,
     iters=5,
 ):
-    return timeit.timeit(
-        stmt='''
+    return (
+        timeit.timeit(
+            stmt="""
 torchaudio.functional.resample(
     waveform,
     sample_rate,
...
@@ -359,29 +360,34 @@ torchaudio.functional.resample(
     resampling_method=resampling_method,
     beta=beta,
 )
-''',
-        setup='import torchaudio',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            setup="import torchaudio",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

 ######################################################################
 #
 def benchmark_resample_transforms(
     waveform,
     sample_rate,
     resample_rate,
     lowpass_filter_width=6,
     rolloff=0.99,
-    resampling_method="sinc_interpolation",
+    resampling_method="sinc_interp_hann",
     beta=None,
     iters=5,
 ):
-    return timeit.timeit(
-        stmt='resampler(waveform)',
-        setup='''
+    return (
+        timeit.timeit(
+            stmt="resampler(waveform)",
+            setup="""
 import torchaudio

 resampler = torchaudio.transforms.Resample(
...
@@ -394,15 +400,19 @@ resampler = torchaudio.transforms.Resample(
     beta=beta,
 )
 resampler.to(waveform.device)
-''',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

 ######################################################################
 #
 def benchmark_resample_librosa(
     waveform,
     sample_rate,
...
@@ -411,24 +421,29 @@ def benchmark_resample_librosa(
     iters=5,
 ):
     waveform_np = waveform.squeeze().numpy()
-    return timeit.timeit(
-        stmt='''
+    return (
+        timeit.timeit(
+            stmt="""
 librosa.resample(
     waveform_np,
     orig_sr=sample_rate,
     target_sr=resample_rate,
     res_type=res_type,
 )
-''',
-        setup='import librosa',
-        number=iters,
-        globals=locals(),
-    ) * 1000 / iters
+""",
+            setup="import librosa",
+            number=iters,
+            globals=locals(),
+        )
+        * 1000
+        / iters
+    )

 ######################################################################
 #
 def benchmark(sample_rate, resample_rate):
     times, rows = [], []
     waveform = get_sine_sweep(sample_rate).to(torch.float32)
...
@@ -451,7 +466,7 @@ def benchmark(sample_rate, resample_rate):
     kwargs = {
         "lowpass_filter_width": 64,
         "rolloff": 0.9475937167399596,
-        "resampling_method": "kaiser_window",
+        "resampling_method": "sinc_interp_kaiser",
         "beta": 14.769656459379492,
     }
     lib_time = benchmark_resample_librosa(*args, res_type="kaiser_best")
...
@@ -464,7 +479,7 @@ def benchmark(sample_rate, resample_rate):
     kwargs = {
         "lowpass_filter_width": 16,
         "rolloff": 0.85,
-        "resampling_method": "kaiser_window",
+        "resampling_method": "sinc_interp_kaiser",
         "beta": 8.555504641634386,
     }
     lib_time = benchmark_resample_librosa(*args, res_type="kaiser_fast")
...
@@ -483,7 +498,7 @@ def plot(df):
     print(df.round(2))
     ax = df.plot(kind="bar")
     plt.ylabel("Time Elapsed [ms]")
     plt.xticks(rotation=0, fontsize=10)
     for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS):
         label = ["N/A" if v != v else str(v) for v in df[col].round(2)]
         ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small")
...
@@ -531,8 +546,8 @@ plot(df)
 # - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
 #   and therefore increases computation time for both the kernel computation
 #   and convolution
-# - using ``kaiser_window`` results in longer computation times than the default
-#   ``sinc_interpolation`` because it is more complex to compute the intermediate
+# - using ``sinc_interp_kaiser`` results in longer computation times than the default
+#   ``sinc_interp_hann`` because it is more complex to compute the intermediate
 #   window values
 # - a large GCD between the sample and resample rate will result
 #   in a simplification that allows for a smaller kernel and faster kernel computation.
...
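
This file renames the resampling methods from `sinc_interpolation` / `kaiser_window` to `sinc_interp_hann` / `sinc_interp_kaiser` throughout. A small usage sketch with the new names via `torchaudio.transforms.Resample`; the rates and waveform below are illustrative:

import torch
import torchaudio.transforms as T

waveform = torch.randn(1, 48000)  # one second of dummy audio at 48 kHz

# Hann-windowed sinc interpolation (the default method, under its new name).
resample_hann = T.Resample(48000, 32000, resampling_method="sinc_interp_hann")

# Kaiser-windowed sinc interpolation, using the "kaiser best" parameters from the benchmark above.
resample_kaiser = T.Resample(
    48000,
    32000,
    lowpass_filter_width=64,
    rolloff=0.9475937167399596,
    resampling_method="sinc_interp_kaiser",
    beta=14.769656459379492,
)

print(resample_hann(waveform).shape, resample_kaiser(waveform).shape)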
examples/tutorials/ctc_forced_alignment_api_tutorial.py  0 → 100644  View file @ ffeba11a

"""
CTC forced alignment API tutorial
=================================

**Author**: `Xiaohui Zhang <xiaohuizhang@meta.com>`__, `Moto Hira <moto@meta.com>`__

The forced alignment is a process to align transcript with speech.
This tutorial shows how to align transcripts to speech using
:py:func:`torchaudio.functional.forced_align` which was developed along the work of
`Scaling Speech Technology to 1,000+ Languages
<https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/>`__.

:py:func:`~torchaudio.functional.forced_align` has custom CPU and CUDA
implementations which are more performant than the vanilla Python
implementation above, and are more accurate.
It can also handle missing transcript with special ``<star>`` token.

There is also a high-level API, :py:class:`torchaudio.pipelines.Wav2Vec2FABundle`,
which wraps the pre/post-processing explained in this tutorial and makes it easy
to run forced-alignments.
`Forced alignment for multilingual data
<./forced_alignment_for_multilingual_data_tutorial.html>`__ uses this API to
illustrate how to align non-English transcripts.
"""

######################################################################
# Preparation
# -----------

import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

######################################################################
#
import IPython
import matplotlib.pyplot as plt

import torchaudio.functional as F

######################################################################
# First we prepare the speech data and the transcript we are going
# to use.
#
SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
waveform, _ = torchaudio.load(SPEECH_FILE)
TRANSCRIPT = "i had that curiosity beside me at this moment".split()

######################################################################
# Generating emissions
# ~~~~~~~~~~~~~~~~~~~~
#
# :py:func:`~torchaudio.functional.forced_align` takes emission and
# token sequences and outputs timestamps of the tokens and their scores.
#
# Emission represents the frame-wise probability distribution over
# tokens, and it can be obtained by passing the waveform to an acoustic
# model.
#
# Tokens are numerical expressions of transcripts. There are many ways to
# tokenize transcripts, but here, we simply map alphabets into integers,
# which is how labels were constructed when the acoustic model we are
# going to use was trained.
#
# We will use a pre-trained Wav2Vec2 model,
# :py:data:`torchaudio.pipelines.MMS_FA`, to obtain emission and tokenize
# the transcript.
#
bundle = torchaudio.pipelines.MMS_FA

model = bundle.get_model(with_star=False).to(device)
with torch.inference_mode():
    emission, _ = model(waveform.to(device))
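
As a quick sanity check on the shapes involved (added here for illustration; exact sizes depend on the audio length and the model):

# The acoustic model maps (batch, time) samples to (batch, frames, num_labels) emissions,
# i.e. one score per label for roughly every 20 ms analysis frame.
print(waveform.shape)
print(emission.shape)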
######################################################################
#
def plot_emission(emission):
    fig, ax = plt.subplots()
    ax.imshow(emission.cpu().T)
    ax.set_title("Frame-wise class probabilities")
    ax.set_xlabel("Time")
    ax.set_ylabel("Labels")
    fig.tight_layout()


plot_emission(emission[0])

######################################################################
# Tokenize the transcript
# ~~~~~~~~~~~~~~~~~~~~~~~
#
# We create a dictionary, which maps each label into a token.
LABELS = bundle.get_labels(star=None)
DICTIONARY = bundle.get_dict(star=None)
for k, v in DICTIONARY.items():
    print(f"{k}: {v}")

######################################################################
# Converting the transcript to tokens is as simple as
tokenized_transcript = [DICTIONARY[c] for word in TRANSCRIPT for c in word]

for t in tokenized_transcript:
    print(t, end=" ")
print()

######################################################################
# Computing alignments
# --------------------
#
# Frame-level alignments
# ~~~~~~~~~~~~~~~~~~~~~~
#
# Now we call TorchAudio’s forced alignment API to compute the
# frame-level alignment. For the detail of the function signature, please
# refer to :py:func:`~torchaudio.functional.forced_align`.
#
def align(emission, tokens):
    targets = torch.tensor([tokens], dtype=torch.int32, device=device)
    alignments, scores = F.forced_align(emission, targets, blank=0)

    alignments, scores = alignments[0], scores[0]  # remove batch dimension for simplicity
    scores = scores.exp()  # convert back to probability
    return alignments, scores


aligned_tokens, alignment_scores = align(emission, tokenized_transcript)

######################################################################
# Now let's look at the output.
for i, (ali, score) in enumerate(zip(aligned_tokens, alignment_scores)):
    print(f"{i:3d}:\t{ali:2d} [{LABELS[ali]}], {score:.2f}")

######################################################################
#
# .. note::
#
#    The alignment is expressed in the frame coordinate of the emission,
#    which is different from the original waveform.
#
# It contains blank tokens and repeated tokens. The following is the
# interpretation of the non-blank tokens.
#
# .. code-block::
#
#    31:     0 [-], 1.00
#    32:     2 [i], 1.00 "i" starts and ends
#    33:     0 [-], 1.00
#    34:     0 [-], 1.00
#    35:    15 [h], 1.00 "h" starts
#    36:    15 [h], 0.93 "h" ends
#    37:     1 [a], 1.00 "a" starts and ends
#    38:     0 [-], 0.96
#    39:     0 [-], 1.00
#    40:     0 [-], 1.00
#    41:    13 [d], 1.00 "d" starts and ends
#    42:     0 [-], 1.00
#
# .. note::
#
#    When the same token occurs after blank tokens, it is not treated as
#    a repeat, but as a new occurrence.
#
#    .. code-block::
#
#       a a a b -> a b
#       a - - b -> a b
#       a a - b -> a b
#       a - a b -> a a b
#         ^^^       ^^^
#
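The collapse rule described in the note above (merge consecutive repeats, drop blanks, but keep repeats that are separated by a blank) can be written in a few lines. A plain-Python sketch for illustration; torchaudio's own merging is done by `torchaudio.functional.merge_tokens`, used in the next step:

def collapse_ctc(tokens, blank=0):
    """Collapse a frame-level CTC alignment into the output token sequence."""
    out = []
    prev = blank
    for t in tokens:
        # Skip blanks, and skip a token that merely repeats the previous frame.
        if t != blank and t != prev:
            out.append(t)
        prev = t
    return out

print(collapse_ctc([1, 1, 1, 2]))  # a a a b -> [1, 2]
print(collapse_ctc([1, 0, 0, 2]))  # a - - b -> [1, 2]
print(collapse_ctc([1, 0, 1, 2]))  # a - a b -> [1, 1, 2]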
######################################################################
# Token-level alignments
# ~~~~~~~~~~~~~~~~~~~~~~
#
# Next step is to resolve the repetation, so that each alignment does
# not depend on previous alignments.
# :py:func:`torchaudio.functional.merge_tokens` computes the
# :py:class:`~torchaudio.functional.TokenSpan` object, which represents
# which token from the transcript is present at what time span.
######################################################################
#
token_spans
=
F
.
merge_tokens
(
aligned_tokens
,
alignment_scores
)
print
(
"Token
\t
Time
\t
Score"
)
for
s
in
token_spans
:
print
(
f
"
{
LABELS
[
s
.
token
]
}
\t
[
{
s
.
start
:
3
d
}
,
{
s
.
end
:
3
d
}
)
\t
{
s
.
score
:.
2
f
}
"
)
######################################################################
# Word-level alignments
# ~~~~~~~~~~~~~~~~~~~~~
#
# Now let’s group the token-level alignments into word-level alignments.
def unflatten(list_, lengths):
    assert len(list_) == sum(lengths)
    i = 0
    ret = []
    for l in lengths:
        ret.append(list_[i : i + l])
        i += l
    return ret


word_spans = unflatten(token_spans, [len(word) for word in TRANSCRIPT])
######################################################################
# Audio previews
# ~~~~~~~~~~~~~~
#
# Compute average score weighted by the span length
def _score(spans):
    return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)


def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / num_frames
    x0 = int(ratio * spans[0].start)
    x1 = int(ratio * spans[-1].end)
    print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
    segment = waveform[:, x0:x1]
    return IPython.display.Audio(segment.numpy(), rate=sample_rate)


num_frames = emission.size(1)
######################################################################
# Generate the audio for each segment
print(TRANSCRIPT)
IPython.display.Audio(SPEECH_FILE)

######################################################################
#
preview_word(waveform, word_spans[0], num_frames, TRANSCRIPT[0])

######################################################################
#
preview_word(waveform, word_spans[1], num_frames, TRANSCRIPT[1])

######################################################################
#
preview_word(waveform, word_spans[2], num_frames, TRANSCRIPT[2])

######################################################################
#
preview_word(waveform, word_spans[3], num_frames, TRANSCRIPT[3])

######################################################################
#
preview_word(waveform, word_spans[4], num_frames, TRANSCRIPT[4])

######################################################################
#
preview_word(waveform, word_spans[5], num_frames, TRANSCRIPT[5])

######################################################################
#
preview_word(waveform, word_spans[6], num_frames, TRANSCRIPT[6])

######################################################################
#
preview_word(waveform, word_spans[7], num_frames, TRANSCRIPT[7])

######################################################################
#
preview_word(waveform, word_spans[8], num_frames, TRANSCRIPT[8])
######################################################################
# Visualization
# ~~~~~~~~~~~~~
#
# Now let's look at the alignment result and segment the original
# speech into words.
def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / emission.size(1) / sample_rate

    fig, axes = plt.subplots(2, 1)
    axes[0].imshow(emission[0].detach().cpu().T, aspect="auto")
    axes[0].set_title("Emission")
    axes[0].set_xticks([])

    axes[1].specgram(waveform[0], Fs=sample_rate)
    for t_spans, chars in zip(token_spans, transcript):
        t0, t1 = t_spans[0].start + 0.1, t_spans[-1].end - 0.1
        axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white")
        axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white")
        axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False)

        for span, char in zip(t_spans, chars):
            t0 = span.start * ratio
            axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False)

    axes[1].set_xlabel("time [second]")
    axes[1].set_xlim([0, None])
    fig.tight_layout()
######################################################################
#
plot_alignments(waveform, word_spans, emission, TRANSCRIPT)
######################################################################
#
# Inconsistent treatment of ``blank`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# When splitting the token-level alignments into words, you will
# notice that some blank tokens are treated differently, and this makes
# the interpretation of the result somewhat ambiguous.
#
# This is easy to see when we plot the scores. The following figure
# shows word regions and non-word regions, with the frame-level scores
# of non-blank tokens.
def plot_scores(word_spans, scores):
    fig, ax = plt.subplots()
    span_xs, span_hs = [], []
    ax.axvspan(word_spans[0][0].start - 0.05, word_spans[-1][-1].end + 0.05, facecolor="paleturquoise", edgecolor="none", zorder=-1)
    for t_span in word_spans:
        for span in t_span:
            for t in range(span.start, span.end):
                span_xs.append(t + 0.5)
                span_hs.append(scores[t].item())
            ax.annotate(LABELS[span.token], (span.start, -0.07))
        ax.axvspan(t_span[0].start - 0.05, t_span[-1].end + 0.05, facecolor="mistyrose", edgecolor="none", zorder=-1)
    ax.bar(span_xs, span_hs, color="lightsalmon", edgecolor="coral")
    ax.set_title("Frame-level scores and word segments")
    ax.set_ylim(-0.1, None)
    ax.grid(True, axis="y")
    ax.axhline(0, color="black")
    fig.tight_layout()


plot_scores(word_spans, alignment_scores)
######################################################################
# In this plot, the blank tokens are those highlighted area without
# vertical bar.
# You can see that there are blank tokens which are interpreted as
# part of a word (highlighted red), while the others (highlighted blue)
# are not.
#
# One reason for this is that the model was trained without a
# label for the word boundary. The blank tokens are treated not just
# as repetition but also as silence between words.
#
# But then, a question arises. Should frames immediately after or
# near the end of a word be silent or repeat?
#
# In the above example, if you go back to the previous plot of
# spectrogram and word regions, you see that after "y" in "curiosity",
# there is still some activities in multiple frequency buckets.
#
# Would it be more accurate if that frame was included in the word?
#
# Unfortunately, CTC does not provide a comprehensive solution to this.
# Models trained with CTC are known to exhibit "peaky" response,
# that is, they tend to spike for an occurrence of a label, but the
# spike does not last for the duration of the label.
# (Note: Pre-trained Wav2Vec2 models tend to spike at the beginning of
# label occurrences, but this is not always the case.)
#
# :cite:`zeyer2021does` has an in-depth analysis of the peaky behavior of
# CTC.
# We encourage those who are interested in understanding more to refer
# to the paper.
# The following is a quote from the paper, which is the exact issue we
# are facing here.
#
# *Peaky behavior can be problematic in certain cases,*
# *e.g. when an application requires to not use the blank label,*
# *e.g. to get meaningful time accurate alignments of phonemes*
# *to a transcription.*
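######################################################################
# As a rough way of quantifying this ambiguity on our example, the
# following sketch (not part of the alignment pipeline; it only assumes
# the ``word_spans`` and ``aligned_tokens`` computed above) counts how
# many frames inside the detected word regions carry the blank token:
#
# .. code-block:: python
#
#    def blank_ratio(word_spans, aligned_tokens, blank=0):
#        total, blanks = 0, 0
#        for spans in word_spans:
#            for frame in range(spans[0].start, spans[-1].end):
#                total += 1
#                if aligned_tokens[frame] == blank:
#                    blanks += 1
#        return blanks / total
#
#    print(f"{blank_ratio(word_spans, aligned_tokens):.2%} of in-word frames are blank")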
######################################################################
# Advanced: Handling transcripts with ``<star>`` token
# ----------------------------------------------------
#
# Now let’s look at how we can improve alignment quality when the transcript
# is partially missing, using the ``<star>`` token, which is capable of
# modeling any token.
#
# Here we use the same English example as used above. But we remove the
# beginning text ``“i had that curiosity beside me at”`` from the transcript.
# Aligning audio with such transcript results in wrong alignments of the
# existing word “this”. However, this issue can be mitigated by using the
# ``<star>`` token to model the missing text.
#
######################################################################
# First, we extend the dictionary to include the ``<star>`` token.
DICTIONARY["*"] = len(DICTIONARY)
######################################################################
# Next, we extend the emission tensor with the extra dimension
# corresponding to the ``<star>`` token.
#
star_dim = torch.zeros((1, emission.size(1), 1), device=emission.device, dtype=emission.dtype)
emission = torch.cat((emission, star_dim), 2)

assert len(DICTIONARY) == emission.shape[2]

plot_emission(emission[0])
######################################################################
# The following function combines all the processes, and computes
# word segments from the emission in one go.
def compute_alignments(emission, transcript, dictionary):
    tokens = [dictionary[char] for word in transcript for char in word]
    alignment, scores = align(emission, tokens)
    token_spans = F.merge_tokens(alignment, scores)
    word_spans = unflatten(token_spans, [len(word) for word in transcript])
    return word_spans
######################################################################
# Full Transcript
# ~~~~~~~~~~~~~~~
word_spans = compute_alignments(emission, TRANSCRIPT, DICTIONARY)
plot_alignments(waveform, word_spans, emission, TRANSCRIPT)
######################################################################
# Partial Transcript with ``<star>`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Now we replace the first part of the transcript with the ``<star>`` token.
transcript = "* this moment".split()
word_spans = compute_alignments(emission, transcript, DICTIONARY)
plot_alignments(waveform, word_spans, emission, transcript)
######################################################################
#
preview_word(waveform, word_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, word_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, word_spans[2], num_frames, transcript[2])
######################################################################
# Partial Transcript without ``<star>`` token
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# As a comparison, the following aligns the partial transcript
# without using ``<star>`` token.
# It demonstrates the effect of ``<star>`` token for dealing with deletion errors.
transcript = "this moment".split()
word_spans = compute_alignments(emission, transcript, DICTIONARY)
plot_alignments(waveform, word_spans, emission, transcript)
######################################################################
# Conclusion
# ----------
#
# In this tutorial, we looked at how to use torchaudio’s forced alignment
# API to align and segment speech files, and demonstrated one advanced usage:
# How introducing a ``<star>`` token could improve alignment accuracy when
# transcription errors exist.
#
######################################################################
# Acknowledgement
# ---------------
#
# Thanks to `Vineel Pratap <vineelkpratap@meta.com>`__ and `Zhaoheng
# Ni <zni@meta.com>`__ for developing and open-sourcing the
# forced aligner API.
examples/tutorials/device_asr.py
View file @
ffeba11a
...
@@ -7,26 +7,23 @@ Device ASR with Emformer RNN-T
 This tutorial shows how to use Emformer RNN-T and streaming API
 to perform speech recognition on a streaming device input, i.e. microphone
 on laptop.
-.. note::
-   This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece.
-   There are multiple ways to install FFmpeg libraries.
-   If you are using Anaconda Python distribution,
-   ``conda install 'ffmpeg<4.4'`` will install
-   the required FFmpeg libraries.
-   You can install SentencePiece by running ``pip install sentencepiece``.
-.. note::
-   This tutorial was tested on MacBook Pro and Dynabook with Windows 10.
-   This tutorial does NOT work on Google Colab because the server running
-   this tutorial does not have a microphone that you can talk to.
 """
+######################################################################
+#
+# .. note::
+#
+#    This tutorial requires FFmpeg libraries.
+#    Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
+#    the detail.
+#
+# .. note::
+#
+#    This tutorial was tested on MacBook Pro and Dynabook with Windows 10.
+#
+#    This tutorial does NOT work on Google Colab because the server running
+#    this tutorial does not have a microphone that you can talk to.
 ######################################################################
 # 1. Overview
 # -----------
...
examples/tutorials/effector_tutorial.py
0 → 100644
View file @
ffeba11a
"""
AudioEffector Usages
====================
**Author**: `Moto Hira <moto@meta.com>`__
This tutorial shows how to use :py:class:`torchaudio.io.AudioEffector` to
apply various effects and codecs to waveform tensor.
"""
######################################################################
#
# .. note::
#
# This tutorial requires FFmpeg libraries.
# Please refer to :ref:`FFmpeg dependency <ffmpeg_dependency>` for
# the detail.
#
######################################################################
# Overview
# --------
#
# :py:class:`~torchaudio.io.AudioEffector` combines in-memory encoding,
# decoding and filtering that are provided by
# :py:class:`~torchaudio.io.StreamWriter` and
# :py:class:`~torchaudio.io.StreamReader`.
#
# The following figure illustrates the process.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/AudioEffector.png
#
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#
from torchaudio.io import AudioEffector, CodecConfig

import matplotlib.pyplot as plt
from IPython.display import Audio

######################################################################
#
for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items():
    print(k, v)
######################################################################
# Usage
# -----
#
# To use ``AudioEffector``, instantiate it with ``effect`` and
# ``format``, then either pass the waveform to
# :py:meth:`~torchaudio.io.AudioEffector.apply` or
# :py:meth:`~torchaudio.io.AudioEffector.stream` method.
#
# .. code:: python
#
# effector = AudioEffector(effect=..., format=...,)
#
# # Apply at once
# applied = effector.apply(waveform, sample_rate)
#
# The ``apply`` method applies the effect and codec to the entire waveform at
# once. So if the input waveform is long and memory consumption is an
# issue, one can use the ``stream`` method to process it chunk by chunk.
#
# .. code:: python
#
#    # Apply chunk by chunk
#    for applied_chunk in effector.stream(waveform, sample_rate):
#        ...
#
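######################################################################
# As a rough sketch (assuming the applied effect and codec do not change
# the number of frames), the streamed chunks can be re-assembled into a
# result equivalent to ``apply`` by concatenating along the time axis:
#
# .. code:: python
#
#    # chunks are (time, channel) Tensors when the input is channels-last
#    chunks = list(effector.stream(waveform, sample_rate))
#    applied = torch.cat(chunks, dim=0)
#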
######################################################################
# Example
# -------
#
src = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")

waveform, sr = torchaudio.load(src, channels_first=False)
######################################################################
# Gallery
# -------
#
def show(effect, *, stereo=False):
    wf = torch.cat([waveform] * 2, dim=1) if stereo else waveform
    figsize = (6.4, 2.1 if stereo else 1.2)

    effector = AudioEffector(effect=effect, pad_end=False)
    result = effector.apply(wf, int(sr))

    num_channels = result.size(1)

    f, ax = plt.subplots(num_channels, 1, squeeze=False, figsize=figsize, sharex=True)
    for i in range(num_channels):
        ax[i][0].specgram(result[:, i], Fs=sr)
    f.set_tight_layout(True)

    return Audio(result.numpy().T, rate=sr)


######################################################################
# Original
# --------
#
show(effect=None)
######################################################################
# Effects
# -------
#
######################################################################
# tempo
# ~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#atempo
show("atempo=0.7")

######################################################################
#
show("atempo=1.8")

######################################################################
# highpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#highpass
show("highpass=frequency=1500")

######################################################################
# lowpass
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#lowpass
show("lowpass=frequency=1000")

######################################################################
# allpass
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#allpass
show("allpass")

######################################################################
# bandpass
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#bandpass
show("bandpass=frequency=3000")

######################################################################
# bandreject
# ~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#bandreject
show("bandreject=frequency=3000")

######################################################################
# echo
# ~~~~
# https://ffmpeg.org/ffmpeg-filters.html#aecho
show("aecho=in_gain=0.8:out_gain=0.88:delays=6:decays=0.4")

######################################################################
#
show("aecho=in_gain=0.8:out_gain=0.88:delays=60:decays=0.4")

######################################################################
#
show("aecho=in_gain=0.8:out_gain=0.9:delays=1000:decays=0.3")

######################################################################
# chorus
# ~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#chorus
show("chorus=0.5:0.9:50|60|40:0.4|0.32|0.3:0.25|0.4|0.3:2|2.3|1.3")

######################################################################
# fft filter
# ~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#afftfilt
# fmt: off
show(
    "afftfilt="
    "real='re * (1-clip(b * (b/nb), 0, 1))':"
    "imag='im * (1-clip(b * (b/nb), 0, 1))'"
)

######################################################################
#
show(
    "afftfilt="
    "real='hypot(re,im) * sin(0)':"
    "imag='hypot(re,im) * cos(0)':"
    "win_size=512:"
    "overlap=0.75"
)

######################################################################
#
show(
    "afftfilt="
    "real='hypot(re,im) * cos(2 * 3.14 * (random(0) * 2-1))':"
    "imag='hypot(re,im) * sin(2 * 3.14 * (random(1) * 2-1))':"
    "win_size=128:"
    "overlap=0.8"
)
# fmt: on

######################################################################
# vibrato
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#vibrato
show("vibrato=f=10:d=0.8")

######################################################################
# tremolo
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#tremolo
show("tremolo=f=8:d=0.8")

######################################################################
# crystalizer
# ~~~~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#crystalizer
show("crystalizer")

######################################################################
# flanger
# ~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#flanger
show("flanger")

######################################################################
# phaser
# ~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#aphaser
show("aphaser")

######################################################################
# pulsator
# ~~~~~~~~
# https://ffmpeg.org/ffmpeg-filters.html#apulsator
show("apulsator", stereo=True)

######################################################################
# haas
# ~~~~
# https://ffmpeg.org/ffmpeg-filters.html#haas
show("haas")
######################################################################
# Codecs
# ------
#
def show_multi(configs):
    results = []
    for config in configs:
        effector = AudioEffector(**config)
        results.append(effector.apply(waveform, int(sr)))

    num_configs = len(configs)
    figsize = (6.4, 0.3 + num_configs * 0.9)
    f, axes = plt.subplots(num_configs, 1, figsize=figsize, sharex=True)
    for result, ax in zip(results, axes):
        ax.specgram(result[:, 0], Fs=sr)
    f.set_tight_layout(True)

    return [Audio(r.numpy().T, rate=sr) for r in results]
######################################################################
# ogg
# ~~~
#
results = show_multi(
    [
        {"format": "ogg"},
        {"format": "ogg", "encoder": "vorbis"},
        {"format": "ogg", "encoder": "opus"},
    ]
)

######################################################################
# ogg - default encoder (flac)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
results[0]

######################################################################
# ogg - vorbis
# ^^^^^^^^^^^^
#
results[1]

######################################################################
# ogg - opus
# ^^^^^^^^^^
#
results[2]
######################################################################
# mp3
# ~~~
# https://trac.ffmpeg.org/wiki/Encode/MP3
results = show_multi(
    [
        {"format": "mp3"},
        {"format": "mp3", "codec_config": CodecConfig(compression_level=1)},
        {"format": "mp3", "codec_config": CodecConfig(compression_level=9)},
        {"format": "mp3", "codec_config": CodecConfig(bit_rate=192_000)},
        {"format": "mp3", "codec_config": CodecConfig(bit_rate=8_000)},
        {"format": "mp3", "codec_config": CodecConfig(qscale=9)},
        {"format": "mp3", "codec_config": CodecConfig(qscale=1)},
    ]
)

######################################################################
# default
# ^^^^^^^
results[0]

######################################################################
# compression_level=1
# ^^^^^^^^^^^^^^^^^^^
results[1]

######################################################################
# compression_level=9
# ^^^^^^^^^^^^^^^^^^^
results[2]

######################################################################
# bit_rate=192k
# ^^^^^^^^^^^^^
results[3]

######################################################################
# bit_rate=8k
# ^^^^^^^^^^^
results[4]

######################################################################
# qscale=9
# ^^^^^^^^
results[5]

######################################################################
# qscale=1
# ^^^^^^^^
results[6]
######################################################################
#
# Tag: :obj:`torchaudio.io`
examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
0 → 100644
View file @
ffeba11a
"""
Forced alignment for multilingual data
======================================
**Authors**: `Xiaohui Zhang <xiaohuizhang@meta.com>`__, `Moto Hira <moto@meta.com>`__.
This tutorial shows how to align transcript to speech for non-English languages.
The process of aligning non-English (normalized) transcript is identical to aligning
English (normalized) transcript, and the process for English is covered in detail in
`CTC forced alignment tutorial <./ctc_forced_alignment_api_tutorial.html>`__.
In this tutorial, we use TorchAudio's high-level API,
:py:class:`torchaudio.pipelines.Wav2Vec2FABundle`, which packages the pre-trained
model, tokenizer and aligner, to perform the forced alignment with less code.
"""
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

######################################################################
#
from typing import List

import IPython
import matplotlib.pyplot as plt
######################################################################
# Creating the pipeline
# ---------------------
#
# First, we instantiate the model and pre/post-processing pipelines.
#
# The following diagram illustrates the process of alignment.
#
# .. image:: https://download.pytorch.org/torchaudio/doc-assets/pipelines-wav2vec2fabundle.png
#
# The waveform is passed to an acoustic model, which produces a sequence of
# probability distributions over tokens.
# The transcript is passed to the tokenizer, which converts the transcript to
# a sequence of tokens.
# The aligner takes the results from the acoustic model and the tokenizer and generates
# timestamps for each token.
#
# .. note::
#
# This process expects that the input transcript is already normalized.
# The process of normalization, which involves romanization of non-English
# languages, is language-dependent, so it is not covered in this tutorial,
# but we will briefly look into it.
#
# The acoustic model and the tokenizer must use the same set of tokens.
# To facilitate the creation of matching processors,
# :py:class:`~torchaudio.pipelines.Wav2Vec2FABundle` associates a
# pre-trained acoustic model and a tokenizer.
# :py:data:`torchaudio.pipelines.MMS_FA` is one such instance.
#
# The following code instantiates a pre-trained acoustic model, a tokenizer
# which uses the same set of tokens as the model, and an aligner.
#
from torchaudio.pipelines import MMS_FA as bundle

model = bundle.get_model()
model.to(device)

tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()
######################################################################
# .. note::
#
# The model instantiated by :py:data:`~torchaudio.pipelines.MMS_FA`'s
# :py:meth:`~torchaudio.pipelines.Wav2Vec2FABundle.get_model`
# method by default includes the feature dimension for ``<star>`` token.
# You can disable this by passing ``with_star=False``.
#
######################################################################
# The acoustic model of :py:data:`~torchaudio.pipelines.MMS_FA` was
# created and open-sourced as part of the research project,
# `Scaling Speech Technology to 1,000+ Languages
# <https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/>`__.
# It was trained with 23,000 hours of audio from 1100+ languages.
#
# The tokenizer simply maps the normalized characters to integers.
# You can check the mapping as follows:
print(bundle.get_dict())
######################################################################
#
# The aligner internally uses :py:func:`torchaudio.functional.forced_align`
# and :py:func:`torchaudio.functional.merge_tokens` to infer the time
# stamps of the input tokens.
#
# The detail of the underlying mechanism is covered in
# `CTC forced alignment API tutorial <./ctc_forced_alignment_api_tutorial.html>`__,
# so please refer to it.
######################################################################
# We define a utility function that performs the forced alignment with
# the above model, the tokenizer and the aligner.
#
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    with torch.inference_mode():
        emission, _ = model(waveform.to(device))
        token_spans = aligner(emission[0], tokenizer(transcript))
    return emission, token_spans
######################################################################
# We also define utility functions for plotting the result and previewing
# the audio segments.
# Compute average score weighted by the span length
def _score(spans):
    return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)


def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / emission.size(1) / sample_rate

    fig, axes = plt.subplots(2, 1)
    axes[0].imshow(emission[0].detach().cpu().T, aspect="auto")
    axes[0].set_title("Emission")
    axes[0].set_xticks([])

    axes[1].specgram(waveform[0], Fs=sample_rate)
    for t_spans, chars in zip(token_spans, transcript):
        t0, t1 = t_spans[0].start, t_spans[-1].end
        axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white")
        axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white")
        axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False)

        for span, char in zip(t_spans, chars):
            t0 = span.start * ratio
            axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False)

    axes[1].set_xlabel("time [second]")
    fig.tight_layout()
######################################################################
#
def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / num_frames
    x0 = int(ratio * spans[0].start)
    x1 = int(ratio * spans[-1].end)
    print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
    segment = waveform[:, x0:x1]
    return IPython.display.Audio(segment.numpy(), rate=sample_rate)
######################################################################
# Normalizing the transcript
# --------------------------
#
# The transcripts passed to the pipeline must be normalized beforehand.
# The exact process of normalization depends on language.
#
# Languages that do not have explicit word boundaries
# (such as Chinese, Japanese and Korean) require segmentation first.
# There are dedicated tools for this, but let's say we have a segmented
# transcript.
#
# The first step of normalization is romanization.
# `uroman <https://github.com/isi-nlp/uroman>`__ is a tool that
# supports many languages.
#
# Here is a BASH command to romanize the input text file and write
# the output to another text file using ``uroman``.
#
# .. code-block:: bash
#
# $ echo "des événements d'actualité qui se sont produits durant l'année 1882" > text.txt
# $ uroman/bin/uroman.pl < text.txt > text_romanized.txt
# $ cat text_romanized.txt
#
# .. code-block:: text
#
# Cette page concerne des evenements d'actualite qui se sont produits durant l'annee 1882
#
# The next step is to remove non-alphabets and punctuations.
# The following snippet normalizes the romanized transcript.
#
# .. code-block:: python
#
# import re
#
#
# def normalize_uroman(text):
# text = text.lower()
# text = text.replace("’", "'")
# text = re.sub("([^a-z' ])", " ", text)
# text = re.sub(' +', ' ', text)
# return text.strip()
#
#
# with open("text_romanized.txt", "r") as f:
# for line in f:
# text_normalized = normalize_uroman(line)
# print(text_normalized)
#
# Running the script on the above example produces the following.
#
# .. code-block:: text
#
# cette page concerne des evenements d'actualite qui se sont produits durant l'annee
#
# Note that, in this example, since "1882" was not romanized by ``uroman``,
# it was removed in the normalization step.
# To avoid this, one needs to romanize numbers, but this is known to be a non-trivial task.
#
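######################################################################
# For instance, a number-to-words package such as ``num2words`` (used
# here only for illustration; it is not a dependency of this tutorial)
# could spell out numerals before romanization:
#
# .. code-block:: python
#
#    from num2words import num2words
#
#    print(num2words(1882, lang="fr"))  # mille huit cent quatre-vingt-deux
#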
######################################################################
# Aligning transcripts to speech
# ------------------------------
#
# Now we perform the forced alignment for multiple languages.
#
#
# German
# ~~~~~~
text_raw = "aber seit ich bei ihnen das brot hole"
text_normalized = "aber seit ich bei ihnen das brot hole"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/10349_8674_000087.flac"
waveform, sample_rate = torchaudio.load(
    url, frame_offset=int(0.5 * bundle.sample_rate), num_frames=int(2.5 * bundle.sample_rate)
)
######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
tokens = tokenizer(transcript)

emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
# Chinese
# ~~~~~~~
#
# Chinese is a character-based language, and there is no explicit word-level
# tokenization (separated by spaces) in its raw written form. In order to
# obtain word-level alignments, you need to first tokenize the transcripts
# at the word level using a word tokenizer like `“Stanford
# Tokenizer” <https://michelleful.github.io/code-blog/2015/09/10/parsing-chinese-with-stanford/>`__
# (a minimal sketch with an off-the-shelf segmenter is shown below).
# However, this is not needed if you only want character-level alignments.
#
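######################################################################
# A minimal segmentation sketch, assuming the third-party ``jieba``
# package (any word segmenter would do; it is not a dependency of this
# tutorial):
#
# .. code-block:: python
#
#    import jieba
#
#    words = list(jieba.cut("关服务高端产品仍处于供不应求的局面"))
#    print(" ".join(words))
#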
text_raw = "关 服务 高端 产品 仍 处于 供不应求 的 局面"
text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian"

######################################################################
#
url = "https://download.pytorch.org/torchaudio/tutorial-assets/mvdr/clean_speech.wav"
waveform, sample_rate = torchaudio.load(url)
waveform = waveform[0:1]

######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])

######################################################################
#
preview_word(waveform, token_spans[8], num_frames, transcript[8])
######################################################################
# Polish
# ~~~~~~
text_raw = "wtedy ujrzałem na jego brzuchu okrągłą czarną ranę"
text_normalized = "wtedy ujrzalem na jego brzuchu okragla czarna rane"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/5090_1447_000088.flac"
waveform, sample_rate = torchaudio.load(url, num_frames=int(4.5 * bundle.sample_rate))

######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])
######################################################################
# Portuguese
# ~~~~~~~~~~
text_raw = "na imensa extensão onde se esconde o inconsciente imortal"
text_normalized = "na imensa extensao onde se esconde o inconsciente imortal"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/6566_5323_000027.flac"
waveform, sample_rate = torchaudio.load(
    url, frame_offset=int(bundle.sample_rate), num_frames=int(4.6 * bundle.sample_rate)
)

######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])

######################################################################
#
preview_word(waveform, token_spans[6], num_frames, transcript[6])

######################################################################
#
preview_word(waveform, token_spans[7], num_frames, transcript[7])

######################################################################
#
preview_word(waveform, token_spans[8], num_frames, transcript[8])
######################################################################
# Italian
# ~~~~~~~
text_raw = "elle giacean per terra tutte quante"
text_normalized = "elle giacean per terra tutte quante"

url = "https://download.pytorch.org/torchaudio/tutorial-assets/642_529_000025.flac"
waveform, sample_rate = torchaudio.load(url, num_frames=int(4 * bundle.sample_rate))

######################################################################
#
assert sample_rate == bundle.sample_rate

######################################################################
#
transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

plot_alignments(waveform, token_spans, emission, transcript)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
IPython.display.Audio(waveform, rate=sample_rate)
######################################################################
#
preview_word(waveform, token_spans[0], num_frames, transcript[0])

######################################################################
#
preview_word(waveform, token_spans[1], num_frames, transcript[1])

######################################################################
#
preview_word(waveform, token_spans[2], num_frames, transcript[2])

######################################################################
#
preview_word(waveform, token_spans[3], num_frames, transcript[3])

######################################################################
#
preview_word(waveform, token_spans[4], num_frames, transcript[4])

######################################################################
#
preview_word(waveform, token_spans[5], num_frames, transcript[5])
######################################################################
# Conclusion
# ----------
#
# In this tutorial, we looked at how to use torchaudio’s forced alignment
# API and a Wav2Vec2 pre-trained multilingual acoustic model to align
# speech data to transcripts in five languages.
#
######################################################################
# Acknowledgement
# ---------------
#
# Thanks to `Vineel Pratap <vineelkpratap@meta.com>`__ and `Zhaoheng
# Ni <zni@meta.com>`__ for developing and open-sourcing the
# forced aligner API.
#