Commit 4130a52d authored by changhl

init model

parent eb6a18fd
Pipeline #1617 failed
# Generated 2024-08-27 from:
# /public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/hparams/train.yaml
# yamllint disable
# ############################################################################
# Model: Tacotron2
# Tokens: Raw characters (English text)
# Losses: Tacotron2 loss (mel-spectrogram MSE + gate BCE + guided attention)
# Training: LJSpeech
# Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
# ############################################################################
###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]
output_folder: ./results/tacotron2/1234
save_folder: /public/home/changhl/taco/logdir
train_log: ./results/tacotron2/1234/train_log.txt
epochs: 750
keep_checkpoint_interval: 50
###################################
# Progress Samples #
###################################
# Progress samples are used to monitor the progress
# of an ongoing training session by outputting samples
# of spectrograms, alignments, etc. at regular intervals
# Whether to enable progress samples
progress_samples: true
# The path where the samples will be stored
progress_sample_path: ./results/tacotron2/1234/samples
# The interval, in epochs. For instance, if it is set to 5,
# progress samples will be output every 5 epochs
progress_samples_interval: 1
# The sample size for raw batch samples saved in batch.pth
# (useful mostly for model debugging)
progress_batch_sample_size: 3
#################################
# Data files and pre-processing #
#################################
data_folder: /public/home/changhl/LJSpeech-1.1
# e.g., /localscratch/ljspeech
train_json: /public/home/changhl/taco/logdir/train.json
valid_json: /public/home/changhl/taco/logdir/valid.json
test_json: /public/home/changhl/taco/logdir/test.json
splits: [train, valid]
split_ratio: [90, 10]
skip_prep: false
# Use the original preprocessing from NVIDIA
# The text cleaners to be used (applicable to the NVIDIA preprocessing only)
text_cleaners: [english_cleaners]
################################
# Audio Parameters #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: false
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true
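# (at 22050 Hz, hop_length 256 corresponds to ~11.6 ms per frame; win_length equals n_fft)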
################################
# Optimization Hyperparameters #
################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 64 #minimum 2
num_workers: 8
mask_padding: true
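# Guided attention (roughly w[n,t] = 1 - exp(-(n/N - t/T)^2 / (2*sigma^2)))
# penalizes alignments far from the diagonal early in training. The penalty
# weight starts at guided_attention_weight, is halved every
# guided_attention_weight_half_life epochs, and is switched off entirely
# after guided_attention_hard_stop epochs.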
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0
train_dataloader_opts:
batch_size: 64
    drop_last: false
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
valid_dataloader_opts:
batch_size: 64
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
test_dataloader_opts:
batch_size: 64
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
################################
# Model Parameters and model #
################################
n_symbols: 148 # fixed by the symbol set used in text_to_sequence
symbols_embedding_dim: 512
# Encoder parameters
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
# Decoder parameters
# The number of frames in the target per decoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: false
# Attention parameters
attention_rnn_dim: 1024
attention_dim: 128
# Location Layer parameters
attention_location_n_filters: 32
attention_location_kernel_size: 31
# Mel-post processing network parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
sample_rate: 22050
hop_length: 256
win_length: 1024
n_fft: 1024
n_mels: 80
f_min: 0.0
f_max: 8000.0
power: 1
normalized: false
norm: slaney
mel_scale: slaney
compression: true
#model
model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2
mask_padding: true
n_mel_channels: 80
# symbols
n_symbols: 148
symbols_embedding_dim: 512
# encoder
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
# attention
attention_rnn_dim: 1024
attention_dim: 128
# attention location
attention_location_n_filters: 32
attention_location_kernel_size: 31
# decoder
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
# postnet
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
decoder_no_early_stopping: false
guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
initial_value: 50.0
half_life: 10.
criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
gate_loss_weight: 1.0
guided_attention_weight: 50.0
guided_attention_sigma: 0.2
guided_attention_scheduler: *id001
guided_attention_hard_stop: 50
modules:
model: *id002
opt_class: !name:torch.optim.Adam
lr: 0.001
weight_decay: 0.000006
#epoch object
epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
limit: 750
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: ./results/tacotron2/1234/train_log.txt
#annealing_function
lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler
intervals:
- steps: 6000
lr: 0.0005
- steps: 8000
lr: 0.0003
- steps: 10000
lr: 0.0001
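# i.e. the learning rate drops to 5e-4 after 6k optimizer steps,
# 3e-4 after 8k steps, and 1e-4 after 10k steps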
#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: /public/home/changhl/taco/logdir
recoverables:
model: *id002
counter: *id003
scheduler: *id004
progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
output_path: ./results/tacotron2/1234/samples
batch_sample_size: 3
formats:
raw_batch: raw
# -*- coding: utf-8 -*-
"""
Recipe for training the Tacotron Text-To-Speech model, an end-to-end
neural text-to-speech (TTS) system
To run this recipe, do the following:
# python train.py --device=cuda:0 --max_grad_norm=1.0 --data_folder=/your_folder/LJSpeech-1.1 hparams/train.yaml
To infer, simply load the saved model and call
saved_model.infer(text_sequence, len(text_sequence))
where text_sequence is the output of the text_to_sequence function
(from speechbrain.utils.text_to_sequence import text_to_sequence)
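
For example, a minimal inference sketch (assumes `model` is the trained
Tacotron2 instance restored from a checkpoint; the names are illustrative):

    import torch
    from speechbrain.utils.text_to_sequence import text_to_sequence

    seq = torch.LongTensor(text_to_sequence("Hello world", ["english_cleaners"]))
    mel_out, mel_lengths, alignments = model.infer(
        seq.unsqueeze(0), torch.LongTensor([len(seq)])
    )
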
Authors
* Georges Abous-Rjeili 2021
* Artem Ploujnikov 2021
* Yingzhi Wang 2022
"""
import logging
import sys
import torch
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
from speechbrain.utils.data_utils import scalarize
from speechbrain.utils.text_to_sequence import text_to_sequence
logger = logging.getLogger(__name__)
class Tacotron2Brain(sb.Brain):
"""The Brain implementation for Tacotron2"""
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and backend is ddp and initializes statistics
"""
self.hparams.progress_sample_logger.reset()
self.last_epoch = 0
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def compute_forward(self, batch, stage):
"""Computes the forward pass
Arguments
---------
        batch: tuple
a single batch
stage: speechbrain.Stage
the training stage
Returns
-------
the model output
"""
effective_batch = self.batch_to_device(batch)
inputs, y, num_items, _, _ = effective_batch
_, input_lengths, _, _, _ = inputs
max_input_length = input_lengths.max().item()
return self.modules.model(inputs, alignments_dim=max_input_length)
def on_fit_batch_end(self, batch, outputs, loss, should_step):
"""At the end of the optimizer step, apply noam annealing."""
if should_step:
self.hparams.lr_annealing(self.optimizer)
def compute_objectives(self, predictions, batch, stage):
"""Computes the loss given the predicted and targeted outputs.
Arguments
---------
        predictions : tuple
            The model-generated spectrograms and other outputs from `compute_forward`.
        batch : tuple
            The collated batch tuple containing all the relevant tensors.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss : torch.Tensor
A one-element tensor used for backpropagating the gradient.
"""
effective_batch = self.batch_to_device(batch)
# Hold on to the batch for the inference sample. This is needed because
# the inference sample is run from on_stage_end only, where
# batch information is not available
self.last_batch = effective_batch
# Hold on to a sample (for logging)
self._remember_sample(effective_batch, predictions)
# Compute the loss
loss = self._compute_loss(predictions, effective_batch, stage)
return loss
def _compute_loss(self, predictions, batch, stage):
"""Computes the value of the loss function and updates stats
Arguments
---------
predictions: tuple
model predictions
        batch: tuple
            Inputs for this training iteration.
stage: sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss: torch.Tensor
the loss value
"""
inputs, targets, num_items, labels, wavs = batch
text_padded, input_lengths, _, max_len, output_lengths = inputs
loss_stats = self.hparams.criterion(
predictions, targets, input_lengths, output_lengths, self.last_epoch
)
self.last_loss_stats[stage] = scalarize(loss_stats)
return loss_stats.loss
def _remember_sample(self, batch, predictions):
"""Remembers samples of spectrograms and the batch for logging purposes
Arguments
---------
batch: tuple
a training batch
predictions: tuple
predictions (raw output of the Tacotron model)
"""
inputs, targets, num_items, labels, wavs = batch
text_padded, input_lengths, _, max_len, output_lengths = inputs
mel_target, _ = targets
mel_out, mel_out_postnet, gate_out, alignments = predictions
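        # Normalize the first item's alignment matrix to [0, 1] and flip it
        # vertically so it renders naturally as an image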
alignments_max = (
alignments[0]
.max(dim=-1)
.values.max(dim=-1)
.values.unsqueeze(-1)
.unsqueeze(-1)
)
alignments_output = alignments[0].T.flip(dims=(1,)) / alignments_max
self.hparams.progress_sample_logger.remember(
target=self._get_spectrogram_sample(mel_target),
output=self._get_spectrogram_sample(mel_out),
output_postnet=self._get_spectrogram_sample(mel_out_postnet),
alignments=alignments_output,
raw_batch=self.hparams.progress_sample_logger.get_batch_sample(
{
"text_padded": text_padded,
"input_lengths": input_lengths,
"mel_target": mel_target,
"mel_out": mel_out,
"mel_out_postnet": mel_out_postnet,
"max_len": max_len,
"output_lengths": output_lengths,
"gate_out": gate_out,
"alignments": alignments,
"labels": labels,
"wavs": wavs,
}
),
)
def batch_to_device(self, batch):
"""Transfers the batch to the target device
Arguments
---------
batch: tuple
the batch to use
Returns
-------
batch: tuple
the batch on the correct device
"""
(
text_padded,
input_lengths,
mel_padded,
gate_padded,
output_lengths,
len_x,
labels,
wavs,
) = batch
text_padded = text_padded.to(self.device, non_blocking=True).long()
input_lengths = input_lengths.to(self.device, non_blocking=True).long()
max_len = torch.max(input_lengths.data).item()
mel_padded = mel_padded.to(self.device, non_blocking=True).float()
gate_padded = gate_padded.to(self.device, non_blocking=True).float()
output_lengths = output_lengths.to(
self.device, non_blocking=True
).long()
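        # Repack into model inputs (x) and loss targets (y)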
x = (text_padded, input_lengths, mel_padded, max_len, output_lengths)
y = (mel_padded, gate_padded)
len_x = torch.sum(output_lengths)
return (x, y, len_x, labels, wavs)
def _get_spectrogram_sample(self, raw):
"""Converts a raw spectrogram to one that can be saved as an image
sample = sqrt(exp(raw))
Arguments
---------
raw: torch.Tensor
the raw spectrogram (as used in the model)
Returns
-------
sample: torch.Tensor
the spectrogram, for image saving purposes
"""
sample = raw[0]
return torch.sqrt(torch.exp(sample))
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of an epoch.
Arguments
---------
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
stage_loss : float
The average loss for all of the data processed in this stage.
        epoch : int
            The currently-starting epoch. This is passed as `None`
            during the test stage.
"""
        # Store the train loss until the validation stage.
        # At the end of validation, we can write the full summary.
if stage == sb.Stage.VALID:
# Update learning rate
lr = self.optimizer.param_groups[-1]["lr"]
self.last_epoch = epoch
# The train_logger writes a summary to stdout and to the logfile.
            self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr": lr},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
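            # The predicate below marks for deletion only checkpoints whose
            # epoch is not a multiple of keep_checkpoint_interval, so periodic
            # snapshots are preserved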
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
output_progress_sample = (
self.hparams.progress_samples
and epoch % self.hparams.progress_samples_interval == 0
)
if output_progress_sample:
self.run_inference_sample()
self.hparams.progress_sample_logger.save(epoch)
# We also write statistics about test data to stdout and to the logfile.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.progress_samples:
self.run_inference_sample()
self.hparams.progress_sample_logger.save("test")
def run_inference_sample(self):
"""Produces a sample in inference mode. This is called when producing
samples and can be useful because"""
if self.last_batch is None:
return
inputs, _, _, _, _ = self.last_batch
text_padded, input_lengths, _, _, _ = inputs
mel_out, _, _ = self.hparams.model.infer(
text_padded[:1], input_lengths[:1]
)
self.hparams.progress_sample_logger.remember(
inference_mel_out=self._get_spectrogram_sample(mel_out)
)
def dataio_prepare(hparams):
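    """Creates the train/valid/test datasets and the dynamic pipeline that
    converts each (wav, label) pair into a (text_sequence, mel, text_length)
    triple exposed under the "mel_text_pair" key."""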
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav", "label")
@sb.utils.data_pipeline.provides("mel_text_pair")
def audio_pipeline(wav, label):
text_seq = torch.IntTensor(
text_to_sequence(label, hparams["text_cleaners"])
)
audio = sb.dataio.dataio.read_audio(wav)
mel = hparams["mel_spectogram"](audio=audio)
len_text = len(text_seq)
return text_seq, mel, len_text
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["mel_text_pair", "wav", "label"],
)
return datasets
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# create ddp_group with the right communication protocol
sb.utils.distributed.ddp_init_group(run_opts)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
datasets = dataio_prepare(hparams)
# Brain class initialization
tacotron2_brain = Tacotron2Brain(
modules=hparams["modules"],
opt_class=hparams["opt_class"],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
# Training
tacotron2_brain.fit(
tacotron2_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Test
if "test" in datasets:
tacotron2_brain.evaluate(
datasets["test"],
test_loader_kwargs=hparams["test_dataloader_opts"],
)
# ################################################
# Basic training parameters for a diffwave vocoder
#
# Author:
# * Yingzhi Wang 2022
# ################################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
data_folder: !PLACEHOLDER
output_folder: !ref ./results/diffwave/<seed>
save_folder: !ref <output_folder>/save
progress_sample_path: !ref <output_folder>/samples
train_log: !ref <output_folder>/train_log.txt
progress_samples_interval: 10
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid"]
split_ratio: [90, 10]
skip_prep: False
# The train logger writes training statistics to a file, as well as stdout.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
keep_checkpoint_interval: 100
# length (in samples) of the audio segments used for conditional training
segment_size: 15872
# Training Parameters
sample_rate: 22050
number_of_epochs: 500
batch_size: 16
num_workers: 8
lr: 0.0002
# diffusion parameters
train_timesteps: 50
beta_start: 0.0001
beta_end: 0.05
fast_sampling: True
fast_sampling_noise_schedule: [0.0001, 0.001, 0.01, 0.05, 0.2, 0.5]
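# With fast_sampling, inference uses the short noise schedule above instead of
# all train_timesteps diffusion steps, trading a little quality for much
# faster waveform generation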
loss_l2_steps: 0
adam_beta1: 0.95
adam_beta2: 0.999
adam_weight_decay: 0.000001
adam_epsilon: 0.00000001
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: False
num_workers: !ref <num_workers>
valid_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
test_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
use_tensorboard: False
tensorboard_logs: !ref <output_folder>/logs/
residual_layers: 30
residual_channels: 64
dilation_cycle_length: 10
unconditional: False
# Spectrogram Parameters
spec_n_fft: 1024
spec_f_min: 0
spec_f_max: 8000
mel_normalized: False
spec_n_mels: 80
spec_power: 1
spec_hop_length: 256
spec_win_length: 1024
spec_norm: "slaney"
spec_mel_scale: "slaney"
dynamic_range_compression: True
# Feature extraction
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
sample_rate: !ref <sample_rate>
hop_length: !ref <spec_hop_length>
win_length: !ref <spec_win_length>
n_fft: !ref <spec_n_fft>
n_mels: !ref <spec_n_mels>
f_min: !ref <spec_f_min>
f_max: !ref <spec_f_max>
power: !ref <spec_power>
normalized: !ref <mel_normalized>
norm: !ref <spec_norm>
mel_scale: !ref <spec_mel_scale>
compression: !ref <dynamic_range_compression>
compute_cost: !new:speechbrain.nnet.schedulers.ScheduledLoss
schedule:
- loss_fn: !name:speechbrain.nnet.losses.mse_loss
steps: !ref <loss_l2_steps>
- loss_fn: !name:speechbrain.nnet.losses.l1_loss
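# With loss_l2_steps: 0, the L1 loss is used from the very first step;
# raise it to warm up with MSE before switching to L1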
# The DiffWave model. To design a custom model, replace this `!new` call
# with a line pointing to a different class you've defined.
diffwave: !new:speechbrain.lobes.models.DiffWave.DiffWave
input_channels: !ref <spec_n_mels>
residual_layers: !ref <residual_layers>
residual_channels: !ref <residual_channels>
dilation_cycle_length: !ref <dilation_cycle_length>
total_steps: !ref <train_timesteps>
unconditional: !ref <unconditional>
noise: !new:speechbrain.nnet.diffusion.GaussianNoise
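# The diffusion wrapper drives training and sampling, calling the model's
# diffusion_forward at each denoising step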
diffusion: !new:speechbrain.lobes.models.DiffWave.DiffWaveDiffusion
model: !ref <diffwave.diffusion_forward>
beta_start: !ref <beta_start>
beta_end: !ref <beta_end>
timesteps: !ref <train_timesteps>
noise: !ref <noise>
# The first object passed to the Brain class is this "Epoch Counter"
# which is saved by the Checkpointer so that training can be resumed
# if it gets interrupted at any point.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <number_of_epochs>
# Objects in "modules" dict will have their parameters moved to the correct
# device, as well as having train()/eval() called on them by the Brain class.
modules:
diffwave: !ref <diffwave>
diffusion: !ref <diffusion>
# This optimizer will be constructed by the Brain class after all parameters
# are moved to the correct device. Then it will be added to the checkpointer.
opt_class: !name:torch.optim.AdamW
lr: !ref <lr>
betas: !ref (<adam_beta1>, <adam_beta2>)
weight_decay: !ref <adam_weight_decay>
eps: !ref <adam_epsilon>
# Optional learning-rate annealing over training (disabled here).
# WarmCoolDecayLRSchedule warms the learning rate up, holds it, and then
# decays it; to enable it, uncomment the block below and define the
# referenced step counts.
# lr_annealing: !new:speechbrain.nnet.schedulers.WarmCoolDecayLRSchedule
# lr: !ref <lr>
# warmup: !ref <lr_warmup_steps>
# cooldown: !ref <lr_cooldown_steps>
# total_steps: !ref <lr_total_steps>
# This object is used for saving the state of training both so that it
# can be resumed if it gets interrupted, and also so that the best checkpoint
# can be later loaded for evaluation or inference.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
diffwave: !ref <diffwave>
counter: !ref <epoch_counter>
# K-means (Quantization)
This folder contains recipes for training a K-means clustering model on the LJSpeech dataset.
The model quantizes self-supervised representations into discrete tokens, which can then be used as discrete audio input for various tasks, including classification, ASR, and speech generation.
K-means models can be trained on features from HuBERT, WavLM, or wav2vec 2.0.
You can download LJSpeech at https://keithito.com/LJ-Speech-Dataset/
## Installing Extra Dependencies
Before proceeding, ensure you have installed the necessary additional dependencies. To do this, simply run the following command in your terminal:
```
pip install -r extra_requirements.txt
```
# How to run:
To configure the SSL model type and corresponding Hub in your YAML configuration file, follow these steps:
1. Locate the `model_config` section in your YAML file.
2. Modify the `ssl_model_type` field to specify one of the supported SSL models: `hubert`, `wavlm`, or `wav2vec2`.
3. Update the `ssl_hub` field with the specific name of the SSL Hub associated with your chosen model type.
Here are the supported SSL models along with their corresponding SSL Hubs:
```
ssl_model_type: hubert, wavlm, wav2vec2
ssl_hub:
- facebook/hubert-large-ll60k
- microsoft/wavlm-large
- facebook/wav2vec2-large
```
4. Set the output folder according to the experiments you are running (e.g., `output_folder: !ref results/LJSpeech/clustering/wavlm/<seed>`)
To initiate training using a specific SSL model, execute the following command:
```shell
python train.py hparams/train_discrete_ssl.yaml
```
This command will start the training process using the configurations specified in 'train_discrete_ssl.yaml'.
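If you prefer not to edit the YAML, the same fields can typically be overridden
from the command line (standard hyperpyyaml overrides; the values below are
illustrative):
```shell
python train.py hparams/train_discrete_ssl.yaml \
    --ssl_model_type=wavlm \
    --ssl_hub=microsoft/wavlm-large \
    --output_folder=results/LJSpeech/clustering/wavlm/1234
```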
# Results
The checkpoints can be found at [this](https://huggingface.co/speechbrain/SSL_Quantization) HuggingFace repository.
# **About SpeechBrain**
- Website: https://speechbrain.github.io/
- Code: https://github.com/speechbrain/speechbrain/
- HuggingFace: https://huggingface.co/speechbrain/
# **Citing SpeechBrain**
Please cite SpeechBrain if you use it for your research or business.
```bibtex
@misc{ravanelli2024opensourceconversationalaispeechbrain,
title={Open-Source Conversational AI with SpeechBrain 1.0},
author={Mirco Ravanelli and Titouan Parcollet and Adel Moumen and Sylvain de Langen and Cem Subakan and Peter Plantinga and Yingzhi Wang and Pooneh Mousavi and Luca Della Libera and Artem Ploujnikov and Francesco Paissan and Davide Borra and Salah Zaiem and Zeyu Zhao and Shucong Zhang and Georgios Karakasidis and Sung-Lin Yeh and Pierre Champion and Aku Rouhe and Rudolf Braun and Florian Mai and Juan Zuluaga-Gomez and Seyed Mahed Mousavi and Andreas Nautsch and Xuechen Liu and Sangeet Sagar and Jarod Duret and Salima Mdhaffar and Gaelle Laperriere and Mickael Rouvier and Renato De Mori and Yannick Esteve},
year={2024},
eprint={2407.00463},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2407.00463},
}
@misc{speechbrain,
title={{SpeechBrain}: A General-Purpose Speech Toolkit},
author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
year={2021},
eprint={2106.04624},
archivePrefix={arXiv},
primaryClass={eess.AS},
note={arXiv:2106.04624}
}
```
../ljspeech_prepare.py
\ No newline at end of file
This diff is collapsed.