Commit 4130a52d authored by changhl

init model

parent eb6a18fd
Pipeline #1617 failed
# Generated 2024-08-27 from:
# /public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/hparams/train.yaml
# yamllint disable
# ############################################################################
# Model: Tacotron2
# Tokens: Raw characters (English text)
# Losses: Tacotron2 loss (mel-spectrogram MSE + gate BCE + guided attention)
# Training: LJSpeech
# Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
# ############################################################################
###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]
output_folder: ./results/tacotron2/1234
save_folder: /public/home/changhl/taco/logdir
train_log: ./results/tacotron2/1234/train_log.txt
epochs: 750
keep_checkpoint_interval: 50
###################################
# Progress Samples #
###################################
# Progress samples are used to monitor the progress
# of an ongoing training session by outputting samples
# of spectrograms, alignments, etc. at regular intervals
# Whether to enable progress samples
progress_samples: true
# The path where the samples will be stored
progress_sample_path: ./results/tacotron2/1234/samples
# The interval, in epochs. For instance, if it is set to 5,
# progress samples will be output every 5 epochs
progress_samples_interval: 1
# The sample size for raw batch samples saved in batch.pth
# (useful mostly for model debugging)
progress_batch_sample_size: 3
#################################
# Data files and pre-processing #
#################################
data_folder: /public/home/changhl/LJSpeech-1.1
# e.g., /localscratch/ljspeech
train_json: /public/home/changhl/taco/logdir/train.json
valid_json: /public/home/changhl/taco/logdir/valid.json
test_json: /public/home/changhl/taco/logdir/test.json
splits: [train, valid]
split_ratio: [90, 10]
skip_prep: false
# Use the original preprocessing from NVIDIA
# The text cleaners to be used (applicable to the NVIDIA preprocessing only)
text_cleaners: [english_cleaners]
################################
# Audio Parameters #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: false
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true
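# (at 22050 Hz, hop_length 256 corresponds to ~11.6 ms per frame; win_length equals n_fft)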
################################
# Optimization Hyperparameters #
################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 64 #minimum 2
num_workers: 8
mask_padding: true
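# Guided attention (roughly w[n,t] = 1 - exp(-(n/N - t/T)^2 / (2*sigma^2)))
# penalizes alignments far from the diagonal early in training. The penalty
# weight starts at guided_attention_weight, is halved every
# guided_attention_weight_half_life epochs, and is switched off entirely
# after guided_attention_hard_stop epochs.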
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0
train_dataloader_opts:
batch_size: 64
    drop_last: false
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
valid_dataloader_opts:
batch_size: 64
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
test_dataloader_opts:
batch_size: 64
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
################################
# Model Parameters and model #
################################
n_symbols: 148 # fixed by the symbol set used in text_to_sequence
symbols_embedding_dim: 512
# Encoder parameters
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
# Decoder parameters
# The number of frames in the target per decoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: false
# Attention parameters
attention_rnn_dim: 1024
attention_dim: 128
# Location Layer parameters
attention_location_n_filters: 32
attention_location_kernel_size: 31
# Mel-post processing network parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
sample_rate: 22050
hop_length: 256
win_length: 1024
n_fft: 1024
n_mels: 80
f_min: 0.0
f_max: 8000.0
power: 1
normalized: false
norm: slaney
mel_scale: slaney
compression: true
#model
model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2
mask_padding: true
n_mel_channels: 80
# symbols
n_symbols: 148
symbols_embedding_dim: 512
# encoder
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
# attention
attention_rnn_dim: 1024
attention_dim: 128
# attention location
attention_location_n_filters: 32
attention_location_kernel_size: 31
# decoder
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
# postnet
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
decoder_no_early_stopping: false
guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
initial_value: 50.0
half_life: 10.
criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
gate_loss_weight: 1.0
guided_attention_weight: 50.0
guided_attention_sigma: 0.2
guided_attention_scheduler: *id001
guided_attention_hard_stop: 50
modules:
model: *id002
opt_class: !name:torch.optim.Adam
lr: 0.001
weight_decay: 0.000006
#epoch object
epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
limit: 750
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: ./results/tacotron2/1234/train_log.txt
#annealing_function
lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler
intervals:
- steps: 6000
lr: 0.0005
- steps: 8000
lr: 0.0003
- steps: 10000
lr: 0.0001
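# i.e. the learning rate drops to 5e-4 after 6k optimizer steps,
# 3e-4 after 8k steps, and 1e-4 after 10k steps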
#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: /public/home/changhl/taco/logdir
recoverables:
model: *id002
counter: *id003
scheduler: *id004
progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
output_path: ./results/tacotron2/1234/samples
batch_sample_size: 3
formats:
raw_batch: raw
# -*- coding: utf-8 -*-
"""
Recipe for training the Tacotron Text-To-Speech model, an end-to-end
neural text-to-speech (TTS) system
To run this recipe, do the following:
# python train.py --device=cuda:0 --max_grad_norm=1.0 --data_folder=/your_folder/LJSpeech-1.1 hparams/train.yaml
To infer, simply load the saved model and call
saved_model.infer(text_sequence, len(text_sequence))
where text_sequence is the output of the text_to_sequence function
(from speechbrain.utils.text_to_sequence import text_to_sequence)
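
For example, a minimal inference sketch (assumes `model` is the trained
Tacotron2 instance restored from a checkpoint; the names are illustrative):

    import torch
    from speechbrain.utils.text_to_sequence import text_to_sequence

    seq = torch.LongTensor(text_to_sequence("Hello world", ["english_cleaners"]))
    mel_out, mel_lengths, alignments = model.infer(
        seq.unsqueeze(0), torch.LongTensor([len(seq)])
    )
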
Authors
* Georges Abous-Rjeili 2021
* Artem Ploujnikov 2021
* Yingzhi Wang 2022
"""
import logging
import sys
import torch
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
from speechbrain.utils.data_utils import scalarize
from speechbrain.utils.text_to_sequence import text_to_sequence
logger = logging.getLogger(__name__)
class Tacotron2Brain(sb.Brain):
"""The Brain implementation for Tacotron2"""
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and backend is ddp and initializes statistics
"""
self.hparams.progress_sample_logger.reset()
self.last_epoch = 0
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def compute_forward(self, batch, stage):
"""Computes the forward pass
Arguments
---------
        batch: tuple
a single batch
stage: speechbrain.Stage
the training stage
Returns
-------
the model output
"""
effective_batch = self.batch_to_device(batch)
inputs, y, num_items, _, _ = effective_batch
_, input_lengths, _, _, _ = inputs
max_input_length = input_lengths.max().item()
return self.modules.model(inputs, alignments_dim=max_input_length)
def on_fit_batch_end(self, batch, outputs, loss, should_step):
"""At the end of the optimizer step, apply noam annealing."""
if should_step:
self.hparams.lr_annealing(self.optimizer)
def compute_objectives(self, predictions, batch, stage):
"""Computes the loss given the predicted and targeted outputs.
Arguments
---------
        predictions : tuple
            The model-generated spectrograms and other outputs from `compute_forward`.
        batch : tuple
            The collated batch tuple containing all the relevant tensors.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss : torch.Tensor
A one-element tensor used for backpropagating the gradient.
"""
effective_batch = self.batch_to_device(batch)
# Hold on to the batch for the inference sample. This is needed because
# the inference sample is run from on_stage_end only, where
# batch information is not available
self.last_batch = effective_batch
# Hold on to a sample (for logging)
self._remember_sample(effective_batch, predictions)
# Compute the loss
loss = self._compute_loss(predictions, effective_batch, stage)
return loss
def _compute_loss(self, predictions, batch, stage):
"""Computes the value of the loss function and updates stats
Arguments
---------
predictions: tuple
model predictions
        batch: tuple
            Inputs for this training iteration.
stage: sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss: torch.Tensor
the loss value
"""
inputs, targets, num_items, labels, wavs = batch
text_padded, input_lengths, _, max_len, output_lengths = inputs
loss_stats = self.hparams.criterion(
predictions, targets, input_lengths, output_lengths, self.last_epoch
)
self.last_loss_stats[stage] = scalarize(loss_stats)
return loss_stats.loss
def _remember_sample(self, batch, predictions):
"""Remembers samples of spectrograms and the batch for logging purposes
Arguments
---------
batch: tuple
a training batch
predictions: tuple
predictions (raw output of the Tacotron model)
"""
inputs, targets, num_items, labels, wavs = batch
text_padded, input_lengths, _, max_len, output_lengths = inputs
mel_target, _ = targets
mel_out, mel_out_postnet, gate_out, alignments = predictions
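        # Normalize the first item's alignment matrix to [0, 1] and flip it
        # vertically so it renders naturally as an image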
alignments_max = (
alignments[0]
.max(dim=-1)
.values.max(dim=-1)
.values.unsqueeze(-1)
.unsqueeze(-1)
)
alignments_output = alignments[0].T.flip(dims=(1,)) / alignments_max
self.hparams.progress_sample_logger.remember(
target=self._get_spectrogram_sample(mel_target),
output=self._get_spectrogram_sample(mel_out),
output_postnet=self._get_spectrogram_sample(mel_out_postnet),
alignments=alignments_output,
raw_batch=self.hparams.progress_sample_logger.get_batch_sample(
{
"text_padded": text_padded,
"input_lengths": input_lengths,
"mel_target": mel_target,
"mel_out": mel_out,
"mel_out_postnet": mel_out_postnet,
"max_len": max_len,
"output_lengths": output_lengths,
"gate_out": gate_out,
"alignments": alignments,
"labels": labels,
"wavs": wavs,
}
),
)
def batch_to_device(self, batch):
"""Transfers the batch to the target device
Arguments
---------
batch: tuple
the batch to use
Returns
-------
batch: tuple
the batch on the correct device
"""
(
text_padded,
input_lengths,
mel_padded,
gate_padded,
output_lengths,
len_x,
labels,
wavs,
) = batch
text_padded = text_padded.to(self.device, non_blocking=True).long()
input_lengths = input_lengths.to(self.device, non_blocking=True).long()
max_len = torch.max(input_lengths.data).item()
mel_padded = mel_padded.to(self.device, non_blocking=True).float()
gate_padded = gate_padded.to(self.device, non_blocking=True).float()
output_lengths = output_lengths.to(
self.device, non_blocking=True
).long()
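        # Repack into model inputs (x) and loss targets (y)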
x = (text_padded, input_lengths, mel_padded, max_len, output_lengths)
y = (mel_padded, gate_padded)
len_x = torch.sum(output_lengths)
return (x, y, len_x, labels, wavs)
def _get_spectrogram_sample(self, raw):
"""Converts a raw spectrogram to one that can be saved as an image
sample = sqrt(exp(raw))
Arguments
---------
raw: torch.Tensor
the raw spectrogram (as used in the model)
Returns
-------
sample: torch.Tensor
the spectrogram, for image saving purposes
"""
sample = raw[0]
return torch.sqrt(torch.exp(sample))
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of an epoch.
Arguments
---------
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
stage_loss : float
The average loss for all of the data processed in this stage.
        epoch : int
            The currently-starting epoch. This is passed as `None`
            during the test stage.
"""
        # Store the train loss until the validation stage.
        # At the end of validation, we can write the full summary.
if stage == sb.Stage.VALID:
# Update learning rate
lr = self.optimizer.param_groups[-1]["lr"]
self.last_epoch = epoch
# The train_logger writes a summary to stdout and to the logfile.
            self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr": lr},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
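            # The predicate below marks for deletion only checkpoints whose
            # epoch is not a multiple of keep_checkpoint_interval, so periodic
            # snapshots are preserved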
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
output_progress_sample = (
self.hparams.progress_samples
and epoch % self.hparams.progress_samples_interval == 0
)
if output_progress_sample:
self.run_inference_sample()
self.hparams.progress_sample_logger.save(epoch)
# We also write statistics about test data to stdout and to the logfile.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.progress_samples:
self.run_inference_sample()
self.hparams.progress_sample_logger.save("test")
def run_inference_sample(self):
"""Produces a sample in inference mode. This is called when producing
samples and can be useful because"""
if self.last_batch is None:
return
inputs, _, _, _, _ = self.last_batch
text_padded, input_lengths, _, _, _ = inputs
mel_out, _, _ = self.hparams.model.infer(
text_padded[:1], input_lengths[:1]
)
self.hparams.progress_sample_logger.remember(
inference_mel_out=self._get_spectrogram_sample(mel_out)
)
def dataio_prepare(hparams):
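    """Creates the train/valid/test datasets and the dynamic pipeline that
    converts each (wav, label) pair into a (text_sequence, mel, text_length)
    triple exposed under the "mel_text_pair" key."""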
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav", "label")
@sb.utils.data_pipeline.provides("mel_text_pair")
def audio_pipeline(wav, label):
text_seq = torch.IntTensor(
text_to_sequence(label, hparams["text_cleaners"])
)
audio = sb.dataio.dataio.read_audio(wav)
mel = hparams["mel_spectogram"](audio=audio)
len_text = len(text_seq)
return text_seq, mel, len_text
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["mel_text_pair", "wav", "label"],
)
return datasets
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# create ddp_group with the right communication protocol
sb.utils.distributed.ddp_init_group(run_opts)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
datasets = dataio_prepare(hparams)
# Brain class initialization
tacotron2_brain = Tacotron2Brain(
modules=hparams["modules"],
opt_class=hparams["opt_class"],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
# Training
tacotron2_brain.fit(
tacotron2_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Test
if "test" in datasets:
tacotron2_brain.evaluate(
datasets["test"],
test_loader_kwargs=hparams["test_dataloader_opts"],
)
# ################################################
# Basic training parameters for a diffwave vocoder
#
# Author:
# * Yingzhi Wang 2022
# ################################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
data_folder: !PLACEHOLDER
output_folder: !ref ./results/diffwave/<seed>
save_folder: !ref <output_folder>/save
progress_sample_path: !ref <output_folder>/samples
train_log: !ref <output_folder>/train_log.txt
progress_samples_interval: 10
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid"]
split_ratio: [90, 10]
skip_prep: False
# The train logger writes training statistics to a file, as well as stdout.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
keep_checkpoint_interval: 100
# length (in samples) of the audio segments used for conditional training
segment_size: 15872
# Training Parameters
sample_rate: 22050
number_of_epochs: 500
batch_size: 16
num_workers: 8
lr: 0.0002
# diffusion parameters
train_timesteps: 50
beta_start: 0.0001
beta_end: 0.05
fast_sampling: True
fast_sampling_noise_schedule: [0.0001, 0.001, 0.01, 0.05, 0.2, 0.5]
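# With fast_sampling, inference uses the short noise schedule above instead of
# all train_timesteps diffusion steps, trading a little quality for much
# faster waveform generation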
loss_l2_steps: 0
adam_beta1: 0.95
adam_beta2: 0.999
adam_weight_decay: 0.000001
adam_epsilon: 0.00000001
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: False
num_workers: !ref <num_workers>
valid_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
test_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
use_tensorboard: False
tensorboard_logs: !ref <output_folder>/logs/
residual_layers: 30
residual_channels: 64
dilation_cycle_length: 10
unconditional: False
# Spectrogram Parameters
spec_n_fft: 1024
spec_f_min: 0
spec_f_max: 8000
mel_normalized: False
spec_n_mels: 80
spec_power: 1
spec_hop_length: 256
spec_win_length: 1024
spec_norm: "slaney"
spec_mel_scale: "slaney"
dynamic_range_compression: True
# Feature extraction
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
sample_rate: !ref <sample_rate>
hop_length: !ref <spec_hop_length>
win_length: !ref <spec_win_length>
n_fft: !ref <spec_n_fft>
n_mels: !ref <spec_n_mels>
f_min: !ref <spec_f_min>
f_max: !ref <spec_f_max>
power: !ref <spec_power>
normalized: !ref <mel_normalized>
norm: !ref <spec_norm>
mel_scale: !ref <spec_mel_scale>
compression: !ref <dynamic_range_compression>
compute_cost: !new:speechbrain.nnet.schedulers.ScheduledLoss
schedule:
- loss_fn: !name:speechbrain.nnet.losses.mse_loss
steps: !ref <loss_l2_steps>
- loss_fn: !name:speechbrain.nnet.losses.l1_loss
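# With loss_l2_steps: 0, the L1 loss is used from the very first step;
# raise it to warm up with MSE before switching to L1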
# The DiffWave model. To design a custom model, replace this `!new` call
# with a line pointing to a different class you've defined.
diffwave: !new:speechbrain.lobes.models.DiffWave.DiffWave
input_channels: !ref <spec_n_mels>
residual_layers: !ref <residual_layers>
residual_channels: !ref <residual_channels>
dilation_cycle_length: !ref <dilation_cycle_length>
total_steps: !ref <train_timesteps>
unconditional: !ref <unconditional>
noise: !new:speechbrain.nnet.diffusion.GaussianNoise
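# The diffusion wrapper drives training and sampling, calling the model's
# diffusion_forward at each denoising step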
diffusion: !new:speechbrain.lobes.models.DiffWave.DiffWaveDiffusion
model: !ref <diffwave.diffusion_forward>
beta_start: !ref <beta_start>
beta_end: !ref <beta_end>
timesteps: !ref <train_timesteps>
noise: !ref <noise>
# The first object passed to the Brain class is this "Epoch Counter"
# which is saved by the Checkpointer so that training can be resumed
# if it gets interrupted at any point.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <number_of_epochs>
# Objects in "modules" dict will have their parameters moved to the correct
# device, as well as having train()/eval() called on them by the Brain class.
modules:
diffwave: !ref <diffwave>
diffusion: !ref <diffusion>
# This optimizer will be constructed by the Brain class after all parameters
# are moved to the correct device. Then it will be added to the checkpointer.
opt_class: !name:torch.optim.AdamW
lr: !ref <lr>
betas: !ref (<adam_beta1>, <adam_beta2>)
weight_decay: !ref <adam_weight_decay>
eps: !ref <adam_epsilon>
# Optional learning-rate annealing over training (disabled here).
# WarmCoolDecayLRSchedule warms the learning rate up, holds it, and then
# decays it; to enable it, uncomment the block below and define the
# referenced step counts.
# lr_annealing: !new:speechbrain.nnet.schedulers.WarmCoolDecayLRSchedule
# lr: !ref <lr>
# warmup: !ref <lr_warmup_steps>
# cooldown: !ref <lr_cooldown_steps>
# total_steps: !ref <lr_total_steps>
# This object is used for saving the state of training both so that it
# can be resumed if it gets interrupted, and also so that the best checkpoint
# can be later loaded for evaluation or inference.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
diffwave: !ref <diffwave>
counter: !ref <epoch_counter>
# K-means (Quantization)
This folder contains recipes for training a K-means clustering model on the LJSpeech dataset.
The model quantizes self-supervised representations into discrete tokens, which can then be used as discrete audio input for various tasks, including classification, ASR, and speech generation.
K-means models can be trained on features from HuBERT, WavLM, or wav2vec 2.0.
You can download LJSpeech at https://keithito.com/LJ-Speech-Dataset/
## Installing Extra Dependencies
Before proceeding, ensure you have installed the necessary additional dependencies. To do this, simply run the following command in your terminal:
```
pip install -r extra_requirements.txt
```
# How to run:
To configure the SSL model type and corresponding Hub in your YAML configuration file, follow these steps:
1. Locate the `model_config` section in your YAML file.
2. Modify the `ssl_model_type` field to specify one of the supported SSL models: `hubert`, `wavlm`, or `wav2vec2`.
3. Update the `ssl_hub` field with the specific name of the SSL Hub associated with your chosen model type.
Here are the supported SSL models along with their corresponding SSL Hubs:
```
ssl_model_type: hubert, wavlm, wav2vec2
ssl_hub:
- facebook/hubert-large-ll60k
- microsoft/wavlm-large
- facebook/wav2vec2-large
```
4. Set the output folder according to the experiments you are running (e.g., `output_folder: !ref results/LJSpeech/clustering/wavlm/<seed>`)
To initiate training using a specific SSL model, execute the following command:
```shell
python train.py hparams/train_discrete_ssl.yaml
```
This command will start the training process using the configurations specified in 'train_discrete_ssl.yaml'.
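If you prefer not to edit the YAML, the same fields can typically be overridden
from the command line (standard hyperpyyaml overrides; the values below are
illustrative):
```shell
python train.py hparams/train_discrete_ssl.yaml \
    --ssl_model_type=wavlm \
    --ssl_hub=microsoft/wavlm-large \
    --output_folder=results/LJSpeech/clustering/wavlm/1234
```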
# Results
The checkpoints can be found at [this](https://huggingface.co/speechbrain/SSL_Quantization) HuggingFace repository.
# **About SpeechBrain**
- Website: https://speechbrain.github.io/
- Code: https://github.com/speechbrain/speechbrain/
- HuggingFace: https://huggingface.co/speechbrain/
# **Citing SpeechBrain**
Please cite SpeechBrain if you use it for your research or business.
```bibtex
@misc{ravanelli2024opensourceconversationalaispeechbrain,
title={Open-Source Conversational AI with SpeechBrain 1.0},
author={Mirco Ravanelli and Titouan Parcollet and Adel Moumen and Sylvain de Langen and Cem Subakan and Peter Plantinga and Yingzhi Wang and Pooneh Mousavi and Luca Della Libera and Artem Ploujnikov and Francesco Paissan and Davide Borra and Salah Zaiem and Zeyu Zhao and Shucong Zhang and Georgios Karakasidis and Sung-Lin Yeh and Pierre Champion and Aku Rouhe and Rudolf Braun and Florian Mai and Juan Zuluaga-Gomez and Seyed Mahed Mousavi and Andreas Nautsch and Xuechen Liu and Sangeet Sagar and Jarod Duret and Salima Mdhaffar and Gaelle Laperriere and Mickael Rouvier and Renato De Mori and Yannick Esteve},
year={2024},
eprint={2407.00463},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2407.00463},
}
@misc{speechbrain,
title={{SpeechBrain}: A General-Purpose Speech Toolkit},
author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
year={2021},
eprint={2106.04624},
archivePrefix={arXiv},
primaryClass={eess.AS},
note={arXiv:2106.04624}
}
```
../ljspeech_prepare.py
\ No newline at end of file
This diff is collapsed.