Commit 4130a52d authored by changhl

init model

parent eb6a18fd
Pipeline #1617 failed in 0 seconds
# Generated 2024-08-27 from:
# /public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/hparams/train.yaml
# yamllint disable
############################################################################
# Model: Tacotron2
# Tokens: Raw characters (English text)
# Losses: Spectrogram MSE + stop-token (gate) BCE + guided attention
# Training: LJSpeech
# Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
# ############################################################################
###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]
output_folder: ./results/tacotron2/1234
save_folder: /public/home/changhl/taco/logdir
train_log: ./results/tacotron2/1234/train_log.txt
epochs: 750
keep_checkpoint_interval: 50
###################################
# Progress Samples #
###################################
# Progress samples are used to monitor the progress
# of an ongoing training session by outputting samples
# of spectrograms, alignments, etc. at regular intervals
# Whether to enable progress samples
progress_samples: true
# The path where the samples will be stored
progress_sample_path: ./results/tacotron2/1234/samples
# The interval, in epochs. For instance, if it is set to 5,
# progress samples will be output every 5 epochs
progress_samples_interval: 1
# The sample size for raw batch samples saved in batch.pth
# (useful mostly for model debugging)
progress_batch_sample_size: 3
#################################
# Data files and pre-processing #
#################################
data_folder: /public/home/changhl/LJSpeech-1.1
# e.g., /localscratch/ljspeech
train_json: /public/home/changhl/taco/logdir/train.json
valid_json: /public/home/changhl/taco/logdir/valid.json
test_json: /public/home/changhl/taco/logdir/test.json
splits: [train, valid]
split_ratio: [90, 10]
skip_prep: false
# Use the original preprocessing from nvidia
# The cleaners to be used (applicable to nvidia only)
text_cleaners: [english_cleaners]
################################
# Audio Parameters #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: false
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true
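# Note: with hop_length 256 at 22050 Hz the mel spectrogram runs at roughly
# 86 frames per second, and the 1024-sample window spans about 46 ms.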
################################
# Optimization Hyperparameters #
################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 64 #minimum 2
num_workers: 8
mask_padding: true
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0
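# Loss weighting, as wired up in the criterion and scheduler below: the
# guided-attention penalty starts at weight 50.0, appears to be halved roughly
# every 10 epochs by the StepScheduler, and is switched off entirely after
# epoch 50 (guided_attention_hard_stop); the gate (stop-token) BCE keeps a
# constant weight of 1.0.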
train_dataloader_opts:
batch_size: 64
drop_last: false #True #False
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
valid_dataloader_opts:
batch_size: 64
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
test_dataloader_opts:
batch_size: 64
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
################################
# Model Parameters and model #
################################
n_symbols: 148 #fixed depending on symbols in textToSequence
symbols_embedding_dim: 512
# Encoder parameters
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
# Decoder parameters
# The number of frames in the target per encoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: false
# Attention parameters
attention_rnn_dim: 1024
attention_dim: 128
# Location Layer parameters
attention_location_n_filters: 32
attention_location_kernel_size: 31
# Mel-post processing network parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
sample_rate: 22050
hop_length: 256
win_length: 1024
n_fft: 1024
n_mels: 80
f_min: 0.0
f_max: 8000.0
power: 1
normalized: false
norm: slaney
mel_scale: slaney
compression: true
#model
model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2
mask_padding: true
n_mel_channels: 80
# symbols
n_symbols: 148
symbols_embedding_dim: 512
# encoder
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
# attention
attention_rnn_dim: 1024
attention_dim: 128
# attention location
attention_location_n_filters: 32
attention_location_kernel_size: 31
# decoder
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
# postnet
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
decoder_no_early_stopping: false
guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
initial_value: 50.0
half_life: 10.
criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
gate_loss_weight: 1.0
guided_attention_weight: 50.0
guided_attention_sigma: 0.2
guided_attention_scheduler: *id001
guided_attention_hard_stop: 50
modules:
model: *id002
#optimizer
opt_class: !name:torch.optim.Adam
lr: 0.001
weight_decay: 0.000006
#epoch object
epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
limit: 750
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: ./results/tacotron2/1234/train_log.txt
#annealing_function
lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler
#infer: !name:speechbrain.lobes.models.Tacotron2.infer
intervals:
- steps: 6000
lr: 0.0005
- steps: 8000
lr: 0.0003
- steps: 10000
lr: 0.0001
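# Resulting schedule: the Adam learning rate stays at 1e-3 until step 6000,
# then drops to 5e-4, to 3e-4 after step 8000, and to 1e-4 from step 10000
# onwards; the scheduler is advanced once per optimizer step from
# on_fit_batch_end in train.py.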
#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: /public/home/changhl/taco/logdir
recoverables:
model: *id002
counter: *id003
scheduler: *id004
progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
output_path: ./results/tacotron2/1234/samples
batch_sample_size: 3
formats:
raw_batch: raw
2024-08-27 14:39:21,619 - speechbrain.core - INFO - Beginning experiment!
2024-08-27 14:39:21,620 - speechbrain.core - INFO - Experiment folder: ./results/tacotron2/1234
2024-08-27 14:39:22,259 - speechbrain.utils.superpowers - DEBUG - accelerate==0.31.0
addict==2.4.0
aiosignal==1.3.1
aitemplate @ http://10.6.10.68:8000/release/aitemplate/dtk24.04.1/aitemplate-0.0.1%2Bdas1.1.git5d8aa20.dtk2404.torch2.1.0-py3-none-any.whl#sha256=ad763a7cfd3935857cf10a07a2a97899fd64dda481add2f48de8b8930bd341dd
annotated-types==0.7.0
anyio==4.4.0
apex @ http://10.6.10.68:8000/release/apex/dtk24.04.1/apex-1.1.0%2Bdas1.1.gitf477a3a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=85eb662d13d6e6c3b61c2d878378c2338c4479bc03a1912c3eabddc2d9d08aa1
attrs==23.2.0
audioread==3.0.1
bitsandbytes @ http://10.6.10.68:8000/release/bitsandbyte/dtk24.04.1/bitsandbytes-0.42.0%2Bdas1.1.gitce85679.abi1.dtk2404.torch2.1.0-py3-none-any.whl#sha256=6324e330c8d12b858d39f4986c0ed0836fcb05f539cee92a7cf558e17954ae0d
certifi==2024.6.2
cffi==1.17.0
cfgv==3.4.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
contourpy==1.2.1
cycler==0.12.1
decorator==5.1.1
deepspeed @ http://10.6.10.68:8000/release/deepspeed/dtk24.04.1/deepspeed-0.12.3%2Bgita724046.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=2c158ed2dab21f4f09e7fc29776cb43a1593b13cec33168ce3483f318b852fc9
distlib==0.3.8
dnspython==2.6.1
dropout-layer-norm @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/dropout_layer_norm-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ae10c7cc231a8e38492292e91e76ba710d7679762604c0a7f10964b2385cdbd7
einops==0.8.0
email_validator==2.1.1
exceptiongroup==1.2.1
fastapi==0.111.0
fastapi-cli==0.0.4
fastpt @ http://10.6.10.68:8000/release/fastpt/dtk24.04.1/fastpt-1.0.0%2Bdas1.1.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ecf30dadcd2482adb1107991edde19b6559b8237379dbb0a3e6eb7306aad3f9a
filelock==3.15.1
fire==0.6.0
flash-attn @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/flash_attn-2.0.4%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7ca8e78ee0624b1ff0e91e9fc265e61b9510f02123a010ac71a2f8e5d08a62f7
flatbuffers==24.3.25
fonttools==4.53.0
frozenlist==1.4.1
fsspec==2024.6.0
fused-dense-lib @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/fused_dense_lib-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7202dd258a86bb7a1572e3b44b90dae667b0c948bf0f420b05924a107aaaba03
h11==0.14.0
hjson==3.1.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.23.4
humanfriendly==10.0
HyperPyYAML==1.2.2
hypothesis==5.35.1
identify==2.6.0
idna==3.7
importlib_metadata==7.1.0
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.22.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.5
layer-check-pt @ http://10.6.10.68:8000/release/layercheck/dtk24.04.1/layer_check_pt-1.2.3.git59a087a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=807adae2d4d4b74898777f81e1b94f1af4d881afe6a7826c7c910b211accbea7
lazy_loader==0.4
librosa==0.10.2.post1
lightop @ http://10.6.10.68:8000/release/lightop/dtk24.04.1/lightop-0.4%2Bdas1.1git8e60f07.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=2f2c88fd3fe4be179f44c4849e9224cb5b2b259843fc5a2d088e468b7a14c1b1
llvmlite==0.43.0
lmdeploy @ http://10.6.10.68:8000/release/lmdeploy/dtk24.04.1/lmdeploy-0.2.6%2Bdas1.1.git6ba90df.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=92ecee2c8b982f86e5c3219ded24d2ede219f415bf2cd4297f989a03387a203c
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.0
mdurl==0.1.2
mmcv @ http://10.6.10.68:8000/release/mmcv/dtk24.04.1/mmcv-2.0.1%2Bdas1.1.gite58da25.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7a937ae22f81b44d9100907e11303c31bf9a670cb4c92e361675674a41a8a07f
mmengine==0.10.4
mmengine-lite==0.10.4
mpmath==1.3.0
msgpack==1.0.8
networkx==3.3
ninja==1.11.1.1
nodeenv==1.9.1
numba==0.60.0
numpy==1.24.3
onnxruntime @ http://10.6.10.68:8000/release/onnxruntime/dtk24.04.1/onnxruntime-1.15.0%2Bdas1.1.git739f24d.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=d0d24167188d2c85f1ed4110fc43e62ea40c74280716d9b5fe9540256f17869a
opencv-python==4.10.0.82
orjson==3.10.5
packaging==24.1
pandas==2.2.2
peft==0.9.0
pillow==10.3.0
platformdirs==4.2.2
pooch==1.8.2
pre-commit==3.8.0
prometheus_client==0.20.0
protobuf==5.27.1
psutil==5.9.8
py-cpuinfo==9.0.0
pycparser==2.22
pydantic==2.7.4
pydantic_core==2.18.4
Pygments==2.18.0
pygtrie==2.5.0
pynvml==11.5.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.1
ray==2.9.1
referencing==0.35.1
regex==2024.5.15
requests==2.32.3
rich==13.7.1
rotary-emb @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/rotary_emb-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=cc15ec6ae73875515243d7f5c96ab214455a33a4a99eb7f1327f773cae1e6721
rpds-py==0.18.1
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.13.1
sentencepiece==0.2.0
shellingham==1.5.4
shortuuid==1.0.13
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soundfile==0.12.1
soxr==0.5.0
speechbrain==1.0.0
starlette==0.37.2
sympy==1.12.1
termcolor==2.4.0
tgt==1.5
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.15.0
tomli==2.0.1
torch @ http://10.6.10.68:8000/release/pytorch/dtk24.04.1/torch-2.1.0%2Bdas1.1.git3ac1bdd.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=5fd3bcef3aa197c0922727913aca53db9ce3f2fd4a9b22bba1973c3d526377f9
torchaudio @ http://10.6.10.68:8000/release/torchaudio/dtk24.04.1/torchaudio-2.1.2%2Bdas1.1.git63d9a68.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=4fcc556a7a2fffe64ddd57f22e5972b1b2b723f6fdfdaa305bd01551036df38b
torchvision @ http://10.6.10.68:8000/release/vision/dtk24.04.1/torchvision-0.16.0%2Bdas1.1.git7d45932.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=e3032e1bcc0857b54391d66744f97e5cff0dc7e7bb508196356ee927fb81ec01
tqdm==4.66.4
transformers==4.38.0
triton @ http://10.6.10.68:8000/release/triton/dtk24.04.1/triton-2.1.0%2Bdas1.1.git4bf1007a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=4c30d45dab071e65d1704a5cd189b14c4ac20bd59a7061032dfd631b1fc37645
typer==0.12.3
typing_extensions==4.12.2
tzdata==2024.1
ujson==5.10.0
urllib3==2.2.1
uvicorn==0.30.1
uvloop==0.19.0
virtualenv==20.26.3
vllm @ http://10.6.10.68:8000/release/vllm/dtk24.04.1/vllm-0.3.3%2Bdas1.1.gitdf6349c.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=48d265b07efa36f028eca45a3667fa10d3cf30eb1b8f019b62e3b255fb9e49c4
watchfiles==0.22.0
websockets==12.0
xentropy-cuda-lib @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/xentropy_cuda_lib-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=91b058d6a5fd2734a5085d68e08d3a1f948fe9c0119c46885d19f55293e2cce4
xformers @ http://10.6.10.68:8000/release/xformers/dtk24.04.1/xformers-0.0.25%2Bdas1.1.git8ef8bc1.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ca87fd065753c1be3b9fad552eba02d30cd3f4c673f01e81a763834eb5cbb9cc
yapf==0.40.2
zipp==3.19.2
2024-08-27 14:39:22,428 - speechbrain.core - ERROR - Exception:
Traceback (most recent call last):
File "/public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/train.py", line 353, in <module>
from ljspeech_prepare import prepare_ljspeech
File "/public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/ljspeech_prepare.py", line 23, in <module>
from unidecode import unidecode
ModuleNotFoundError: No module named 'unidecode'
2024-08-27 14:41:02,748 - speechbrain.core - INFO - Beginning experiment!
2024-08-27 14:41:02,749 - speechbrain.core - INFO - Experiment folder: ./results/tacotron2/1234
2024-08-27 14:41:03,353 - speechbrain.utils.superpowers - DEBUG - accelerate==0.31.0
addict==2.4.0
aiosignal==1.3.1
aitemplate @ http://10.6.10.68:8000/release/aitemplate/dtk24.04.1/aitemplate-0.0.1%2Bdas1.1.git5d8aa20.dtk2404.torch2.1.0-py3-none-any.whl#sha256=ad763a7cfd3935857cf10a07a2a97899fd64dda481add2f48de8b8930bd341dd
annotated-types==0.7.0
anyio==4.4.0
apex @ http://10.6.10.68:8000/release/apex/dtk24.04.1/apex-1.1.0%2Bdas1.1.gitf477a3a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=85eb662d13d6e6c3b61c2d878378c2338c4479bc03a1912c3eabddc2d9d08aa1
attrs==23.2.0
audioread==3.0.1
bitsandbytes @ http://10.6.10.68:8000/release/bitsandbyte/dtk24.04.1/bitsandbytes-0.42.0%2Bdas1.1.gitce85679.abi1.dtk2404.torch2.1.0-py3-none-any.whl#sha256=6324e330c8d12b858d39f4986c0ed0836fcb05f539cee92a7cf558e17954ae0d
certifi==2024.6.2
cffi==1.17.0
cfgv==3.4.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
contourpy==1.2.1
cycler==0.12.1
decorator==5.1.1
deepspeed @ http://10.6.10.68:8000/release/deepspeed/dtk24.04.1/deepspeed-0.12.3%2Bgita724046.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=2c158ed2dab21f4f09e7fc29776cb43a1593b13cec33168ce3483f318b852fc9
distlib==0.3.8
dnspython==2.6.1
dropout-layer-norm @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/dropout_layer_norm-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ae10c7cc231a8e38492292e91e76ba710d7679762604c0a7f10964b2385cdbd7
einops==0.8.0
email_validator==2.1.1
exceptiongroup==1.2.1
fastapi==0.111.0
fastapi-cli==0.0.4
fastpt @ http://10.6.10.68:8000/release/fastpt/dtk24.04.1/fastpt-1.0.0%2Bdas1.1.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ecf30dadcd2482adb1107991edde19b6559b8237379dbb0a3e6eb7306aad3f9a
filelock==3.15.1
fire==0.6.0
flash-attn @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/flash_attn-2.0.4%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7ca8e78ee0624b1ff0e91e9fc265e61b9510f02123a010ac71a2f8e5d08a62f7
flatbuffers==24.3.25
fonttools==4.53.0
frozenlist==1.4.1
fsspec==2024.6.0
fused-dense-lib @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/fused_dense_lib-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7202dd258a86bb7a1572e3b44b90dae667b0c948bf0f420b05924a107aaaba03
h11==0.14.0
hjson==3.1.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.23.4
humanfriendly==10.0
HyperPyYAML==1.2.2
hypothesis==5.35.1
identify==2.6.0
idna==3.7
importlib_metadata==7.1.0
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.22.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.5
layer-check-pt @ http://10.6.10.68:8000/release/layercheck/dtk24.04.1/layer_check_pt-1.2.3.git59a087a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=807adae2d4d4b74898777f81e1b94f1af4d881afe6a7826c7c910b211accbea7
lazy_loader==0.4
librosa==0.10.2.post1
lightop @ http://10.6.10.68:8000/release/lightop/dtk24.04.1/lightop-0.4%2Bdas1.1git8e60f07.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=2f2c88fd3fe4be179f44c4849e9224cb5b2b259843fc5a2d088e468b7a14c1b1
llvmlite==0.43.0
lmdeploy @ http://10.6.10.68:8000/release/lmdeploy/dtk24.04.1/lmdeploy-0.2.6%2Bdas1.1.git6ba90df.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=92ecee2c8b982f86e5c3219ded24d2ede219f415bf2cd4297f989a03387a203c
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.0
mdurl==0.1.2
mmcv @ http://10.6.10.68:8000/release/mmcv/dtk24.04.1/mmcv-2.0.1%2Bdas1.1.gite58da25.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7a937ae22f81b44d9100907e11303c31bf9a670cb4c92e361675674a41a8a07f
mmengine==0.10.4
mmengine-lite==0.10.4
mpmath==1.3.0
msgpack==1.0.8
networkx==3.3
ninja==1.11.1.1
nodeenv==1.9.1
numba==0.60.0
numpy==1.24.3
onnxruntime @ http://10.6.10.68:8000/release/onnxruntime/dtk24.04.1/onnxruntime-1.15.0%2Bdas1.1.git739f24d.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=d0d24167188d2c85f1ed4110fc43e62ea40c74280716d9b5fe9540256f17869a
opencv-python==4.10.0.82
orjson==3.10.5
packaging==24.1
pandas==2.2.2
peft==0.9.0
pillow==10.3.0
platformdirs==4.2.2
pooch==1.8.2
pre-commit==3.8.0
prometheus_client==0.20.0
protobuf==5.27.1
psutil==5.9.8
py-cpuinfo==9.0.0
pycparser==2.22
pydantic==2.7.4
pydantic_core==2.18.4
Pygments==2.18.0
pygtrie==2.5.0
pynvml==11.5.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.1
ray==2.9.1
referencing==0.35.1
regex==2024.5.15
requests==2.32.3
rich==13.7.1
rotary-emb @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/rotary_emb-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=cc15ec6ae73875515243d7f5c96ab214455a33a4a99eb7f1327f773cae1e6721
rpds-py==0.18.1
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.13.1
sentencepiece==0.2.0
shellingham==1.5.4
shortuuid==1.0.13
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soundfile==0.12.1
soxr==0.5.0
speechbrain==1.0.0
starlette==0.37.2
sympy==1.12.1
termcolor==2.4.0
tgt==1.5
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.15.0
tomli==2.0.1
torch @ http://10.6.10.68:8000/release/pytorch/dtk24.04.1/torch-2.1.0%2Bdas1.1.git3ac1bdd.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=5fd3bcef3aa197c0922727913aca53db9ce3f2fd4a9b22bba1973c3d526377f9
torchaudio @ http://10.6.10.68:8000/release/torchaudio/dtk24.04.1/torchaudio-2.1.2%2Bdas1.1.git63d9a68.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=4fcc556a7a2fffe64ddd57f22e5972b1b2b723f6fdfdaa305bd01551036df38b
torchvision @ http://10.6.10.68:8000/release/vision/dtk24.04.1/torchvision-0.16.0%2Bdas1.1.git7d45932.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=e3032e1bcc0857b54391d66744f97e5cff0dc7e7bb508196356ee927fb81ec01
tqdm==4.66.4
transformers==4.38.0
triton @ http://10.6.10.68:8000/release/triton/dtk24.04.1/triton-2.1.0%2Bdas1.1.git4bf1007a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=4c30d45dab071e65d1704a5cd189b14c4ac20bd59a7061032dfd631b1fc37645
typer==0.12.3
typing_extensions==4.12.2
tzdata==2024.1
ujson==5.10.0
Unidecode==1.3.8
urllib3==2.2.1
uvicorn==0.30.1
uvloop==0.19.0
virtualenv==20.26.3
vllm @ http://10.6.10.68:8000/release/vllm/dtk24.04.1/vllm-0.3.3%2Bdas1.1.gitdf6349c.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=48d265b07efa36f028eca45a3667fa10d3cf30eb1b8f019b62e3b255fb9e49c4
watchfiles==0.22.0
websockets==12.0
xentropy-cuda-lib @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/xentropy_cuda_lib-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=91b058d6a5fd2734a5085d68e08d3a1f948fe9c0119c46885d19f55293e2cce4
xformers @ http://10.6.10.68:8000/release/xformers/dtk24.04.1/xformers-0.0.25%2Bdas1.1.git8ef8bc1.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ca87fd065753c1be3b9fad552eba02d30cd3f4c673f01e81a763834eb5cbb9cc
yapf==0.40.2
zipp==3.19.2
2024-08-27 14:41:03,824 - ljspeech_prepare - INFO - Creating json file for ljspeech Dataset..
2024-08-27 14:41:03,858 - ljspeech_prepare - INFO - preparing /public/home/changhl/taco/logdir/train.json.
2024-08-27 14:41:05,014 - ljspeech_prepare - INFO - /public/home/changhl/taco/logdir/train.json successfully created!
2024-08-27 14:41:05,017 - ljspeech_prepare - INFO - preparing /public/home/changhl/taco/logdir/valid.json.
2024-08-27 14:41:05,144 - ljspeech_prepare - INFO - /public/home/changhl/taco/logdir/valid.json successfully created!
2024-08-27 14:41:06,035 - speechbrain.core - INFO - Gradscaler enabled: False. Using precision: fp32.
2024-08-27 14:41:06,036 - speechbrain.core - INFO - 28.2M trainable parameters in Tacotron2Brain
2024-08-27 14:41:06,039 - speechbrain.utils.checkpoints - INFO - Would load a checkpoint here, but none found yet.
2024-08-27 14:41:06,039 - speechbrain.utils.epoch_loop - INFO - Going into epoch 1
2024-08-27 14:41:36,638 - speechbrain.core - ERROR - Exception:
Traceback (most recent call last):
File "/public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/train.py", line 379, in <module>
tacotron2_brain.fit(
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1555, in fit
self._fit_train(train_set=train_set, epoch=epoch, enable=enable)
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1384, in _fit_train
loss = self.fit_batch(batch)
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1185, in fit_batch
scaled_loss.backward()
File "/usr/local/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/usr/local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
KeyboardInterrupt
2024-08-27 14:43:04,704 - speechbrain.core - INFO - Beginning experiment!
2024-08-27 14:43:04,704 - speechbrain.core - INFO - Experiment folder: ./results/tacotron2/1234
2024-08-27 14:43:05,329 - ljspeech_prepare - INFO - Skipping preparation, completed in previous run.
2024-08-27 14:43:06,197 - speechbrain.core - INFO - Gradscaler enabled: False. Using precision: fp32.
2024-08-27 14:43:06,198 - speechbrain.core - INFO - 28.2M trainable parameters in Tacotron2Brain
2024-08-27 14:43:06,200 - speechbrain.utils.checkpoints - INFO - Would load a checkpoint here, but none found yet.
2024-08-27 14:43:06,200 - speechbrain.utils.epoch_loop - INFO - Going into epoch 1
2024-08-27 14:44:00,358 - speechbrain.core - ERROR - Exception:
Traceback (most recent call last):
File "/public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/train.py", line 379, in <module>
tacotron2_brain.fit(
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1555, in fit
self._fit_train(train_set=train_set, epoch=epoch, enable=enable)
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1384, in _fit_train
loss = self.fit_batch(batch)
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1185, in fit_batch
scaled_loss.backward()
File "/usr/local/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/usr/local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
KeyboardInterrupt
# -*- coding: utf-8 -*-
"""
Recipe for training the Tacotron Text-To-Speech model, an end-to-end
neural text-to-speech (TTS) system
To run this recipe, do the following:
# python train.py --device=cuda:0 --max_grad_norm=1.0 --data_folder=/your_folder/LJSpeech-1.1 hparams/train.yaml
To infer, simply load the saved model and call
savemodel.infer(text_sequence, len(text_sequence))
where text_sequence is the output of the text_to_sequence function from
textToSequence.py (from textToSequence import text_to_sequence)
Authors
* Georges Abous-Rjeili 2021
* Artem Ploujnikov 2021
* Yingzhi Wang 2022
"""
import logging
import sys
import torch
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
from speechbrain.utils.data_utils import scalarize
from speechbrain.utils.text_to_sequence import text_to_sequence
logger = logging.getLogger(__name__)
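# The module docstring above suggests running inference by loading a trained
# model and calling model.infer() on a text sequence. A minimal sketch of that
# usage (not part of the training recipe; the helper name `synthesize_mel` is
# hypothetical, and a restored, trained model is assumed):
def synthesize_mel(model, text, text_cleaners=("english_cleaners",)):
    """Converts one sentence to a mel spectrogram with a trained Tacotron2."""
    seq = text_to_sequence(text, list(text_cleaners))
    inputs = torch.LongTensor(seq).unsqueeze(0)  # shape [1, T_text]
    lengths = torch.LongTensor([len(seq)])       # shape [1]
    # Tacotron2.infer returns (mel_outputs_postnet, mel_lengths, alignments),
    # the same unpacking used in run_inference_sample below.
    mel_out, mel_lengths, alignments = model.infer(inputs, lengths)
    return mel_out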
class Tacotron2Brain(sb.Brain):
"""The Brain implementation for Tacotron2"""
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and backend is ddp and initializes statistics
"""
self.hparams.progress_sample_logger.reset()
self.last_epoch = 0
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def compute_forward(self, batch, stage):
"""Computes the forward pass
Arguments
---------
batch: tuple
a single batch, as produced by the TextMelCollate collate function
stage: speechbrain.Stage
the training stage
Returns
-------
the model output
"""
effective_batch = self.batch_to_device(batch)
inputs, y, num_items, _, _ = effective_batch
_, input_lengths, _, _, _ = inputs
max_input_length = input_lengths.max().item()
return self.modules.model(inputs, alignments_dim=max_input_length)
def on_fit_batch_end(self, batch, outputs, loss, should_step):
"""At the end of the optimizer step, apply noam annealing."""
if should_step:
self.hparams.lr_annealing(self.optimizer)
def compute_objectives(self, predictions, batch, stage):
"""Computes the loss given the predicted and targeted outputs.
Arguments
---------
predictions : tuple
The model-generated spectrograms, gate outputs, and alignments from `compute_forward`.
batch : PaddedBatch
This batch object contains all the relevant tensors for computation.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss : torch.Tensor
A one-element tensor used for backpropagating the gradient.
"""
effective_batch = self.batch_to_device(batch)
# Hold on to the batch for the inference sample. This is needed because
# the inference sample is run from on_stage_end only, where
# batch information is not available
self.last_batch = effective_batch
# Hold on to a sample (for logging)
self._remember_sample(effective_batch, predictions)
# Compute the loss
loss = self._compute_loss(predictions, effective_batch, stage)
return loss
def _compute_loss(self, predictions, batch, stage):
"""Computes the value of the loss function and updates stats
Arguments
---------
predictions: tuple
model predictions
batch: PaddedBatch
Inputs for this training iteration.
stage: sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss: torch.Tensor
the loss value
"""
inputs, targets, num_items, labels, wavs = batch
text_padded, input_lengths, _, max_len, output_lengths = inputs
loss_stats = self.hparams.criterion(
predictions, targets, input_lengths, output_lengths, self.last_epoch
)
self.last_loss_stats[stage] = scalarize(loss_stats)
return loss_stats.loss
def _remember_sample(self, batch, predictions):
"""Remembers samples of spectrograms and the batch for logging purposes
Arguments
---------
batch: tuple
a training batch
predictions: tuple
predictions (raw output of the Tacotron model)
"""
inputs, targets, num_items, labels, wavs = batch
text_padded, input_lengths, _, max_len, output_lengths = inputs
mel_target, _ = targets
mel_out, mel_out_postnet, gate_out, alignments = predictions
alignments_max = (
alignments[0]
.max(dim=-1)
.values.max(dim=-1)
.values.unsqueeze(-1)
.unsqueeze(-1)
)
alignments_output = alignments[0].T.flip(dims=(1,)) / alignments_max
self.hparams.progress_sample_logger.remember(
target=self._get_spectrogram_sample(mel_target),
output=self._get_spectrogram_sample(mel_out),
output_postnet=self._get_spectrogram_sample(mel_out_postnet),
alignments=alignments_output,
raw_batch=self.hparams.progress_sample_logger.get_batch_sample(
{
"text_padded": text_padded,
"input_lengths": input_lengths,
"mel_target": mel_target,
"mel_out": mel_out,
"mel_out_postnet": mel_out_postnet,
"max_len": max_len,
"output_lengths": output_lengths,
"gate_out": gate_out,
"alignments": alignments,
"labels": labels,
"wavs": wavs,
}
),
)
def batch_to_device(self, batch):
"""Transfers the batch to the target device
Arguments
---------
batch: tuple
the batch to use
Returns
-------
batch: tuple
the batch on the correct device
"""
(
text_padded,
input_lengths,
mel_padded,
gate_padded,
output_lengths,
len_x,
labels,
wavs,
) = batch
text_padded = text_padded.to(self.device, non_blocking=True).long()
input_lengths = input_lengths.to(self.device, non_blocking=True).long()
max_len = torch.max(input_lengths.data).item()
mel_padded = mel_padded.to(self.device, non_blocking=True).float()
gate_padded = gate_padded.to(self.device, non_blocking=True).float()
output_lengths = output_lengths.to(
self.device, non_blocking=True
).long()
x = (text_padded, input_lengths, mel_padded, max_len, output_lengths)
y = (mel_padded, gate_padded)
len_x = torch.sum(output_lengths)
return (x, y, len_x, labels, wavs)
def _get_spectrogram_sample(self, raw):
"""Converts a raw spectrogram to one that can be saved as an image
sample = sqrt(exp(raw))
Arguments
---------
raw: torch.Tensor
the raw spectrogram (as used in the model)
Returns
-------
sample: torch.Tensor
the spectrogram, for image saving purposes
"""
sample = raw[0]
return torch.sqrt(torch.exp(sample))
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of an epoch.
Arguments
---------
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
stage_loss : float
The average loss for all of the data processed in this stage.
epoch : int
The currently-starting epoch. This is passed
`None` during the test stage.
"""
# Store the train loss until the validation stage.
# At the end of validation, we can write
if stage == sb.Stage.VALID:
# Update learning rate
lr = self.optimizer.param_groups[-1]["lr"]
self.last_epoch = epoch
# The train_logger writes a summary to stdout and to the logfile.
self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr": lr},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
output_progress_sample = (
self.hparams.progress_samples
and epoch % self.hparams.progress_samples_interval == 0
)
if output_progress_sample:
self.run_inference_sample()
self.hparams.progress_sample_logger.save(epoch)
# We also write statistics about test data to stdout and to the logfile.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.progress_samples:
self.run_inference_sample()
self.hparams.progress_sample_logger.save("test")
def run_inference_sample(self):
"""Produces a sample in inference mode. This is called when producing
samples and can be useful because"""
if self.last_batch is None:
return
inputs, _, _, _, _ = self.last_batch
text_padded, input_lengths, _, _, _ = inputs
mel_out, _, _ = self.hparams.model.infer(
text_padded[:1], input_lengths[:1]
)
self.hparams.progress_sample_logger.remember(
inference_mel_out=self._get_spectrogram_sample(mel_out)
)
def dataio_prepare(hparams):
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav", "label")
@sb.utils.data_pipeline.provides("mel_text_pair")
def audio_pipeline(wav, label):
text_seq = torch.IntTensor(
text_to_sequence(label, hparams["text_cleaners"])
)
audio = sb.dataio.dataio.read_audio(wav)
mel = hparams["mel_spectogram"](audio=audio)
len_text = len(text_seq)
return text_seq, mel, len_text
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["mel_text_pair", "wav", "label"],
)
return datasets
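# Note: with splits: [train, valid] in the hparams file, no "test" split is
# prepared, so the evaluate() call at the bottom of this script is skipped.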
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# create ddp_group with the right communication protocol
sb.utils.distributed.ddp_init_group(run_opts)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
datasets = dataio_prepare(hparams)
# Brain class initialization
tacotron2_brain = Tacotron2Brain(
modules=hparams["modules"],
opt_class=hparams["opt_class"],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
# Training
tacotron2_brain.fit(
tacotron2_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Test
if "test" in datasets:
tacotron2_brain.evaluate(
datasets["test"],
test_loader_kwargs=hparams["test_dataloader_opts"],
)
# ################################################
# Basic training parameters for a diffwave vocoder
#
# Author:
# * Yingzhi Wang 2022
# ################################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
data_folder: !PLACEHOLDER
output_folder: !ref ./results/diffwave/<seed>
save_folder: !ref <output_folder>/save
progress_sample_path: !ref <output_folder>/samples
train_log: !ref <output_folder>/train_log.txt
progress_samples_interval: 10
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid"]
split_ratio: [90, 10]
skip_prep: False
# The train logger writes training statistics to a file, as well as stdout.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
keep_checkpoint_interval: 100
# conditional training length
segment_size: 15872
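# 15872 = 62 x 256 (hop_length), i.e. roughly 0.72 s of audio at 22050 Hz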
# Training Parameters
sample_rate: 22050
number_of_epochs: 500
batch_size: 16
num_workers: 8
lr: 0.0002
# diffusion parameters
train_timesteps: 50
beta_start: 0.0001
beta_end: 0.05
fast_sampling: True
fast_sampling_noise_schedule: [0.0001, 0.001, 0.01, 0.05, 0.2, 0.5]
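# When fast_sampling is enabled, inference is expected to use the short
# 6-value noise schedule above instead of all 50 training timesteps.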
loss_l2_steps: 0
adam_beta1: 0.95
adam_beta2: 0.999
adam_weight_decay: 0.000001
adam_epsilon: 0.00000001
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: False
num_workers: !ref <num_workers>
valid_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
test_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
use_tensorboard: False
tensorboard_logs: !ref <output_folder>/logs/
residual_layers: 30
residual_channels: 64
dilation_cycle_length: 10
unconditional: False
# Spectrogram Parameters
spec_n_fft: 1024
spec_f_min: 0
spec_f_max: 8000
mel_normalized: False
spec_n_mels: 80
spec_power: 1
spec_hop_length: 256
spec_win_length: 1024
spec_norm: "slaney"
spec_mel_scale: "slaney"
dynamic_range_compression: True
# Feature extraction
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
sample_rate: !ref <sample_rate>
hop_length: !ref <spec_hop_length>
win_length: !ref <spec_win_length>
n_fft: !ref <spec_n_fft>
n_mels: !ref <spec_n_mels>
f_min: !ref <spec_f_min>
f_max: !ref <spec_f_max>
power: !ref <spec_power>
normalized: !ref <mel_normalized>
norm: !ref <spec_norm>
mel_scale: !ref <spec_mel_scale>
compression: !ref <dynamic_range_compression>
compute_cost: !new:speechbrain.nnet.schedulers.ScheduledLoss
schedule:
- loss_fn: !name:speechbrain.nnet.losses.mse_loss
steps: !ref <loss_l2_steps>
- loss_fn: !name:speechbrain.nnet.losses.l1_loss
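# With loss_l2_steps set to 0, the MSE entry above is effectively skipped and
# the L1 loss is used from the first training step onwards.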
# To use a custom model, replace this `!new` call with one pointing
# to a model class you've defined in a different file.
diffwave: !new:speechbrain.lobes.models.DiffWave.DiffWave
input_channels: !ref <spec_n_mels>
residual_layers: !ref <residual_layers>
residual_channels: !ref <residual_channels>
dilation_cycle_length: !ref <dilation_cycle_length>
total_steps: !ref <train_timesteps>
unconditional: !ref <unconditional>
noise: !new:speechbrain.nnet.diffusion.GaussianNoise
diffusion: !new:speechbrain.lobes.models.DiffWave.DiffWaveDiffusion
model: !ref <diffwave.diffusion_forward>
beta_start: !ref <beta_start>
beta_end: !ref <beta_end>
timesteps: !ref <train_timesteps>
noise: !ref <noise>
# The first object passed to the Brain class is this "Epoch Counter"
# which is saved by the Checkpointer so that training can be resumed
# if it gets interrupted at any point.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <number_of_epochs>
# Objects in "modules" dict will have their parameters moved to the correct
# device, as well as having train()/eval() called on them by the Brain class.
modules:
diffwave: !ref <diffwave>
diffusion: !ref <diffusion>
# This optimizer will be constructed by the Brain class after all parameters
# are moved to the correct device. Then it will be added to the checkpointer.
opt_class: !name:torch.optim.AdamW
lr: !ref <lr>
betas: !ref (<adam_beta1>, <adam_beta2>)
weight_decay: !ref <adam_weight_decay>
eps: !ref <adam_epsilon>
# This scheduler would manage learning rate annealing over the epochs
# (a warmup/cooldown decay schedule); it is left commented out below,
# so the optimizer keeps the fixed lr defined above.
# lr_annealing: !new:speechbrain.nnet.schedulers.WarmCoolDecayLRSchedule
# lr: !ref <lr>
# warmup: !ref <lr_warmup_steps>
# cooldown: !ref <lr_cooldown_steps>
# total_steps: !ref <lr_total_steps>
# This object is used for saving the state of training both so that it
# can be resumed if it gets interrupted, and also so that the best checkpoint
# can be later loaded for evaluation or inference.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
diffwave: !ref <diffwave>
counter: !ref <epoch_counter>
../../../ljspeech_prepare.py
#!/usr/bin/env python3
"""script to train a diffwave vocoder
See https://arxiv.org/pdf/2009.09761.pdf for more details
Authors
* Yingzhi Wang 2022
"""
import logging
import os
import sys
import torch
import torchaudio
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
logger = logging.getLogger(__name__)
class DiffWaveBrain(sb.Brain):
"""Class that manages the training loop. See speechbrain.core.Brain."""
def compute_forward(self, batch, stage):
"""Runs all the computation of that transforms the input into the
output probabilities over the N classes.
Arguments
---------
batch : PaddedBatch
This batch object contains all the relevant tensors for computation.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
predictions : tuple
(noise prediction, target noise, noisy sample, lengths placeholder).
"""
# We first move the batch to the appropriate device.
batch = batch.to(self.device)
x, _ = batch.mel
y, _ = batch.sig
pred, noise, noisy_sample = self.modules.diffusion.train_sample(
y,
timesteps=None,
condition=x,
)
return pred, noise, noisy_sample, None
def compute_objectives(self, predictions, batch, stage):
"""Computes the loss given the predicted and targeted outputs.
Arguments
---------
predictions : tuple
The (noise prediction, target noise, noisy sample, lengths) tuple returned by `compute_forward`.
batch : PaddedBatch
This batch object contains all the relevant tensors for computation.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss : torch.Tensor
A one-element tensor used for backpropagating the gradient.
"""
batch = batch.to(self.device)
x, _ = batch.mel
y, _ = batch.sig
self.last_batch = (x, y)
self._remember_sample(self.last_batch, predictions)
preds, noise, noisy_sample, lens = predictions
loss = self.hparams.compute_cost(
preds.squeeze(1), noise.squeeze(1), length=lens
)
self.last_loss_stats[stage] = {"loss": loss}
return loss
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and the backend is DDP, and initializes statistics.
"""
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def _remember_sample(self, batch, predictions):
"""Remembers samples of spectrograms and the batch for logging purposes
Arguments
---------
batch: tuple
a training batch
predictions: tuple
predictions (raw output of the DiffWave model)
"""
mel, sig = batch
pred, noise, noisy_sample, steps = predictions
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of a stage (TRAIN, VALID, Or TEST)"""
if stage == sb.Stage.VALID:
lr = self.optimizer.param_groups[0]["lr"]
self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr": lr},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# The tensorboard_logger writes a summary to stdout and to the logfile.
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
stats_meta={"Epoch": epoch, "lr": lr},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
end_of_epoch=True,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
if epoch % self.hparams.progress_samples_interval == 0:
self.run_inference_sample("Valid")
# We also write statistics about test data to stdout and to the TensorboardLogger.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
self.run_inference_sample("Test")
def run_inference_sample(self, name):
"""Produces a sample in inference mode. This is called when producing
samples.
"""
with torch.no_grad():
if self.last_batch is None:
return
x, y = self.last_batch
sig_out = self.modules.diffusion.inference(
unconditional=self.hparams.unconditional,
scale=self.hparams.spec_hop_length,
condition=x,
fast_sampling=self.hparams.fast_sampling,
fast_sampling_noise_schedule=self.hparams.fast_sampling_noise_schedule,
)
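# Note (assumption based on the hparam name and the hop-length/mel relation in
# dataio_prepare): the scale argument above tells the sampler how many waveform
# samples to generate per conditioning mel frame.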
spec_out = self.hparams.mel_spectogram(
audio=sig_out.squeeze(1).cpu()
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_audio(
f"{name}/audio_target", y.squeeze(0), self.hparams.sample_rate
)
self.tensorboard_logger.log_audio(
f"{name}/audio_pred",
sig_out.squeeze(0),
self.hparams.sample_rate,
)
self.tensorboard_logger.log_figure(f"{name}/mel_target", x)
self.tensorboard_logger.log_figure(f"{name}/mel_pred", spec_out)
else:
# folder name is the current epoch for validation and "test" for test
folder = (
self.hparams.epoch_counter.current
if name == "Valid"
else "test"
)
self.save_audio("target", y.squeeze(1), folder)
self.save_audio("synthesized", sig_out, folder)
def save_audio(self, name, data, epoch):
"""Saves a single wav
Arguments
---------
name: str
the name of the saved audio
data: torch.Tensor
the wave data to save
epoch: int or str
the epoch number (used in file path calculations)
or "test" for test stage
"""
target_path = os.path.join(
self.hparams.progress_sample_path, str(epoch)
)
if not os.path.exists(target_path):
os.makedirs(target_path)
file_name = f"{name}.wav"
effective_file_name = os.path.join(target_path, file_name)
torchaudio.save(effective_file_name, data.cpu(), 22050)
def dataio_prepare(hparams):
"""This function prepares the datasets to be used in the brain class.
It also defines the data processing pipeline through user-defined functions.
"""
segment_size = hparams["segment_size"]
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav", "segment")
@sb.utils.data_pipeline.provides("mel", "sig")
def audio_pipeline(wav, segment):
audio = sb.dataio.dataio.read_audio(wav)
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0)
if segment:
if audio.size(1) >= segment_size:
max_audio_start = audio.size(1) - segment_size
audio_start = torch.randint(0, max_audio_start, (1,))
audio = audio[:, audio_start : audio_start + segment_size]
else:
audio = torch.nn.functional.pad(
audio, (0, segment_size - audio.size(1)), "constant"
)
mel = hparams["mel_spectogram"](audio=audio.squeeze(0))
# for diffwave the audio length needs to be hop_length * mel_length
audio_length = mel.shape[-1] * hparams["spec_hop_length"]
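# e.g. (illustrative numbers): a 32-frame mel with hop_length 256 gives a target
# length of 8192 samples; the padding below keeps mel and waveform lengths aligned.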
audio = torch.nn.functional.pad(
audio, (0, audio_length - audio.size(1)), "constant"
)
return mel, audio
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["id", "mel", "sig"],
)
return datasets
def check_tensorboard(hparams):
"""Checks whether Tensorboard is enabled and initializes the logger if it is
Arguments
---------
hparams: dict
the hyperparameter dictionary
"""
if hparams["use_tensorboard"]:
try:
from speechbrain.utils.train_logger import TensorboardLogger
hparams["tensorboard_train_logger"] = TensorboardLogger(
hparams["tensorboard_logs"]
)
except ImportError:
logger.warning(
"Could not enable torch.TensorBoard logging - torch.TensorBoard is not available"
)
hparams["use_tensorboard"] = False
# Recipe begins!
if __name__ == "__main__":
# Reading command line arguments.
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
# Initialize ddp (useful only for multi-GPU DDP training).
sb.utils.distributed.ddp_init_group(run_opts)
# Load hyperparameters file with command-line overrides.
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# Check whether Tensorboard is available and enabled
check_tensorboard(hparams)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
# Create dataset objects "train", "valid", and "test".
sys.path.append("../../")
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
datasets = dataio_prepare(hparams)
# Initialize the Brain object to prepare for mask training.
diffusion_brain = DiffWaveBrain(
modules=hparams["modules"],
opt_class=hparams["opt_class"],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
# The `fit()` method iterates the training loop, calling the methods
# necessary to update the parameters of the model. Since all objects
# with changing state are managed by the Checkpointer, training can be
# stopped at any point, and will be resumed on next call.
diffusion_brain.fit(
epoch_counter=diffusion_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Load the best checkpoint for evaluation
if "test" in datasets:
test_stats = diffusion_brain.evaluate(
test_set=datasets["test"],
min_key="error",
test_loader_kwargs=hparams["test_dataloader_opts"],
)
###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref ./results/hifi_gan/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
progress_sample_path: !ref <output_folder>/samples
epochs: 500
keep_checkpoint_interval: 50
use_tensorboard: False
#################################
# Data files and pre-processing #
#################################
data_folder: !PLACEHOLDER # e.g, /datasets/ljspeech
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid"]
split_ratio: [90, 10]
################################
# Audio Parameters #
################################
skip_prep: False
segment_size: 8192
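# Note: 8192 samples / hop_length 256 = 32 mel frames per training segment.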
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000
mel_normalized: False
power: 1
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True
################################
# Optimization Hyperparameters #
################################
learning_rate: 0.0002
weight_decay: 0.9999 # used below as the ExponentialLR gamma (lr decay), not as an optimizer weight decay
adam_b1: 0.8
adam_b2: 0.99
batch_size: 32 #minimum 2
num_workers: 8
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: False
num_workers: !ref <num_workers>
valid_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
test_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
################################
# Model Parameters and model #
################################
# generator params
in_channels: 80
out_channels: 1
###########################################################################################################################################################
# version | resblock_type | upsample_kernel_sizes | upsample_factors | resblock_kernel_sizes | upsample_initial_channel | resblock_dilation_sizes
# 1 | "1" | [16,16,4,4] | [8, 8, 2, 2] | [3, 7, 11] | 512 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
# 2 | "1" | [16,16,4,4] | [8, 8, 2, 2] | [3, 7, 11] | 128 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
# 3 | "2" | [16,16,8] | [8,8,4] | [3,5,7] | 256 | [[1,2], [2,6], [3,12]]
###########################################################################################################################################################
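# The values below correspond to version 1 in the table above; for instance,
# switching to version 2 would only change upsample_initial_channel to 128.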
resblock_type: "1"
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
resblock_kernel_sizes: [3, 7, 11]
upsample_kernel_sizes: [16, 16, 4, 4]
upsample_initial_channel: 512
upsample_factors: [8, 8, 2, 2]
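# Sanity check: the product of upsample_factors (8*8*2*2 = 256) matches hop_length,
# so each input mel frame is upsampled to exactly one hop of audio.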
inference_padding: 5
cond_channels: 0
conv_post_bias: True
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
sample_rate: !ref <sample_rate>
hop_length: !ref <hop_length>
win_length: !ref <win_length>
n_fft: !ref <n_fft>
n_mels: !ref <n_mel_channels>
f_min: !ref <mel_fmin>
f_max: !ref <mel_fmax>
power: !ref <power>
normalized: !ref <mel_normalized>
norm: !ref <norm>
mel_scale: !ref <mel_scale>
compression: !ref <dynamic_range_compression>
generator: !new:speechbrain.lobes.models.HifiGAN.HifiganGenerator
in_channels: !ref <in_channels>
out_channels: !ref <out_channels>
resblock_type: !ref <resblock_type>
resblock_dilation_sizes: !ref <resblock_dilation_sizes>
resblock_kernel_sizes: !ref <resblock_kernel_sizes>
upsample_kernel_sizes: !ref <upsample_kernel_sizes>
upsample_initial_channel: !ref <upsample_initial_channel>
upsample_factors: !ref <upsample_factors>
inference_padding: !ref <inference_padding>
cond_channels: !ref <cond_channels>
conv_post_bias: !ref <conv_post_bias>
discriminator: !new:speechbrain.lobes.models.HifiGAN.HifiganDiscriminator
modules:
generator: !ref <generator>
discriminator: !ref <discriminator>
#generator loss
stft_loss: null
mseg_loss: !new:speechbrain.lobes.models.HifiGAN.MSEGLoss
feat_match_loss: !new:speechbrain.lobes.models.HifiGAN.MelganFeatureLoss
l1_spec_loss: !new:speechbrain.lobes.models.HifiGAN.L1SpecLoss
sample_rate: !ref <sample_rate>
hop_length: !ref <hop_length>
win_length: !ref <win_length>
n_mel_channels: !ref <n_mel_channels>
n_fft: !ref <n_fft>
n_stft: !ref <n_fft> // 2 + 1
mel_fmin: !ref <mel_fmin>
mel_fmax: null
mel_normalized: !ref <mel_normalized>
power: !ref <power>
dynamic_range_compression: !ref <dynamic_range_compression>
generator_loss: !new:speechbrain.lobes.models.HifiGAN.GeneratorLoss
stft_loss: !ref <stft_loss>
stft_loss_weight: 0
mseg_loss: !ref <mseg_loss>
mseg_loss_weight: 1
feat_match_loss: !ref <feat_match_loss>
feat_match_loss_weight: 10
l1_spec_loss: !ref <l1_spec_loss>
l1_spec_loss_weight: 45
#discriminator loss
msed_loss: !new:speechbrain.lobes.models.HifiGAN.MSEDLoss
discriminator_loss: !new:speechbrain.lobes.models.HifiGAN.DiscriminatorLoss
msed_loss: !ref <msed_loss>
#optimizer
opt_class_generator: !name:torch.optim.AdamW
lr: !ref <learning_rate>
betas: [!ref <adam_b1>, !ref <adam_b2>]
opt_class_discriminator: !name:torch.optim.AdamW
lr: !ref <learning_rate>
betas: [!ref <adam_b1>, !ref <adam_b2>]
sch_class_generator: !name:torch.optim.lr_scheduler.ExponentialLR
gamma: !ref <weight_decay>
last_epoch: -1
sch_class_discriminator: !name:torch.optim.lr_scheduler.ExponentialLR
gamma: !ref <weight_decay>
last_epoch: -1
#epoch object
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <epochs>
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
generator: !ref <generator>
discriminator: !ref <discriminator>
counter: !ref <epoch_counter>
../../../ljspeech_prepare.py
#!/usr/bin/env python3
"""Recipe for training a hifi-gan vocoder.
For more details about hifi-gan: https://arxiv.org/pdf/2010.05646.pdf
To run this recipe, do the following:
> python train.py hparams/train.yaml --data_folder /path/to/LJspeech
Authors
* Duret Jarod 2021
* Yingzhi WANG 2022
"""
import copy
import os
import sys
import torch
import torchaudio
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
from speechbrain.utils.data_utils import scalarize
class HifiGanBrain(sb.Brain):
def compute_forward(self, batch, stage):
"""The forward function, generates synthesized waveforms,
calculates the scores and the features of the discriminator
for synthesized waveforms and real waveforms.
Arguments
---------
batch: PaddedBatch
a single batch
stage: speechbrain.Stage
the training stage
Returns
-------
y_g_hat : torch.Tensor
scores_fake : torch.Tensor
feats_fake : torch.Tensor
scores_real : torch.Tensor
feats_real : torch.Tensor
"""
batch = batch.to(self.device)
x, _ = batch.mel
y, _ = batch.sig
# generate synthesized waveforms
y_g_hat = self.modules.generator(x)[:, :, : y.size(2)]
# get scores and features from discriminator for real and synthesized waveforms
scores_fake, feats_fake = self.modules.discriminator(y_g_hat.detach())
scores_real, feats_real = self.modules.discriminator(y)
return (y_g_hat, scores_fake, feats_fake, scores_real, feats_real)
def compute_objectives(self, predictions, batch, stage):
"""Computes and combines generator and discriminator losses"""
batch = batch.to(self.device)
x, _ = batch.mel
y, _ = batch.sig
# Hold on to the batch for the inference sample. This is needed because
# the inference sample is run from on_stage_end only, where
# batch information is not available
self.last_batch = (x, y)
# Hold on to a sample (for logging)
self._remember_sample(self.last_batch, predictions)
y_hat, scores_fake, feats_fake, scores_real, feats_real = predictions
loss_g = self.hparams.generator_loss(
stage, y_hat, y, scores_fake, feats_fake, feats_real
)
loss_d = self.hparams.discriminator_loss(scores_fake, scores_real)
loss = {**loss_g, **loss_d}
self.last_loss_stats[stage] = scalarize(loss)
return loss
def fit_batch(self, batch):
"""Train discriminator and generator adversarially"""
batch = batch.to(self.device)
y, _ = batch.sig
outputs = self.compute_forward(batch, sb.core.Stage.TRAIN)
(y_g_hat, scores_fake, feats_fake, scores_real, feats_real) = outputs
# calculate discriminator loss with the latest updated generator
loss_d = self.compute_objectives(outputs, batch, sb.core.Stage.TRAIN)[
"D_loss"
]
# First train the discriminator
self.optimizer_d.zero_grad()
loss_d.backward()
self.optimizer_d.step()
# calculate generator loss with the latest updated discriminator
scores_fake, feats_fake = self.modules.discriminator(y_g_hat)
scores_real, feats_real = self.modules.discriminator(y)
outputs = (y_g_hat, scores_fake, feats_fake, scores_real, feats_real)
loss_g = self.compute_objectives(outputs, batch, sb.core.Stage.TRAIN)[
"G_loss"
]
# Then train the generator
self.optimizer_g.zero_grad()
loss_g.backward()
self.optimizer_g.step()
return loss_g.detach().cpu()
def evaluate_batch(self, batch, stage):
"""Evaluate one batch"""
out = self.compute_forward(batch, stage=stage)
loss = self.compute_objectives(out, batch, stage=stage)
loss_g = loss["G_loss"]
return loss_g.detach().cpu()
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and the backend is DDP, and initializes statistics.
"""
self.last_epoch = 0
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def init_optimizers(self):
"""Called during ``on_fit_start()``, initialize optimizers
after parameters are fully configured (e.g. DDP, jit).
"""
if self.opt_class is not None:
(
opt_g_class,
opt_d_class,
sch_g_class,
sch_d_class,
) = self.opt_class
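# The order of this 4-tuple must match the list passed as opt_class in __main__:
# generator optimizer, discriminator optimizer, then their respective schedulers.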
self.optimizer_g = opt_g_class(self.modules.generator.parameters())
self.optimizer_d = opt_d_class(
self.modules.discriminator.parameters()
)
self.scheduler_g = sch_g_class(self.optimizer_g)
self.scheduler_d = sch_d_class(self.optimizer_d)
if self.checkpointer is not None:
self.checkpointer.add_recoverable(
"optimizer_g", self.optimizer_g
)
self.checkpointer.add_recoverable(
"optimizer_d", self.optimizer_d
)
self.checkpointer.add_recoverable(
"scheduler_g", self.scheduler_d
)
self.checkpointer.add_recoverable(
"scheduler_d", self.scheduler_d
)
def zero_grad(self, set_to_none=False):
if self.opt_class is not None:
self.optimizer_g.zero_grad(set_to_none)
self.optimizer_d.zero_grad(set_to_none)
def _remember_sample(self, batch, predictions):
"""Remembers samples of spectrograms and the batch for logging purposes
Arguments
---------
batch: tuple
a training batch
predictions: tuple
predictions (raw output of the HiFi-GAN model)
"""
mel, sig = batch
y_hat, scores_fake, feats_fake, scores_real, feats_real = predictions
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of a stage (TRAIN, VALID, Or TEST)"""
if stage == sb.Stage.VALID:
# Update learning rate
self.scheduler_g.step()
self.scheduler_d.step()
lr_g = self.optimizer_g.param_groups[-1]["lr"]
lr_d = self.optimizer_d.param_groups[-1]["lr"]
self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr_g": lr_g, "lr_d": lr_d},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# The tensorboard_logger writes a summary to stdout and to the logfile.
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
stats_meta={"Epoch": epoch, "lr_g": lr_g, "lr_d": lr_d},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
end_of_epoch=True,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
self.run_inference_sample("Valid")
# We also write statistics about test data to stdout and to the TensorboardLogger.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
self.run_inference_sample("Test")
def run_inference_sample(self, name):
"""Produces a sample in inference mode. This is called when producing
samples.
"""
with torch.no_grad():
if self.last_batch is None:
return
x, y = self.last_batch
# Preparing model for inference by removing weight norm
inference_generator = copy.deepcopy(self.hparams.generator)
inference_generator.remove_weight_norm()
sig_out = inference_generator.inference(x)
spec_out = self.hparams.mel_spectogram(
audio=sig_out.squeeze(0).cpu()
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_audio(
f"{name}/audio_target", y.squeeze(0), self.hparams.sample_rate
)
self.tensorboard_logger.log_audio(
f"{name}/audio_pred",
sig_out.squeeze(0),
self.hparams.sample_rate,
)
self.tensorboard_logger.log_figure(f"{name}/mel_target", x)
self.tensorboard_logger.log_figure(f"{name}/mel_pred", spec_out)
else:
# folder name is the current epoch for validation and "test" for test
folder = (
self.hparams.epoch_counter.current
if name == "Valid"
else "test"
)
self.save_audio("target", y.squeeze(0), folder)
self.save_audio("synthesized", sig_out.squeeze(0), folder)
def save_audio(self, name, data, epoch):
"""Saves a single wav
Arguments
---------
name: str
the name of the saved audio
data: torch.Tensor
the wave data to save
epoch: int or str
the epoch number (used in file path calculations)
or "test" for test stage
"""
target_path = os.path.join(
self.hparams.progress_sample_path, str(epoch)
)
if not os.path.exists(target_path):
os.makedirs(target_path)
file_name = f"{name}.wav"
effective_file_name = os.path.join(target_path, file_name)
torchaudio.save(effective_file_name, data.cpu(), 22050)
def dataio_prepare(hparams):
"""This function prepares the datasets to be used in the brain class.
It also defines the data processing pipeline through user-defined functions.
"""
segment_size = hparams["segment_size"]
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav", "segment")
@sb.utils.data_pipeline.provides("mel", "sig")
def audio_pipeline(wav, segment):
audio = sb.dataio.dataio.read_audio(wav)
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0)
if segment:
if audio.size(1) >= segment_size:
max_audio_start = audio.size(1) - segment_size
audio_start = torch.randint(0, max_audio_start, (1,))
audio = audio[:, audio_start : audio_start + segment_size]
else:
audio = torch.nn.functional.pad(
audio, (0, segment_size - audio.size(1)), "constant"
)
mel = hparams["mel_spectogram"](audio=audio.squeeze(0))
return mel, audio
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["id", "mel", "sig"],
)
return datasets
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# create ddp_group with the right communication protocol
sb.utils.distributed.ddp_init_group(run_opts)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
datasets = dataio_prepare(hparams)
# Brain class initialization
hifi_gan_brain = HifiGanBrain(
modules=hparams["modules"],
opt_class=[
hparams["opt_class_generator"],
hparams["opt_class_discriminator"],
hparams["sch_class_generator"],
hparams["sch_class_discriminator"],
],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
if hparams["use_tensorboard"]:
hifi_gan_brain.tensorboard_logger = (
sb.utils.train_logger.TensorboardLogger(
save_dir=hparams["output_folder"] + "/tensorboard"
)
)
# Training
hifi_gan_brain.fit(
hifi_gan_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Test
if "test" in datasets:
hifi_gan_brain.evaluate(
datasets["test"],
test_loader_kwargs=hparams["test_dataloader_opts"],
)
"""
Apply K-means clustering over acoustic features to extract speech units for HiFi-GAN training.
Authors
* Jarod Duret 2023
"""
import json
import logging
import pathlib as pl
import numpy as np
import torch
import torchaudio
from tqdm import tqdm
import speechbrain as sb
from speechbrain.dataio.dataio import load_pkl, save_pkl
from speechbrain.lobes.models.huggingface_transformers import (
hubert,
wav2vec2,
wavlm,
)
from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import (
DiscreteSSL,
)
OPT_FILE = "opt_ljspeech_extract_code.pkl"
TRAIN_JSON = "train.json"
VALID_JSON = "valid.json"
TEST_JSON = "test.json"
ENCODER_CLASSES = {
"HuBERT": hubert.HuBERT,
"Wav2Vec2": wav2vec2.Wav2Vec2,
"WavLM": wavlm.WavLM,
}
def setup_logger():
"""Set up a logger with a log format and logging level."""
log_format = "[%(asctime)s] [%(levelname)s]: %(message)s"
logging.basicConfig(format=log_format, level=logging.INFO)
logger = logging.getLogger(__name__)
return logger
def get_device(use_cuda):
"""Determine and return the appropriate device for computation."""
use_cuda = use_cuda and torch.cuda.is_available()
print("\n" + "=" * 30)
print("USE_CUDA SET TO: {}".format(use_cuda))
print("CUDA AVAILABLE?: {}".format(torch.cuda.is_available()))
print("=" * 30 + "\n")
return torch.device("cuda" if use_cuda else "cpu")
def np_array(tensor):
"""Convert a Pytorch tensor to a Numpy array."""
tensor = tensor.squeeze(0)
tensor = tensor.detach().cpu()
return tensor.numpy()
def skip(splits, save_folder, conf):
"""
Detects if the ljspeech data extraction has already been done.
If the extraction has been done, we can skip it.
Returns
-------
bool
if True, the preparation phase can be skipped.
if False, it must be done.
"""
# Checking json files
skip = True
split_files = {
"train": TRAIN_JSON,
"valid": VALID_JSON,
"test": TEST_JSON,
}
for split in splits:
if not (save_folder / split_files[split]).exists():
skip = False
# Checking saved options
save_opt = save_folder / OPT_FILE
if skip is True:
if save_opt.is_file():
opts_old = load_pkl(save_opt.as_posix())
if opts_old == conf:
skip = True
else:
skip = False
else:
skip = False
return skip
def extract_ljspeech(
data_folder,
splits,
kmeans_folder,
kmeans_dataset,
num_clusters,
encoder_type,
encoder_source,
layer,
encoder_save_folder,
codes_save_folder,
sample_rate=16000,
skip_extract=False,
):
"""
Extract speech units for HiFi-GAN training on the LJspeech datasets.
Arguments
---------
data_folder : str
Path to the folder where the original LJspeech dataset is stored.
splits : list
List of splits to prepare.
kmeans_folder : str
Huggingface repository that contains the pretrained k-means model.
kmeans_dataset : str
Name of the dataset on which the k-means model in the HF repo was trained.
num_clusters : int
Number of clusters of the targeted k-means model to be downloaded.
encoder_type : str
Name of the model used as feature extractor.
encoder_source : str
HuggingFace hub id or URL of the model used as feature extractor.
layer : List[int] (default: [7])
Which SSL layers to use for feature extraction.
encoder_save_folder : str
Path to the folder where the SSL encoder is stored.
codes_save_folder : str
Path to the folder where the extracted tokens are stored.
sample_rate : int
Target sample rate for feature extraction; the audio is resampled to it.
skip_extract : bool
If True, skip extraction.
Example
-------
>>> from recipes.LJSpeech.TTS.vocoder.hifi_gan_unit.extract_code import extract_ljspeech
>>> data_folder = 'data/LJspeech/'
>>> splits = ['train', 'valid']
>>> kmeans_folder = 'speechbrain/SSL_Quantization'
>>> kmeans_dataset = 'LibriSpeech-100-360-500'
>>> num_clusters = 1000
>>> encoder_type = 'HuBERT'
>>> encoder_source = 'facebook/hubert-large-ll60k'
>>> layer = [7]
>>> encoder_save_folder = 'ssl_encoder/'
>>> codes_save_folder = 'codes/'
>>> extract_ljspeech(data_folder, splits, kmeans_folder, kmeans_dataset, num_clusters, encoder_type, encoder_source, layer, encoder_save_folder, codes_save_folder)
"""
logger = setup_logger()
if skip_extract:
return
# Create configuration for easily skipping code extraction stage
conf = {
"data_folder": data_folder,
"splits": splits,
"save_folder": codes_save_folder,
"kmeans_folder": kmeans_folder,
"encoder_type": encoder_type,
"encoder_source": encoder_source,
"layer": layer,
}
codes_save_folder = pl.Path(codes_save_folder)
# Check if this phase is already done (if so, skip it)
if skip(splits, codes_save_folder, conf):
logger.info("Skipping code extraction, completed in previous run.")
return
# Fetch device
device = get_device(use_cuda=True)
save_opt = codes_save_folder / OPT_FILE
data_folder = pl.Path(data_folder)
encoder_save_folder = pl.Path(encoder_save_folder)
codes_save_folder.mkdir(parents=True, exist_ok=True)
logger.info(f"Loading encoder: {encoder_source} ...")
if encoder_type not in ENCODER_CLASSES:
raise TypeError("Not a supported Encoder")
encoder_class = ENCODER_CLASSES[encoder_type]
encoder = encoder_class(
source=encoder_source,
save_path=encoder_save_folder.as_posix(),
output_norm=False,
freeze=True,
freeze_feature_extractor=True,
apply_spec_augment=False,
output_all_hiddens=True,
).to(device)
discrete_encoder = DiscreteSSL(
save_path=encoder_save_folder.as_posix(),
ssl_model=encoder,
kmeans_dataset=kmeans_dataset,
kmeans_repo_id=kmeans_folder,
num_clusters=num_clusters,
)
for split in splits:
dataset_path = data_folder / f"{split}.json"
logger.info(f"Reading dataset from {dataset_path} ...")
meta_json = json.load(open(dataset_path))
for key in tqdm(meta_json.keys()):
item = meta_json[key]
wav = item["wav"]
with torch.no_grad():
info = torchaudio.info(wav)
audio = sb.dataio.dataio.read_audio(wav)
audio = torchaudio.transforms.Resample(
info.sample_rate,
sample_rate,
)(audio)
audio = audio.unsqueeze(0).to(device)
deduplicates = [False for _ in layer]
bpe_tokenizers = [None for _ in layer]
tokens, _, _ = discrete_encoder(
audio,
SSL_layers=layer,
deduplicates=deduplicates,
bpe_tokenizers=bpe_tokenizers,
)
tokens = np_array(tokens.squeeze(0))
np.save(codes_save_folder / f"{key}.npy", tokens)
logger.info("Extraction completed.")
save_pkl(conf, save_opt)
############################################################################
# Model: Unit HiFi-GAN
# Tokens: discrete speech units (K-means)
# Training: LJSpeech (English)
# Authors: Jarod Duret, Yingzhi Wang
# ############################################################################
###################################
# Experiment Parameters and setup #
###################################
seed: 4321
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref ./results/hifi_gan/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
progress_sample_path: !ref <output_folder>/samples
epochs: 200
keep_checkpoint_interval: 50
use_tensorboard: False
#################################
# Data files and pre-processing #
#################################
data_folder: !PLACEHOLDER # e.g, /datasets/ljspeech
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid", "test"]
split_ratio: [80, 10, 10]
skip_prep: False
########################################################
# Encoder | HF model #
#------------------------------------------------------#
# HuBERT | facebook/hubert-large-ll60k #
# Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self #
# WavLM | microsoft/wavlm-large #
########################################################
kmeans_folder: speechbrain/SSL_Quantization
kmeans_dataset: LibriSpeech-100-360-500
codes_save_folder: !ref <save_folder>/codes
encoder_type: HuBERT
encoder_hub: facebook/hubert-large-ll60k
encoder_save_folder: !ref <save_folder>/ssl_encoder
layer: [1, 3, 7, 12, 18, 23]
num_clusters: 1000
skip_extract: False
################################
# Audio Parameters #
################################
segment_size: 8960
code_hop_size: 320
sample_rate: 16000
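# Note: at 16 kHz, a code_hop_size of 320 samples is one discrete unit every 20 ms;
# segment_size 8960 therefore covers exactly 28 units (8960 / 320).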
layer_drop: True
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000
mel_normalized: False
power: 1
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True
################################
# Optimization Hyperparameters #
################################
learning_rate: 0.0002
weight_decay: 0.9999 # used below as the ExponentialLR gamma (lr decay), not as an optimizer weight decay
adam_b1: 0.8
adam_b2: 0.99
batch_size: 32 #minimum 32
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: False
num_workers: 8
valid_dataloader_opts:
batch_size: 1
num_workers: 8
test_dataloader_opts:
batch_size: 1
num_workers: 8
################################
# Model Parameters and model #
################################
duration_predictor: False
# embedding params
vocab_size: 6001 # K-means size * num layer + 1 for padding 1000x6+1
embedding_dim: 128
# generator params
in_channels: 128
out_channels: 1
var_pred_hidden_dim: 128
var_pred_kernel_size: 3
var_pred_dropout: 0.5
###########################################################################################################################################################
# version | resblock_type | upsample_kernel_sizes | upsample_factors | resblock_kernel_sizes | upsample_initial_channel | resblock_dilation_sizes
# 1 | "1" | [16,16,4,4] | [8, 8, 2, 2] | [3, 7, 11] | 512 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
# 2 | "1" | [16,16,4,4] | [8, 8, 2, 2] | [3, 7, 11] | 128 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
# 3 | "2" | [16,16,8] | [8,8,4] | [3,5,7] | 256 | [[1,2], [2,6], [3,12]]
###########################################################################################################################################################
resblock_type: "1"
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
resblock_kernel_sizes: [3, 7, 11]
upsample_kernel_sizes: [11, 8, 8, 4, 4]
upsample_initial_channel: 512
upsample_factors: [5, 4, 4, 2, 2]
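# Sanity check: the product of upsample_factors (5*4*4*2*2 = 320) matches code_hop_size,
# so each discrete unit is expanded to exactly one code hop of audio.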
inference_padding: 5
cond_channels: 0
conv_post_bias: True
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
sample_rate: !ref <sample_rate>
hop_length: !ref <hop_length>
win_length: !ref <win_length>
n_fft: !ref <n_fft>
n_mels: !ref <n_mel_channels>
f_min: !ref <mel_fmin>
f_max: !ref <mel_fmax>
power: !ref <power>
normalized: !ref <mel_normalized>
norm: !ref <norm>
mel_scale: !ref <mel_scale>
compression: !ref <dynamic_range_compression>
generator: !new:speechbrain.lobes.models.HifiGAN.UnitHifiganGenerator
in_channels: !ref <in_channels>
out_channels: !ref <out_channels>
resblock_type: !ref <resblock_type>
resblock_dilation_sizes: !ref <resblock_dilation_sizes>
resblock_kernel_sizes: !ref <resblock_kernel_sizes>
upsample_kernel_sizes: !ref <upsample_kernel_sizes>
upsample_initial_channel: !ref <upsample_initial_channel>
upsample_factors: !ref <upsample_factors>
inference_padding: !ref <inference_padding>
cond_channels: !ref <cond_channels>
conv_post_bias: !ref <conv_post_bias>
vocab_size: !ref <vocab_size>
embedding_dim: !ref <embedding_dim>
duration_predictor: !ref <duration_predictor>
var_pred_hidden_dim: !ref <var_pred_hidden_dim>
var_pred_kernel_size: !ref <var_pred_kernel_size>
var_pred_dropout: !ref <var_pred_dropout>
discriminator: !new:speechbrain.lobes.models.HifiGAN.HifiganDiscriminator
modules:
generator: !ref <generator>
discriminator: !ref <discriminator>
#generator loss
stft_loss: null
mseg_loss: !new:speechbrain.lobes.models.HifiGAN.MSEGLoss
feat_match_loss: !new:speechbrain.lobes.models.HifiGAN.MelganFeatureLoss
l1_spec_loss: !new:speechbrain.lobes.models.HifiGAN.L1SpecLoss
sample_rate: !ref <sample_rate>
hop_length: !ref <hop_length>
win_length: !ref <win_length>
n_mel_channels: !ref <n_mel_channels>
n_fft: !ref <n_fft>
n_stft: !ref <n_fft> // 2 + 1
mel_fmin: !ref <mel_fmin>
mel_fmax: null
mel_normalized: !ref <mel_normalized>
power: !ref <power>
dynamic_range_compression: !ref <dynamic_range_compression>
mseg_dur_loss: False
generator_loss: !new:speechbrain.lobes.models.HifiGAN.GeneratorLoss
stft_loss: !ref <stft_loss>
stft_loss_weight: 0
mseg_loss: !ref <mseg_loss>
mseg_loss_weight: 1
feat_match_loss: !ref <feat_match_loss>
feat_match_loss_weight: 10
l1_spec_loss: !ref <l1_spec_loss>
l1_spec_loss_weight: 45
mseg_dur_loss: !ref <mseg_dur_loss>
mseg_dur_loss_weight: 1
#discriminator loss
msed_loss: !new:speechbrain.lobes.models.HifiGAN.MSEDLoss
discriminator_loss: !new:speechbrain.lobes.models.HifiGAN.DiscriminatorLoss
msed_loss: !ref <msed_loss>
#optimizer
opt_class_generator: !name:torch.optim.AdamW
lr: !ref <learning_rate>
betas: [!ref <adam_b1>, !ref <adam_b2>]
opt_class_discriminator: !name:torch.optim.AdamW
lr: !ref <learning_rate>
betas: [!ref <adam_b1>, !ref <adam_b2>]
sch_class_generator: !name:torch.optim.lr_scheduler.ExponentialLR
gamma: !ref <weight_decay>
last_epoch: -1
sch_class_discriminator: !name:torch.optim.lr_scheduler.ExponentialLR
gamma: !ref <weight_decay>
last_epoch: -1
#epoch object
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <epochs>
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
generator: !ref <generator>
discriminator: !ref <discriminator>
counter: !ref <epoch_counter>
../../../ljspeech_prepare.py
#!/usr/bin/env python3
"""Recipe for training a hifi-gan vocoder on self-supervised representations.
For more details about hifi-gan: https://arxiv.org/pdf/2010.05646.pdf
For more details about speech synthesis using self-supervised representations: https://arxiv.org/pdf/2104.00355.pdf
To run this recipe, do the following:
> python train.py hparams/train.yaml --data_folder=/path/to/LJspeech
Authors
* Jarod Duret 2023
* Yingzhi WANG 2022
"""
import copy
import pathlib as pl
import random
import sys
import numpy as np
import torch
import torchaudio
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
from speechbrain.utils.data_utils import scalarize
class HifiGanBrain(sb.Brain):
def compute_forward(self, batch, stage):
"""The forward function, generates synthesized waveforms,
calculates the scores and the features of the discriminator
for synthesized waveforms and real waveforms.
Arguments
---------
batch : torch.Tensor or tensors
An element from the dataloader, including inputs for processing.
stage : Stage
The stage of the experiment: Stage.TRAIN, Stage.VALID, Stage.TEST
"""
batch = batch.to(self.device)
x, _ = batch.code
y, _ = batch.sig
# generate synthesized waveforms
y_g_hat, (log_dur_pred, log_dur) = self.modules.generator(x)
y_g_hat = y_g_hat[:, :, : y.size(2)]
# get scores and features from discriminator for real and synthesized waveforms
scores_fake, feats_fake = self.modules.discriminator(y_g_hat.detach())
scores_real, feats_real = self.modules.discriminator(y)
return (
y_g_hat,
scores_fake,
feats_fake,
scores_real,
feats_real,
log_dur_pred,
log_dur,
)
def compute_objectives(self, predictions, batch, stage):
"""Computes the loss given the predicted and targeted outputs.
Arguments
---------
predictions : tuple
The generated waveforms, discriminator scores/features, and duration terms returned by `compute_forward`.
batch : PaddedBatch
This batch object contains all the relevant tensors for computation.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss : torch.Tensor
A one-element tensor used for backpropagating the gradient.
"""
batch = batch.to(self.device)
x, _ = batch.code
y, y_lens = batch.sig
# Hold on to the batch for the inference sample. This is needed because
# the inference sample is run from on_stage_end only, where
# batch information is not available
self.last_batch = (x, y)
(
y_hat,
scores_fake,
feats_fake,
scores_real,
feats_real,
log_dur_pred,
log_dur,
) = predictions
loss_g = self.hparams.generator_loss(
stage,
y_hat,
y,
scores_fake,
feats_fake,
feats_real,
log_dur_pred,
log_dur,
)
loss_d = self.hparams.discriminator_loss(scores_fake, scores_real)
loss = {**loss_g, **loss_d}
self.last_loss_stats[stage] = scalarize(loss)
return loss
def fit_batch(self, batch):
"""Fits a single batch.
Arguments
---------
batch: tuple
a training batch
Returns
-------
loss: torch.Tensor
detached loss
"""
batch = batch.to(self.device)
y, _ = batch.sig
outputs = self.compute_forward(batch, sb.core.Stage.TRAIN)
(
y_g_hat,
scores_fake,
feats_fake,
scores_real,
feats_real,
log_dur_pred,
log_dur,
) = outputs
# calculate discriminator loss with the latest updated generator
loss_d = self.compute_objectives(outputs, batch, sb.core.Stage.TRAIN)[
"D_loss"
]
# First train the discriminator
self.optimizer_d.zero_grad()
loss_d.backward()
self.optimizer_d.step()
# calculate generator loss with the latest updated discriminator
scores_fake, feats_fake = self.modules.discriminator(y_g_hat)
scores_real, feats_real = self.modules.discriminator(y)
outputs = (
y_g_hat,
scores_fake,
feats_fake,
scores_real,
feats_real,
log_dur_pred,
log_dur,
)
loss_g = self.compute_objectives(outputs, batch, sb.core.Stage.TRAIN)[
"G_loss"
]
# Then train the generator
self.optimizer_g.zero_grad()
loss_g.backward()
self.optimizer_g.step()
return loss_g.detach().cpu()
def evaluate_batch(self, batch, stage):
"""Evaluate one batch.
Arguments
---------
batch : list of torch.Tensors
Batch of data to use for evaluation. Default implementation assumes
this batch has two elements: inputs and targets.
stage : Stage
The stage of the experiment: Stage.VALID, Stage.TEST
Returns
-------
detached loss
"""
out = self.compute_forward(batch, stage=stage)
loss = self.compute_objectives(out, batch, stage=stage)
loss_g = loss["G_loss"]
return loss_g.detach().cpu()
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and the backend is DDP, and initializes statistics.
"""
self.last_epoch = 0
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def init_optimizers(self):
"""Called during ``on_fit_start()``, initialize optimizers
after parameters are fully configured (e.g. DDP, jit).
"""
if self.opt_class is not None:
(
opt_g_class,
opt_d_class,
sch_g_class,
sch_d_class,
) = self.opt_class
self.optimizer_g = opt_g_class(self.modules.generator.parameters())
self.optimizer_d = opt_d_class(
self.modules.discriminator.parameters()
)
self.optimizers_dict = {
"optimizer_g": self.optimizer_g,
"optimizer_d": self.optimizer_d,
}
self.scheduler_g = sch_g_class(self.optimizer_g)
self.scheduler_d = sch_d_class(self.optimizer_d)
if self.checkpointer is not None:
self.checkpointer.add_recoverable(
"optimizer_g", self.optimizer_g
)
self.checkpointer.add_recoverable(
"optimizer_d", self.optimizer_d
)
self.checkpointer.add_recoverable(
"scheduler_g", self.scheduler_d
)
self.checkpointer.add_recoverable(
"scheduler_d", self.scheduler_d
)
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of an epoch.
Arguments
---------
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
stage_loss : float
The average loss for all of the data processed in this stage.
epoch : int
The currently-starting epoch. This is passed
`None` during the test stage.
"""
if stage == sb.Stage.VALID:
# Update learning rate
self.scheduler_g.step()
self.scheduler_d.step()
lr_g = self.optimizer_g.param_groups[-1]["lr"]
lr_d = self.optimizer_d.param_groups[-1]["lr"]
stats = {
**self.last_loss_stats[sb.Stage.VALID],
}
self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr_g": lr_g, "lr_d": lr_d},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=stats,
)
# The tensorboard_logger writes a summary to stdout and to the logfile.
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
stats_meta={"Epoch": epoch, "lr_g": lr_g, "lr_d": lr_d},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=stats,
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
if self.checkpointer is not None:
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
end_of_epoch=True,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
self.run_inference_sample("Valid", epoch)
# We also write statistics about test data to stdout and to the TensorboardLogger.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
self.run_inference_sample("Test", epoch)
def run_inference_sample(self, name, epoch):
"""Produces a sample in inference mode.
This is called when producing samples.
Arguments
---------
name: str
the name of the saved audio folder
epoch: int or str
the epoch number (used in file path calculations)
or "test" for test stage
"""
with torch.no_grad():
if self.last_batch is None:
return
x, y = self.last_batch
# Preparing model for inference by removing weight norm
inference_generator = copy.deepcopy(self.hparams.generator)
inference_generator.remove_weight_norm()
if inference_generator.duration_predictor:
x = torch.unique_consecutive(x, dim=1)
sig_out = inference_generator.inference(x)
spec_out = self.hparams.mel_spectogram(
audio=sig_out.squeeze(0).cpu()
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_audio(
f"{name}/audio_target", y.squeeze(0), self.hparams.sample_rate
)
self.tensorboard_logger.log_audio(
f"{name}/audio_pred",
sig_out.squeeze(0),
self.hparams.sample_rate,
)
self.tensorboard_logger.log_figure(f"{name}/mel_target", x)
self.tensorboard_logger.log_figure(f"{name}/mel_pred", spec_out)
else:
# folder name is the current epoch for validation and "test" for test
folder = (
self.hparams.epoch_counter.current
if name == "Valid"
else "test"
)
self.save_audio("target", y.squeeze(0), folder)
self.save_audio("synthesized", sig_out.squeeze(0), folder)
def save_audio(self, name, data, epoch):
"""Saves a single wav file.
Arguments
---------
name: str
the name of the saved audio
data: torch.Tensor
the wave data to save
epoch: int or str
the epoch number (used in file path calculations)
or "test" for test stage
"""
target_path = pl.Path(self.hparams.progress_sample_path) / str(epoch)
target_path.mkdir(parents=True, exist_ok=True)
file_name = target_path / f"{name}.wav"
torchaudio.save(file_name.as_posix(), data.cpu(), 16000)
def sample_interval(seqs, segment_size):
"This function sample an interval of audio and code according to segment size."
N = max([v.shape[-1] for v in seqs])
seq_len = segment_size if segment_size > 0 else N
hops = [N // v.shape[-1] for v in seqs]
lcm = np.lcm.reduce(hops)
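# Explanatory note: hops[i] is how many elements of the longest sequence correspond
# to one element of sequence i; aligning start indices on multiples of their lcm
# keeps the sampled audio window and the sampled code window in sync.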
interval_start = 0
interval_end = N // lcm - seq_len // lcm
start_step = random.randint(interval_start, interval_end)
new_seqs = []
for i, v in enumerate(seqs):
start = start_step * (lcm // hops[i])
end = (start_step + seq_len // lcm) * (lcm // hops[i])
new_seqs += [v[..., start:end]]
return new_seqs
def dataio_prepare(hparams):
"""This function prepares the datasets to be used in the brain class.
It also defines the data processing pipeline through user-defined functions.
"""
segment_size = hparams["segment_size"]
code_hop_size = hparams["code_hop_size"]
codes_folder = pl.Path(hparams["codes_save_folder"])
# Define audio pipeline:
@sb.utils.data_pipeline.takes("id", "wav", "segment")
@sb.utils.data_pipeline.provides("code", "sig")
def audio_pipeline(utt_id, wav, segment):
info = torchaudio.info(wav)
audio = sb.dataio.dataio.read_audio(wav)
audio = torchaudio.transforms.Resample(
info.sample_rate,
hparams["sample_rate"],
)(audio)
code = np.load(codes_folder / f"{utt_id}.npy")
num_layer = len(hparams["layer"])
offsets = np.arange(num_layer) * hparams["num_clusters"]
code = code + offsets + 1
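# Each SSL layer gets its own token range: layer j's cluster ids are shifted by
# j * num_clusters, and the +1 reserves index 0 for padding/dropped layers, which
# is why vocab_size is num_clusters * len(layer) + 1 in the hparams.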
if hparams["layer_drop"]:
num_layers_to_drop = np.random.randint(0, code.shape[1])
if num_layers_to_drop > 0:
layers_to_drop = np.random.choice(
code.shape[1], size=num_layers_to_drop, replace=False
)
code[:, layers_to_drop] = 0
code = torch.IntTensor(code)
# Trim end of audio
code_length = min(audio.shape[0] // code_hop_size, code.shape[0])
code = code[:code_length]
audio = audio[: code_length * code_hop_size]
while audio.shape[0] < segment_size:
audio = torch.hstack([audio, audio])
code = torch.hstack([code, code])
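# The loop above tiles utterances shorter than segment_size (audio and codes
# together) so that a full training segment can always be sampled.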
audio = audio.unsqueeze(0)
if segment:
code = code.swapdims(0, 1)
audio, code = sample_interval([audio, code], segment_size)
code = code.swapdims(0, 1)
return code, audio
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["id", "code", "sig"],
)
return datasets
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# If --distributed_launch then
# create ddp_group with the right communication protocol
sb.utils.distributed.ddp_init_group(run_opts)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
from extract_code import extract_ljspeech
sb.utils.distributed.run_on_main(
extract_ljspeech,
kwargs={
"data_folder": hparams["save_folder"],
"splits": hparams["splits"],
"kmeans_folder": hparams["kmeans_folder"],
"kmeans_dataset": hparams["kmeans_dataset"],
"num_clusters": hparams["num_clusters"],
"encoder_type": hparams["encoder_type"],
"encoder_source": hparams["encoder_hub"],
"layer": hparams["layer"],
"encoder_save_folder": hparams["encoder_save_folder"],
"codes_save_folder": hparams["codes_save_folder"],
"sample_rate": hparams["sample_rate"],
"skip_extract": hparams["skip_extract"],
},
)
datasets = dataio_prepare(hparams)
# Brain class initialization
hifi_gan_brain = HifiGanBrain(
modules=hparams["modules"],
opt_class=[
hparams["opt_class_generator"],
hparams["opt_class_discriminator"],
hparams["sch_class_generator"],
hparams["sch_class_discriminator"],
],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
if hparams["use_tensorboard"]:
hifi_gan_brain.tensorboard_logger = (
sb.utils.train_logger.TensorboardLogger(
save_dir=hparams["output_folder"] + "/tensorboard"
)
)
# Training
hifi_gan_brain.fit(
hifi_gan_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Test
if "test" in datasets:
hifi_gan_brain.evaluate(
datasets["test"],
test_loader_kwargs=hparams["test_dataloader_opts"],
)
"""
LJspeech data preparation.
Download: https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Authors
* Yingzhi WANG 2022
* Sathvik Udupa 2022
* Pradnya Kandarkar 2023
"""
import csv
import json
import logging
import os
import random
import re
import numpy as np
import tgt
import torch
import torchaudio
from tqdm import tqdm
from unidecode import unidecode
from speechbrain.dataio.dataio import load_pkl, save_pkl
from speechbrain.inference.text import GraphemeToPhoneme
from speechbrain.utils.data_utils import download_file
from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations
logger = logging.getLogger(__name__)
OPT_FILE = "opt_ljspeech_prepare.pkl"
METADATA_CSV = "metadata.csv"
TRAIN_JSON = "train.json"
VALID_JSON = "valid.json"
TEST_JSON = "test.json"
WAVS = "wavs"
DURATIONS = "durations"
def prepare_ljspeech(
data_folder,
save_folder,
splits=["train", "valid"],
split_ratio=[90, 10],
model_name=None,
seed=1234,
pitch_n_fft=1024,
pitch_hop_length=256,
pitch_min_f0=65,
pitch_max_f0=400,
skip_prep=False,
use_custom_cleaner=False,
device="cpu",
):
"""
Prepares the csv files for the LJspeech datasets.
Arguments
---------
data_folder : str
Path to the folder where the original LJspeech dataset is stored
save_folder : str
The directory where to store the csv/json files
splits : list
List of dataset splits to prepare
split_ratio : list
Proportion for dataset splits
model_name : str
Model name (used to prepare additional model specific data)
seed : int
Random seed
pitch_n_fft : int
Number of fft points for pitch computation
pitch_hop_length : int
Hop length for pitch computation
pitch_min_f0 : int
Minimum f0 for pitch computation
pitch_max_f0 : int
Max f0 for pitch computation
skip_prep : bool
If True, skip preparation
use_custom_cleaner : bool
If True, uses custom cleaner defined for this recipe
device : str
Device for to be used for computation (used as required)
Returns
-------
None
Example
-------
>>> from recipes.LJSpeech.TTS.ljspeech_prepare import prepare_ljspeech
>>> data_folder = 'data/LJspeech/'
>>> save_folder = 'save/'
>>> splits = ['train', 'valid']
>>> split_ratio = [90, 10]
>>> seed = 1234
>>> prepare_ljspeech(data_folder, save_folder, splits, split_ratio, seed)
"""
# Sets seeds for reproducible code
random.seed(seed)
if skip_prep:
return
# Creating configuration for easily skipping data_preparation stage
conf = {
"data_folder": data_folder,
"splits": splits,
"split_ratio": split_ratio,
"save_folder": save_folder,
"seed": seed,
}
if not os.path.exists(save_folder):
os.makedirs(save_folder)
# Setting output files
meta_csv = os.path.join(data_folder, METADATA_CSV)
wavs_folder = os.path.join(data_folder, WAVS)
save_opt = os.path.join(save_folder, OPT_FILE)
save_json_train = os.path.join(save_folder, TRAIN_JSON)
save_json_valid = os.path.join(save_folder, VALID_JSON)
save_json_test = os.path.join(save_folder, TEST_JSON)
phoneme_alignments_folder = None
duration_folder = None
pitch_folder = None
# Setting up additional folders required for FastSpeech2
if model_name is not None and "FastSpeech2" in model_name:
# This step requires phoneme alignments to be present in the data_folder.
# They are downloaded automatically from https://www.dropbox.com/s/v28x5ldqqa288pu/LJSpeech.zip
# (alternatively, download and unzip the LJSpeech phoneme alignments manually from https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4)
alignment_URL = (
"https://www.dropbox.com/s/v28x5ldqqa288pu/LJSpeech.zip?dl=1"
)
phoneme_alignments_folder = os.path.join(
data_folder, "TextGrid", "LJSpeech"
)
download_file(
alignment_URL, data_folder + "/alignments.zip", unpack=True
)
duration_folder = os.path.join(data_folder, "durations")
if not os.path.exists(duration_folder):
os.makedirs(duration_folder)
# extract pitch for both Fastspeech2 and FastSpeech2WithAligner models
pitch_folder = os.path.join(data_folder, "pitch")
if not os.path.exists(pitch_folder):
os.makedirs(pitch_folder)
# Check if this phase is already done (if so, skip it)
if skip(splits, save_folder, conf):
logger.info("Skipping preparation, completed in previous run.")
return
# Additional check to make sure metadata.csv and wavs folder exists
assert os.path.exists(meta_csv), "metadata.csv does not exist"
assert os.path.exists(wavs_folder), "wavs/ folder does not exist"
# Prepare data splits
msg = "Creating json file for ljspeech Dataset.."
logger.info(msg)
data_split, meta_csv = split_sets(data_folder, splits, split_ratio)
if "train" in splits:
prepare_json(
model_name,
data_split["train"],
save_json_train,
wavs_folder,
meta_csv,
phoneme_alignments_folder,
duration_folder,
pitch_folder,
pitch_n_fft,
pitch_hop_length,
pitch_min_f0,
pitch_max_f0,
use_custom_cleaner,
device,
)
if "valid" in splits:
prepare_json(
model_name,
data_split["valid"],
save_json_valid,
wavs_folder,
meta_csv,
phoneme_alignments_folder,
duration_folder,
pitch_folder,
pitch_n_fft,
pitch_hop_length,
pitch_min_f0,
pitch_max_f0,
use_custom_cleaner,
device,
)
if "test" in splits:
prepare_json(
model_name,
data_split["test"],
save_json_test,
wavs_folder,
meta_csv,
phoneme_alignments_folder,
duration_folder,
pitch_folder,
pitch_n_fft,
pitch_hop_length,
pitch_min_f0,
pitch_max_f0,
use_custom_cleaner,
device,
)
save_pkl(conf, save_opt)
def skip(splits, save_folder, conf):
"""
Detects whether the LJSpeech data preparation has already been done.
If the preparation has been done, we can skip it.
Arguments
---------
splits : list
The portions of data to review.
save_folder : str
The path to the directory containing prepared files.
conf : dict
Configuration to match against saved config.
Returns
-------
bool
If True, the preparation phase can be skipped;
if False, it must be done.
"""
# Checking json files
skip = True
split_files = {
"train": TRAIN_JSON,
"valid": VALID_JSON,
"test": TEST_JSON,
}
for split in splits:
if not os.path.isfile(os.path.join(save_folder, split_files[split])):
skip = False
# Checking saved options
save_opt = os.path.join(save_folder, OPT_FILE)
if skip is True:
if os.path.isfile(save_opt):
opts_old = load_pkl(save_opt)
if opts_old == conf:
skip = True
else:
skip = False
else:
skip = False
return skip
def split_sets(data_folder, splits, split_ratio):
"""Randomly splits the wav list into training, validation, and test lists.
Note that a better approach is to make sure that all the classes have the
same proportion of samples for each session.
Arguments
---------
data_folder : str
The path to the directory containing the data.
splits : list
The list of the selected splits.
split_ratio : list
List of integers that sets the split ratios for the selected splits.
For instance, split_ratio=[80, 10, 10] will assign 80% of the sentences
to training, 10% to validation, and 10% to test.
Returns
-------
(data_split, meta_csv) : tuple
A dictionary containing the selected splits, together with the parsed metadata rows.
"""
meta_csv = os.path.join(data_folder, METADATA_CSV)
csv_reader = csv.reader(
open(meta_csv), delimiter="|", quoting=csv.QUOTE_NONE
)
meta_csv = list(csv_reader)
index_for_sessions = []
session_id_start = "LJ001"
index_this_session = []
for i in range(len(meta_csv)):
session_id = meta_csv[i][0].split("-")[0]
if session_id == session_id_start:
index_this_session.append(i)
if i == len(meta_csv) - 1:
index_for_sessions.append(index_this_session)
else:
index_for_sessions.append(index_this_session)
session_id_start = session_id
index_this_session = [i]
session_len = [len(session) for session in index_for_sessions]
data_split = {}
for i, split in enumerate(splits):
data_split[split] = []
for j in range(len(index_for_sessions)):
if split == "train":
random.shuffle(index_for_sessions[j])
n_snts = int(session_len[j] * split_ratio[i] / sum(split_ratio))
data_split[split].extend(index_for_sessions[j][0:n_snts])
del index_for_sessions[j][0:n_snts]
if split == "valid":
if "test" in splits:
random.shuffle(index_for_sessions[j])
n_snts = int(
session_len[j] * split_ratio[i] / sum(split_ratio)
)
data_split[split].extend(index_for_sessions[j][0:n_snts])
del index_for_sessions[j][0:n_snts]
else:
data_split[split].extend(index_for_sessions[j])
if split == "test":
data_split[split].extend(index_for_sessions[j])
return data_split, meta_csv
def prepare_json(
model_name,
seg_lst,
json_file,
wavs_folder,
csv_reader,
phoneme_alignments_folder,
durations_folder,
pitch_folder,
pitch_n_fft,
pitch_hop_length,
pitch_min_f0,
pitch_max_f0,
use_custom_cleaner=False,
device="cpu",
):
"""
Creates json file given a list of indexes.
Arguments
---------
model_name : str
Model name (used to prepare additional model specific data)
seg_lst : list
The list of json indexes of a given data split
json_file : str
Output json path
wavs_folder : str
LJspeech wavs folder
csv_reader : list
Parsed LJspeech metadata rows (one row per utterance)
phoneme_alignments_folder : path
Path where the phoneme alignments are stored
durations_folder : path
Folder where to store the duration values of each audio
pitch_folder : path
Folder where to store the pitch of each audio
pitch_n_fft : int
Number of fft points for pitch computation
pitch_hop_length : int
Hop length for pitch computation
pitch_min_f0 : int
Minimum f0 for pitch computation
pitch_max_f0 : int
Max f0 for pitch computation
use_custom_cleaner : bool
If True, uses custom cleaner defined for this recipe
device : str
Device to be used for computation (used as required)
"""
logger.info(f"preparing {json_file}.")
if model_name in ["Tacotron2", "FastSpeech2WithAlignment"]:
logger.info(
"Computing phonemes for LJSpeech labels using SpeechBrain G2P. This may take a while."
)
g2p = GraphemeToPhoneme.from_hparams(
"speechbrain/soundchoice-g2p", run_opts={"device": device}
)
if model_name is not None and "FastSpeech2" in model_name:
logger.info(
"Computing pitch as required for FastSpeech2. This may take a while."
)
json_dict = {}
for index in tqdm(seg_lst):
# Common data preparation
row = csv_reader[index]
id = row[0]
wav = os.path.join(wavs_folder, f"{id}.wav")
label = row[2]
if use_custom_cleaner:
label = custom_clean(label, model_name)
json_dict[id] = {
"uttid": id,
"wav": wav,
"label": label,
"segment": True if "train" in json_file else False,
}
# FastSpeech2 specific data preparation
if model_name == "FastSpeech2":
audio, fs = torchaudio.load(wav)
# Parses phoneme alignments
textgrid_path = os.path.join(
phoneme_alignments_folder, f"{id}.TextGrid"
)
textgrid = tgt.io.read_textgrid(
textgrid_path, include_empty_intervals=True
)
last_phoneme_flags = get_last_phoneme_info(
textgrid.get_tier_by_name("words"),
textgrid.get_tier_by_name("phones"),
)
(
phonemes,
duration,
start,
end,
trimmed_last_phoneme_flags,
) = get_alignment(
textgrid.get_tier_by_name("phones"),
fs,
pitch_hop_length,
last_phoneme_flags,
)
# Gets label phonemes
label_phoneme = " ".join(phonemes)
spn_labels = [0] * len(phonemes)
for i in range(1, len(phonemes)):
if phonemes[i] == "spn":
spn_labels[i - 1] = 1
if start >= end:
print(f"Skipping {id}")
continue
# Saves durations
duration_file_path = os.path.join(durations_folder, f"{id}.npy")
np.save(duration_file_path, duration)
# Computes pitch
audio = audio[:, int(fs * start) : int(fs * end)]
pitch_file = wav.replace(".wav", ".npy").replace(
wavs_folder, pitch_folder
)
if not os.path.isfile(pitch_file):
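# frame_time is set to hop_length / fs so that roughly one pitch estimate is
# produced per spectrogram frame; the result is padded and trimmed below to
# match the phoneme durations.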
pitch = torchaudio.functional.detect_pitch_frequency(
waveform=audio,
sample_rate=fs,
frame_time=(pitch_hop_length / fs),
win_length=3,
freq_low=pitch_min_f0,
freq_high=pitch_max_f0,
).squeeze(0)
# Concatenate last element to match duration.
pitch = torch.cat([pitch, pitch[-1].unsqueeze(0)])
# Mean and Variance Normalization
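# NOTE: the mean/std constants below appear to be fixed, corpus-level pitch
# statistics (presumably precomputed over LJSpeech).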
mean = 256.1732939688805
std = 328.319759158607
pitch = (pitch - mean) / std
pitch = pitch[: sum(duration)]
np.save(pitch_file, pitch)
# Updates data for the utterance
json_dict[id].update({"label_phoneme": label_phoneme})
json_dict[id].update({"spn_labels": spn_labels})
json_dict[id].update({"start": start})
json_dict[id].update({"end": end})
json_dict[id].update({"durations": duration_file_path})
json_dict[id].update({"pitch": pitch_file})
json_dict[id].update(
{"last_phoneme_flags": trimmed_last_phoneme_flags}
)
# FastSpeech2WithAlignment specific data preparation
if model_name == "FastSpeech2WithAlignment":
audio, fs = torchaudio.load(wav)
# Computes pitch
pitch_file = wav.replace(".wav", ".npy").replace(
wavs_folder, pitch_folder
)
if not os.path.isfile(pitch_file):
if torchaudio.__version__ < "2.1":
pitch = torchaudio.functional.compute_kaldi_pitch(
waveform=audio,
sample_rate=fs,
frame_length=(pitch_n_fft / fs * 1000),
frame_shift=(pitch_hop_length / fs * 1000),
min_f0=pitch_min_f0,
max_f0=pitch_max_f0,
)[0, :, 0]
else:
pitch = torchaudio.functional.detect_pitch_frequency(
waveform=audio,
sample_rate=fs,
frame_time=(pitch_hop_length / fs),
win_length=3,
freq_low=pitch_min_f0,
freq_high=pitch_max_f0,
).squeeze(0)
# Concatenate last element to match duration.
pitch = torch.cat([pitch, pitch[-1].unsqueeze(0)])
# Mean and Variance Normalization
mean = 256.1732939688805
std = 328.319759158607
pitch = (pitch - mean) / std
np.save(pitch_file, pitch)
phonemes = _g2p_keep_punctuations(g2p, label)
# Updates data for the utterance
json_dict[id].update({"phonemes": phonemes})
json_dict[id].update({"pitch": pitch_file})
# Writing the dictionary to the json file
with open(json_file, mode="w") as json_f:
json.dump(json_dict, json_f, indent=2)
logger.info(f"{json_file} successfully created!")
def get_alignment(tier, sampling_rate, hop_length, last_phoneme_flags):
"""
Returns phonemes, phoneme durations (in frames), start time (in seconds), end time (in seconds), and per-phoneme last-phoneme flags.
This function is adapted from https://github.com/ming024/FastSpeech2/blob/master/preprocessor/preprocessor.py
Arguments
---------
tier : tgt.core.IntervalTier
For an utterance, contains Interval objects for phonemes and their start time and end time in seconds
sampling_rate : int
Sample rate of the audio signal
hop_length : int
Hop length for duration computation
last_phoneme_flags : list
List of (phoneme, flag) tuples with flag=1 if the phoneme is the last phoneme else flag=0
Returns
-------
(phonemes, durations, start_time, end_time, trimmed_last_phoneme_flags) : tuple
The phonemes, durations, start time, end time, and last-phoneme flags for an utterance
"""
sil_phones = ["sil", "sp", "spn", ""]
phonemes = []
durations = []
start_time = 0
end_time = 0
end_idx = 0
trimmed_last_phoneme_flags = []
flag_iter = iter(last_phoneme_flags)
for t in tier._objects:
s, e, p = t.start_time, t.end_time, t.text
current_flag = next(flag_iter)
# Trims leading silences
if phonemes == []:
if p in sil_phones:
continue
else:
start_time = s
if p not in sil_phones:
# For ordinary phones
# Removes stress indicators
if p[-1].isdigit():
phonemes.append(p[:-1])
else:
phonemes.append(p)
trimmed_last_phoneme_flags.append(current_flag[1])
end_time = e
end_idx = len(phonemes)
else:
# Uses a unique token for all silent phones
phonemes.append("spn")
trimmed_last_phoneme_flags.append(current_flag[1])
durations.append(
int(
np.round(e * sampling_rate / hop_length)
- np.round(s * sampling_rate / hop_length)
)
)
# Trims trailing silences
phonemes = phonemes[:end_idx]
durations = durations[:end_idx]
return phonemes, durations, start_time, end_time, trimmed_last_phoneme_flags
def get_last_phoneme_info(words_seq, phones_seq):
"""This function takes word and phoneme tiers from a TextGrid file as input
and provides a list of tuples for the phoneme sequence indicating whether
each of the phonemes is the last phoneme of a word or not.
Each tuple of the returned list has this format: (phoneme, flag)
Arguments
---------
words_seq : tier
word tier from a TextGrid file
phones_seq : tier
phoneme tier from a TextGrid file
Returns
-------
last_phoneme_flags : list
each tuple of the returned list has this format: (phoneme, flag)
"""
# Gets all phoneme objects for the entire sequence
phoneme_objects = phones_seq._objects
phoneme_iter = iter(phoneme_objects)
# Stores flags to show if an element (phoneme) is the last phoneme of a word
last_phoneme_flags = list()
# Matches the end times of the phoneme and word objects to get the last phoneme information
for word_obj in words_seq._objects:
word_end_time = word_obj.end_time
current_phoneme = next(phoneme_iter, None)
while current_phoneme:
phoneme_end_time = current_phoneme.end_time
if phoneme_end_time == word_end_time:
last_phoneme_flags.append((current_phoneme.text, 1))
break
else:
last_phoneme_flags.append((current_phoneme.text, 0))
current_phoneme = next(phoneme_iter, None)
return last_phoneme_flags
def custom_clean(text, model_name):
"""
Uses custom criteria to clean text.
Arguments
---------
text : str
Input text to be cleaned
model_name : str
Model name; punctuation substitutions are skipped for FastSpeech2WithAlignment
Returns
-------
text : str
Cleaned text
"""
_abbreviations = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "missus"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
]
text = unidecode(text.lower())
if model_name != "FastSpeech2WithAlignment":
text = re.sub("[:;]", " - ", text)
text = re.sub(r'[)(\[\]"]', " ", text)
text = text.strip().strip("-")
text = re.sub(" +", " ", text)
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
# K-means (Quantization)
This folder contains recipes for training a K-means clustering model on the LJSpeech dataset.
The model quantizes self-supervised representations into discrete tokens, which can then be used as discrete audio input for various tasks, including classification, ASR, and speech generation.
It supports K-means models built on features from HuBERT, WavLM, or Wav2Vec2.
You can download LJSpeech at https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
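As a rough illustration of how the trained model is used, the sketch below (not part of this recipe; it assumes a scikit-learn `MiniBatchKMeans` backend and uses random features in place of real SSL hidden states) maps frame-level features to discrete unit IDs:
```python
# Minimal sketch: quantizing frame-level features into discrete units.
# Random features stand in for real SSL hidden states (e.g. one HuBERT layer).
import numpy as np
from sklearn.cluster import MiniBatchKMeans

features = np.random.randn(500, 1024)       # [frames, feature_dim]
kmeans = MiniBatchKMeans(n_clusters=128, batch_size=1000, n_init=20)
kmeans.fit(features)                        # in the recipe, fitted over the whole training set
units = kmeans.predict(features)            # [frames] -> discrete IDs in [0, 127]
print(units[:10])
```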
## Installing Extra Dependencies
Before proceeding, ensure you have installed the necessary additional dependencies. To do this, simply run the following command in your terminal:
```
pip install -r extra_requirements.txt
```
# How to run:
To configure the SSL model type and corresponding Hub in your YAML configuration file, follow these steps:
1. Locate the `model_config` section in your YAML file.
2. Modify the `ssl_model_type` field to specify one of the SSL models: "Hubert", "WavLM", or "Wav2Vec2".
3. Update the `ssl_hub` field with the specific name of the SSL Hub associated with your chosen model type.
Here are the supported SSL models along with their corresponding SSL Hubs:
```
ssl_model_type: hubert, wavlm, wav2vec2
ssl_hub:
- facebook/hubert-large-ll60k
- microsoft/wavlm-large
- facebook/wav2vec2-large
```
4. Set the output folder according to the experiments you are running (e.g., `output_folder: !ref results/LJSpeech/clustering/wavlm/<seed>`)
To initiate training using a specific SSL model, execute the following command:
```shell
python train.py hparams/train_discrete_ssl.yaml
```
This command will start the training process using the configurations specified in 'train_discrete_ssl.yaml'.
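If you prefer not to edit the YAML, SpeechBrain also accepts hyperparameter overrides on the command line, so the same switch can be made at launch time (a sketch; adjust the hparams filename to the one in this folder):
```shell
python train.py hparams/train_discrete_ssl.yaml \
    --ssl_model_type=wavlm \
    --ssl_hub=microsoft/wavlm-large \
    --output_folder=results/LJSpeech/clustering/wavlm/1986
```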
# Results
The checkpoints can be found at [this](https://huggingface.co/speechbrain/SSL_Quantization) HuggingFace repository.
# **About SpeechBrain**
- Website: https://speechbrain.github.io/
- Code: https://github.com/speechbrain/speechbrain/
- HuggingFace: https://huggingface.co/speechbrain/
# **Citing SpeechBrain**
Please, cite SpeechBrain if you use it for your research or business.
```bibtex
@misc{ravanelli2024opensourceconversationalaispeechbrain,
title={Open-Source Conversational AI with SpeechBrain 1.0},
author={Mirco Ravanelli and Titouan Parcollet and Adel Moumen and Sylvain de Langen and Cem Subakan and Peter Plantinga and Yingzhi Wang and Pooneh Mousavi and Luca Della Libera and Artem Ploujnikov and Francesco Paissan and Davide Borra and Salah Zaiem and Zeyu Zhao and Shucong Zhang and Georgios Karakasidis and Sung-Lin Yeh and Pierre Champion and Aku Rouhe and Rudolf Braun and Florian Mai and Juan Zuluaga-Gomez and Seyed Mahed Mousavi and Andreas Nautsch and Xuechen Liu and Sangeet Sagar and Jarod Duret and Salima Mdhaffar and Gaelle Laperriere and Mickael Rouvier and Renato De Mori and Yannick Esteve},
year={2024},
eprint={2407.00463},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2407.00463},
}
@misc{speechbrain,
title={{SpeechBrain}: A General-Purpose Speech Toolkit},
author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
year={2021},
eprint={2106.04624},
archivePrefix={arXiv},
primaryClass={eess.AS},
note={arXiv:2106.04624}
}
```
################################
# Recipe for Training K-Means Clustering on LJSpeech Data
# Using Self-Supervised Model-Based Representations
#
# It is used for creating discrete audio representations from LJSpeech data.
#
# Author: Pooneh Mousavi (2023)
################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/LJSpeech/clustering/hubert/<seed>
save_folder: !ref <output_folder>/save
# Data files
data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech-1.1
train_json: !ref <save_folder>/train.json
splits: ["train"]
split_ratio: [80]
skip_prep: False
sample_rate: 16000
# model_config
# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large
ssl_model_type: hubert # hubert, wavlm or wav2vec2
ssl_hub: facebook/hubert-large-ll60k
freeze_feature_extractor: True
freeze_ssl: True
ssl_folder: !ref <save_folder>/hubert_checkpoint
ssl_layer_num: 7
batch_size: 128 # batch_size for loading and extracting features. It is different from kmeans_batch_size.
checkpoint_interval: 100
# Dataloader options
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: True
ssl_model: !apply:speechbrain.utils.hparams.choice
value: !ref <ssl_model_type>
choices:
wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
source: !ref <ssl_hub>
output_norm: False
freeze: !ref <freeze_ssl>
freeze_feature_extractor: !ref <freeze_feature_extractor>
output_all_hiddens: True
save_path: !ref <ssl_folder>
hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
source: !ref <ssl_hub>
output_norm: False
freeze: !ref <freeze_ssl>
freeze_feature_extractor: !ref <freeze_feature_extractor>
output_all_hiddens: True
save_path: !ref <ssl_folder>
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
source: !ref <ssl_hub>
output_norm: False
freeze: !ref <freeze_ssl>
freeze_feature_extractor: !ref <freeze_feature_extractor>
output_all_hiddens: True
save_path: !ref <ssl_folder>
####################
# Model Parameters #
####################
num_clusters: 128
init: k-means++
max_iter: 100
kmeans_batch_size: 1000 # should be >= num_clusters
tol: 0.0
max_no_improvement: 100
n_init: 20
reassignment_ratio: 0.0
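# The clustering settings above correspond to scikit-learn MiniBatchKMeans
# arguments (a sketch, assuming sklearn is the backend behind
# speechbrain.utils.kmeans), roughly:
#   MiniBatchKMeans(n_clusters=128, init="k-means++", max_iter=100,
#                   tol=0.0, max_no_improvement=100, n_init=20,
#                   reassignment_ratio=0.0)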
../ljspeech_prepare.py
"""
Recipe to train K-means clustering model on self-supervised representations.
To run this recipe, do the following:
> python train.py hparams/train_with_[SSL-model].yaml --data_folder=/path/to/LJSpeech
Author
* Pooneh Mousavi 2023
"""
import logging
import os
import sys
import torchaudio
from hyperpyyaml import load_hyperpyyaml
from torch.utils.data import DataLoader
import speechbrain as sb
from speechbrain.dataio.dataloader import LoopedLoader
from speechbrain.utils.distributed import run_on_main
from speechbrain.utils.kmeans import fetch_kmeans_model, save_model, train
logger = logging.getLogger(__name__)
def dataio_prepare(hparams):
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav")
@sb.utils.data_pipeline.provides("sig")
def audio_pipeline(wav):
sig = sb.dataio.dataio.read_audio(wav)
info = torchaudio.info(wav)
resampled = torchaudio.transforms.Resample(
info.sample_rate,
hparams["sample_rate"],
)(sig)
return resampled
datasets = {}
data_info = {
"train": hparams["train_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["id", "sig"],
)
return datasets
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
# Dataset prep (parsing LJSpeech)
from ljspeech_prepare import prepare_ljspeech # noqa
# multi-gpu (ddp) save data preparation
run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
# Load SSL model
hparams["ssl_model"] = hparams["ssl_model"].to(run_opts["device"])
# Make training Dataloader
train_set = dataio_prepare(hparams)["train"]
if not (
isinstance(train_set, DataLoader) or isinstance(train_set, LoopedLoader)
):
train_set = sb.dataio.dataloader.make_dataloader(
train_set, **hparams["train_dataloader_opts"]
)
os.makedirs(hparams["save_folder"], exist_ok=True)
# If you use dataloader checkpoints, make sure to keep all the settings as in the previous run and keep the dataset ordering the same.
dataloader_path = os.path.join(
hparams["save_folder"], "dataloader-TRAIN.ckpt"
)
if os.path.exists(dataloader_path):
logger.info(
f"The dataloader checkpoint is loaded from {dataloader_path}."
)
train_set._speechbrain_load(dataloader_path, False)
# Load pretrained KMeans model if it exists. Otherwise, create new one.
checkpoint_path = os.path.join(
hparams["save_folder"],
f"kmeans-cluster-{hparams['num_clusters']}-layer-{hparams['ssl_layer_num']}.pt",
)
kmeans_model = fetch_kmeans_model(
n_clusters=hparams["num_clusters"],
init=hparams["init"],
max_iter=hparams["max_iter"],
batch_size=hparams["batch_size"],
tol=hparams["tol"],
max_no_improvement=hparams["max_no_improvement"],
n_init=hparams["n_init"],
reassignment_ratio=hparams["reassignment_ratio"],
random_state=hparams["seed"],
checkpoint_path=checkpoint_path,
)
# Train and save Kmeans model
train(
kmeans_model,
train_set,
hparams["ssl_model"],
hparams["save_folder"],
hparams["ssl_layer_num"],
kmeans_batch_size=hparams["kmeans_batch_size"],
device=run_opts["device"],
checkpoint_interval=hparams["checkpoint_interval"],
)
logger.info(f"Saving kmeans model at {checkpoint_path}.")
save_model(kmeans_model, checkpoint_path)
train_set._speechbrain_save(dataloader_path)