Commit 4130a52d authored by changhl

init model

parent eb6a18fd
Pipeline #1617 failed in 0 seconds
# Generated 2024-08-27 from:
# /public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/hparams/train.yaml
# yamllint disable
############################################################################
# Model: Tacotron2
# Tokens: Raw characters (English text)
# Losses: Spectrogram MSE + stop-token (gate) BCE + guided attention
# Training: LJSpeech
# Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
# ############################################################################
###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]
output_folder: ./results/tacotron2/1234
save_folder: /public/home/changhl/taco/logdir
train_log: ./results/tacotron2/1234/train_log.txt
epochs: 750
keep_checkpoint_interval: 50
###################################
# Progress Samples #
###################################
# Progress samples are used to monitor the progress
# of an ongoing training session by outputting samples
# of spectrograms, alignments, etc. at regular intervals
# Whether to enable progress samples
progress_samples: true
# The path where the samples will be stored
progress_sample_path: ./results/tacotron2/1234/samples
# The interval, in epochs. For instance, if it is set to 5,
# progress samples will be output every 5 epochs
progress_samples_interval: 1
# The sample size for raw batch samples saved in batch.pth
# (useful mostly for model debugging)
progress_batch_sample_size: 3
#################################
# Data files and pre-processing #
#################################
data_folder: /public/home/changhl/LJSpeech-1.1
# e.g., /localscratch/ljspeech
train_json: /public/home/changhl/taco/logdir/train.json
valid_json: /public/home/changhl/taco/logdir/valid.json
test_json: /public/home/changhl/taco/logdir/test.json
splits: [train, valid]
split_ratio: [90, 10]
skip_prep: false
# Use the original preprocessing from nvidia
# The cleaners to be used (applicable to nvidia only)
text_cleaners: [english_cleaners]
################################
# Audio Parameters #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: false
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true
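# Note: with hop_length 256 at 22050 Hz the mel spectrogram runs at roughly
# 86 frames per second, and the 1024-sample window spans about 46 ms.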
################################
# Optimization Hyperparameters #
################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 64 #minimum 2
num_workers: 8
mask_padding: true
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0
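# Loss weighting, as wired up in the criterion and scheduler below: the
# guided-attention penalty starts at weight 50.0, appears to be halved roughly
# every 10 epochs by the StepScheduler, and is switched off entirely after
# epoch 50 (guided_attention_hard_stop); the gate (stop-token) BCE keeps a
# constant weight of 1.0.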
train_dataloader_opts:
batch_size: 64
drop_last: false #True #False
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
valid_dataloader_opts:
batch_size: 64
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
test_dataloader_opts:
batch_size: 64
num_workers: 8
collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
################################
# Model Parameters and model #
################################
n_symbols: 148 #fixed depending on symbols in textToSequence
symbols_embedding_dim: 512
# Encoder parameters
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
# Decoder parameters
# The number of frames in the target per encoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: false
# Attention parameters
attention_rnn_dim: 1024
attention_dim: 128
# Location Layer parameters
attention_location_n_filters: 32
attention_location_kernel_size: 31
# Mel-post processing network parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
sample_rate: 22050
hop_length: 256
win_length: 1024
n_fft: 1024
n_mels: 80
f_min: 0.0
f_max: 8000.0
power: 1
normalized: false
norm: slaney
mel_scale: slaney
compression: true
#model
model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2
mask_padding: true
n_mel_channels: 80
# symbols
n_symbols: 148
symbols_embedding_dim: 512
# encoder
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
# attention
attention_rnn_dim: 1024
attention_dim: 128
# attention location
attention_location_n_filters: 32
attention_location_kernel_size: 31
# decoder
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
# postnet
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
decoder_no_early_stopping: false
guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
initial_value: 50.0
half_life: 10.
criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
gate_loss_weight: 1.0
guided_attention_weight: 50.0
guided_attention_sigma: 0.2
guided_attention_scheduler: *id001
guided_attention_hard_stop: 50
modules:
model: *id002
#optimizer
opt_class: !name:torch.optim.Adam
lr: 0.001
weight_decay: 0.000006
#epoch object
epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
limit: 750
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: ./results/tacotron2/1234/train_log.txt
#annealing_function
lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler
#infer: !name:speechbrain.lobes.models.Tacotron2.infer
intervals:
- steps: 6000
lr: 0.0005
- steps: 8000
lr: 0.0003
- steps: 10000
lr: 0.0001
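# Resulting schedule: the Adam learning rate stays at 1e-3 until step 6000,
# then drops to 5e-4, to 3e-4 after step 8000, and to 1e-4 from step 10000
# onwards; the scheduler is advanced once per optimizer step from
# on_fit_batch_end in train.py.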
#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: /public/home/changhl/taco/logdir
recoverables:
model: *id002
counter: *id003
scheduler: *id004
progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
output_path: ./results/tacotron2/1234/samples
batch_sample_size: 3
formats:
raw_batch: raw
2024-08-27 14:39:21,619 - speechbrain.core - INFO - Beginning experiment!
2024-08-27 14:39:21,620 - speechbrain.core - INFO - Experiment folder: ./results/tacotron2/1234
2024-08-27 14:39:22,259 - speechbrain.utils.superpowers - DEBUG - accelerate==0.31.0
addict==2.4.0
aiosignal==1.3.1
aitemplate @ http://10.6.10.68:8000/release/aitemplate/dtk24.04.1/aitemplate-0.0.1%2Bdas1.1.git5d8aa20.dtk2404.torch2.1.0-py3-none-any.whl#sha256=ad763a7cfd3935857cf10a07a2a97899fd64dda481add2f48de8b8930bd341dd
annotated-types==0.7.0
anyio==4.4.0
apex @ http://10.6.10.68:8000/release/apex/dtk24.04.1/apex-1.1.0%2Bdas1.1.gitf477a3a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=85eb662d13d6e6c3b61c2d878378c2338c4479bc03a1912c3eabddc2d9d08aa1
attrs==23.2.0
audioread==3.0.1
bitsandbytes @ http://10.6.10.68:8000/release/bitsandbyte/dtk24.04.1/bitsandbytes-0.42.0%2Bdas1.1.gitce85679.abi1.dtk2404.torch2.1.0-py3-none-any.whl#sha256=6324e330c8d12b858d39f4986c0ed0836fcb05f539cee92a7cf558e17954ae0d
certifi==2024.6.2
cffi==1.17.0
cfgv==3.4.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
contourpy==1.2.1
cycler==0.12.1
decorator==5.1.1
deepspeed @ http://10.6.10.68:8000/release/deepspeed/dtk24.04.1/deepspeed-0.12.3%2Bgita724046.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=2c158ed2dab21f4f09e7fc29776cb43a1593b13cec33168ce3483f318b852fc9
distlib==0.3.8
dnspython==2.6.1
dropout-layer-norm @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/dropout_layer_norm-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ae10c7cc231a8e38492292e91e76ba710d7679762604c0a7f10964b2385cdbd7
einops==0.8.0
email_validator==2.1.1
exceptiongroup==1.2.1
fastapi==0.111.0
fastapi-cli==0.0.4
fastpt @ http://10.6.10.68:8000/release/fastpt/dtk24.04.1/fastpt-1.0.0%2Bdas1.1.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ecf30dadcd2482adb1107991edde19b6559b8237379dbb0a3e6eb7306aad3f9a
filelock==3.15.1
fire==0.6.0
flash-attn @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/flash_attn-2.0.4%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7ca8e78ee0624b1ff0e91e9fc265e61b9510f02123a010ac71a2f8e5d08a62f7
flatbuffers==24.3.25
fonttools==4.53.0
frozenlist==1.4.1
fsspec==2024.6.0
fused-dense-lib @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/fused_dense_lib-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7202dd258a86bb7a1572e3b44b90dae667b0c948bf0f420b05924a107aaaba03
h11==0.14.0
hjson==3.1.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.23.4
humanfriendly==10.0
HyperPyYAML==1.2.2
hypothesis==5.35.1
identify==2.6.0
idna==3.7
importlib_metadata==7.1.0
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.22.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.5
layer-check-pt @ http://10.6.10.68:8000/release/layercheck/dtk24.04.1/layer_check_pt-1.2.3.git59a087a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=807adae2d4d4b74898777f81e1b94f1af4d881afe6a7826c7c910b211accbea7
lazy_loader==0.4
librosa==0.10.2.post1
lightop @ http://10.6.10.68:8000/release/lightop/dtk24.04.1/lightop-0.4%2Bdas1.1git8e60f07.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=2f2c88fd3fe4be179f44c4849e9224cb5b2b259843fc5a2d088e468b7a14c1b1
llvmlite==0.43.0
lmdeploy @ http://10.6.10.68:8000/release/lmdeploy/dtk24.04.1/lmdeploy-0.2.6%2Bdas1.1.git6ba90df.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=92ecee2c8b982f86e5c3219ded24d2ede219f415bf2cd4297f989a03387a203c
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.0
mdurl==0.1.2
mmcv @ http://10.6.10.68:8000/release/mmcv/dtk24.04.1/mmcv-2.0.1%2Bdas1.1.gite58da25.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7a937ae22f81b44d9100907e11303c31bf9a670cb4c92e361675674a41a8a07f
mmengine==0.10.4
mmengine-lite==0.10.4
mpmath==1.3.0
msgpack==1.0.8
networkx==3.3
ninja==1.11.1.1
nodeenv==1.9.1
numba==0.60.0
numpy==1.24.3
onnxruntime @ http://10.6.10.68:8000/release/onnxruntime/dtk24.04.1/onnxruntime-1.15.0%2Bdas1.1.git739f24d.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=d0d24167188d2c85f1ed4110fc43e62ea40c74280716d9b5fe9540256f17869a
opencv-python==4.10.0.82
orjson==3.10.5
packaging==24.1
pandas==2.2.2
peft==0.9.0
pillow==10.3.0
platformdirs==4.2.2
pooch==1.8.2
pre-commit==3.8.0
prometheus_client==0.20.0
protobuf==5.27.1
psutil==5.9.8
py-cpuinfo==9.0.0
pycparser==2.22
pydantic==2.7.4
pydantic_core==2.18.4
Pygments==2.18.0
pygtrie==2.5.0
pynvml==11.5.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.1
ray==2.9.1
referencing==0.35.1
regex==2024.5.15
requests==2.32.3
rich==13.7.1
rotary-emb @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/rotary_emb-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=cc15ec6ae73875515243d7f5c96ab214455a33a4a99eb7f1327f773cae1e6721
rpds-py==0.18.1
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.13.1
sentencepiece==0.2.0
shellingham==1.5.4
shortuuid==1.0.13
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soundfile==0.12.1
soxr==0.5.0
speechbrain==1.0.0
starlette==0.37.2
sympy==1.12.1
termcolor==2.4.0
tgt==1.5
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.15.0
tomli==2.0.1
torch @ http://10.6.10.68:8000/release/pytorch/dtk24.04.1/torch-2.1.0%2Bdas1.1.git3ac1bdd.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=5fd3bcef3aa197c0922727913aca53db9ce3f2fd4a9b22bba1973c3d526377f9
torchaudio @ http://10.6.10.68:8000/release/torchaudio/dtk24.04.1/torchaudio-2.1.2%2Bdas1.1.git63d9a68.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=4fcc556a7a2fffe64ddd57f22e5972b1b2b723f6fdfdaa305bd01551036df38b
torchvision @ http://10.6.10.68:8000/release/vision/dtk24.04.1/torchvision-0.16.0%2Bdas1.1.git7d45932.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=e3032e1bcc0857b54391d66744f97e5cff0dc7e7bb508196356ee927fb81ec01
tqdm==4.66.4
transformers==4.38.0
triton @ http://10.6.10.68:8000/release/triton/dtk24.04.1/triton-2.1.0%2Bdas1.1.git4bf1007a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=4c30d45dab071e65d1704a5cd189b14c4ac20bd59a7061032dfd631b1fc37645
typer==0.12.3
typing_extensions==4.12.2
tzdata==2024.1
ujson==5.10.0
urllib3==2.2.1
uvicorn==0.30.1
uvloop==0.19.0
virtualenv==20.26.3
vllm @ http://10.6.10.68:8000/release/vllm/dtk24.04.1/vllm-0.3.3%2Bdas1.1.gitdf6349c.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=48d265b07efa36f028eca45a3667fa10d3cf30eb1b8f019b62e3b255fb9e49c4
watchfiles==0.22.0
websockets==12.0
xentropy-cuda-lib @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/xentropy_cuda_lib-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=91b058d6a5fd2734a5085d68e08d3a1f948fe9c0119c46885d19f55293e2cce4
xformers @ http://10.6.10.68:8000/release/xformers/dtk24.04.1/xformers-0.0.25%2Bdas1.1.git8ef8bc1.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ca87fd065753c1be3b9fad552eba02d30cd3f4c673f01e81a763834eb5cbb9cc
yapf==0.40.2
zipp==3.19.2
2024-08-27 14:39:22,428 - speechbrain.core - ERROR - Exception:
Traceback (most recent call last):
File "/public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/train.py", line 353, in <module>
from ljspeech_prepare import prepare_ljspeech
File "/public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/ljspeech_prepare.py", line 23, in <module>
from unidecode import unidecode
ModuleNotFoundError: No module named 'unidecode'
2024-08-27 14:41:02,748 - speechbrain.core - INFO - Beginning experiment!
2024-08-27 14:41:02,749 - speechbrain.core - INFO - Experiment folder: ./results/tacotron2/1234
2024-08-27 14:41:03,353 - speechbrain.utils.superpowers - DEBUG - accelerate==0.31.0
addict==2.4.0
aiosignal==1.3.1
aitemplate @ http://10.6.10.68:8000/release/aitemplate/dtk24.04.1/aitemplate-0.0.1%2Bdas1.1.git5d8aa20.dtk2404.torch2.1.0-py3-none-any.whl#sha256=ad763a7cfd3935857cf10a07a2a97899fd64dda481add2f48de8b8930bd341dd
annotated-types==0.7.0
anyio==4.4.0
apex @ http://10.6.10.68:8000/release/apex/dtk24.04.1/apex-1.1.0%2Bdas1.1.gitf477a3a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=85eb662d13d6e6c3b61c2d878378c2338c4479bc03a1912c3eabddc2d9d08aa1
attrs==23.2.0
audioread==3.0.1
bitsandbytes @ http://10.6.10.68:8000/release/bitsandbyte/dtk24.04.1/bitsandbytes-0.42.0%2Bdas1.1.gitce85679.abi1.dtk2404.torch2.1.0-py3-none-any.whl#sha256=6324e330c8d12b858d39f4986c0ed0836fcb05f539cee92a7cf558e17954ae0d
certifi==2024.6.2
cffi==1.17.0
cfgv==3.4.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
contourpy==1.2.1
cycler==0.12.1
decorator==5.1.1
deepspeed @ http://10.6.10.68:8000/release/deepspeed/dtk24.04.1/deepspeed-0.12.3%2Bgita724046.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=2c158ed2dab21f4f09e7fc29776cb43a1593b13cec33168ce3483f318b852fc9
distlib==0.3.8
dnspython==2.6.1
dropout-layer-norm @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/dropout_layer_norm-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ae10c7cc231a8e38492292e91e76ba710d7679762604c0a7f10964b2385cdbd7
einops==0.8.0
email_validator==2.1.1
exceptiongroup==1.2.1
fastapi==0.111.0
fastapi-cli==0.0.4
fastpt @ http://10.6.10.68:8000/release/fastpt/dtk24.04.1/fastpt-1.0.0%2Bdas1.1.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ecf30dadcd2482adb1107991edde19b6559b8237379dbb0a3e6eb7306aad3f9a
filelock==3.15.1
fire==0.6.0
flash-attn @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/flash_attn-2.0.4%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7ca8e78ee0624b1ff0e91e9fc265e61b9510f02123a010ac71a2f8e5d08a62f7
flatbuffers==24.3.25
fonttools==4.53.0
frozenlist==1.4.1
fsspec==2024.6.0
fused-dense-lib @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/fused_dense_lib-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7202dd258a86bb7a1572e3b44b90dae667b0c948bf0f420b05924a107aaaba03
h11==0.14.0
hjson==3.1.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.23.4
humanfriendly==10.0
HyperPyYAML==1.2.2
hypothesis==5.35.1
identify==2.6.0
idna==3.7
importlib_metadata==7.1.0
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.22.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.5
layer-check-pt @ http://10.6.10.68:8000/release/layercheck/dtk24.04.1/layer_check_pt-1.2.3.git59a087a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=807adae2d4d4b74898777f81e1b94f1af4d881afe6a7826c7c910b211accbea7
lazy_loader==0.4
librosa==0.10.2.post1
lightop @ http://10.6.10.68:8000/release/lightop/dtk24.04.1/lightop-0.4%2Bdas1.1git8e60f07.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=2f2c88fd3fe4be179f44c4849e9224cb5b2b259843fc5a2d088e468b7a14c1b1
llvmlite==0.43.0
lmdeploy @ http://10.6.10.68:8000/release/lmdeploy/dtk24.04.1/lmdeploy-0.2.6%2Bdas1.1.git6ba90df.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=92ecee2c8b982f86e5c3219ded24d2ede219f415bf2cd4297f989a03387a203c
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.0
mdurl==0.1.2
mmcv @ http://10.6.10.68:8000/release/mmcv/dtk24.04.1/mmcv-2.0.1%2Bdas1.1.gite58da25.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=7a937ae22f81b44d9100907e11303c31bf9a670cb4c92e361675674a41a8a07f
mmengine==0.10.4
mmengine-lite==0.10.4
mpmath==1.3.0
msgpack==1.0.8
networkx==3.3
ninja==1.11.1.1
nodeenv==1.9.1
numba==0.60.0
numpy==1.24.3
onnxruntime @ http://10.6.10.68:8000/release/onnxruntime/dtk24.04.1/onnxruntime-1.15.0%2Bdas1.1.git739f24d.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=d0d24167188d2c85f1ed4110fc43e62ea40c74280716d9b5fe9540256f17869a
opencv-python==4.10.0.82
orjson==3.10.5
packaging==24.1
pandas==2.2.2
peft==0.9.0
pillow==10.3.0
platformdirs==4.2.2
pooch==1.8.2
pre-commit==3.8.0
prometheus_client==0.20.0
protobuf==5.27.1
psutil==5.9.8
py-cpuinfo==9.0.0
pycparser==2.22
pydantic==2.7.4
pydantic_core==2.18.4
Pygments==2.18.0
pygtrie==2.5.0
pynvml==11.5.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.1
ray==2.9.1
referencing==0.35.1
regex==2024.5.15
requests==2.32.3
rich==13.7.1
rotary-emb @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/rotary_emb-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=cc15ec6ae73875515243d7f5c96ab214455a33a4a99eb7f1327f773cae1e6721
rpds-py==0.18.1
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.13.1
sentencepiece==0.2.0
shellingham==1.5.4
shortuuid==1.0.13
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soundfile==0.12.1
soxr==0.5.0
speechbrain==1.0.0
starlette==0.37.2
sympy==1.12.1
termcolor==2.4.0
tgt==1.5
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.15.0
tomli==2.0.1
torch @ http://10.6.10.68:8000/release/pytorch/dtk24.04.1/torch-2.1.0%2Bdas1.1.git3ac1bdd.abi1.dtk2404-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=5fd3bcef3aa197c0922727913aca53db9ce3f2fd4a9b22bba1973c3d526377f9
torchaudio @ http://10.6.10.68:8000/release/torchaudio/dtk24.04.1/torchaudio-2.1.2%2Bdas1.1.git63d9a68.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=4fcc556a7a2fffe64ddd57f22e5972b1b2b723f6fdfdaa305bd01551036df38b
torchvision @ http://10.6.10.68:8000/release/vision/dtk24.04.1/torchvision-0.16.0%2Bdas1.1.git7d45932.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=e3032e1bcc0857b54391d66744f97e5cff0dc7e7bb508196356ee927fb81ec01
tqdm==4.66.4
transformers==4.38.0
triton @ http://10.6.10.68:8000/release/triton/dtk24.04.1/triton-2.1.0%2Bdas1.1.git4bf1007a.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=4c30d45dab071e65d1704a5cd189b14c4ac20bd59a7061032dfd631b1fc37645
typer==0.12.3
typing_extensions==4.12.2
tzdata==2024.1
ujson==5.10.0
Unidecode==1.3.8
urllib3==2.2.1
uvicorn==0.30.1
uvloop==0.19.0
virtualenv==20.26.3
vllm @ http://10.6.10.68:8000/release/vllm/dtk24.04.1/vllm-0.3.3%2Bdas1.1.gitdf6349c.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=48d265b07efa36f028eca45a3667fa10d3cf30eb1b8f019b62e3b255fb9e49c4
watchfiles==0.22.0
websockets==12.0
xentropy-cuda-lib @ http://10.6.10.68:8000/release/flash_attn/dtk24.04.1/xentropy_cuda_lib-0.1%2Bdas1.1gitc7a8c18.abi1.dtk2404.torch2.1-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=91b058d6a5fd2734a5085d68e08d3a1f948fe9c0119c46885d19f55293e2cce4
xformers @ http://10.6.10.68:8000/release/xformers/dtk24.04.1/xformers-0.0.25%2Bdas1.1.git8ef8bc1.abi1.dtk2404.torch2.1.0-cp310-cp310-manylinux_2_31_x86_64.whl#sha256=ca87fd065753c1be3b9fad552eba02d30cd3f4c673f01e81a763834eb5cbb9cc
yapf==0.40.2
zipp==3.19.2
2024-08-27 14:41:03,824 - ljspeech_prepare - INFO - Creating json file for ljspeech Dataset..
2024-08-27 14:41:03,858 - ljspeech_prepare - INFO - preparing /public/home/changhl/taco/logdir/train.json.
2024-08-27 14:41:05,014 - ljspeech_prepare - INFO - /public/home/changhl/taco/logdir/train.json successfully created!
2024-08-27 14:41:05,017 - ljspeech_prepare - INFO - preparing /public/home/changhl/taco/logdir/valid.json.
2024-08-27 14:41:05,144 - ljspeech_prepare - INFO - /public/home/changhl/taco/logdir/valid.json successfully created!
2024-08-27 14:41:06,035 - speechbrain.core - INFO - Gradscaler enabled: False. Using precision: fp32.
2024-08-27 14:41:06,036 - speechbrain.core - INFO - 28.2M trainable parameters in Tacotron2Brain
2024-08-27 14:41:06,039 - speechbrain.utils.checkpoints - INFO - Would load a checkpoint here, but none found yet.
2024-08-27 14:41:06,039 - speechbrain.utils.epoch_loop - INFO - Going into epoch 1
2024-08-27 14:41:36,638 - speechbrain.core - ERROR - Exception:
Traceback (most recent call last):
File "/public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/train.py", line 379, in <module>
tacotron2_brain.fit(
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1555, in fit
self._fit_train(train_set=train_set, epoch=epoch, enable=enable)
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1384, in _fit_train
loss = self.fit_batch(batch)
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1185, in fit_batch
scaled_loss.backward()
File "/usr/local/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/usr/local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
KeyboardInterrupt
2024-08-27 14:43:04,704 - speechbrain.core - INFO - Beginning experiment!
2024-08-27 14:43:04,704 - speechbrain.core - INFO - Experiment folder: ./results/tacotron2/1234
2024-08-27 14:43:05,329 - ljspeech_prepare - INFO - Skipping preparation, completed in previous run.
2024-08-27 14:43:06,197 - speechbrain.core - INFO - Gradscaler enabled: False. Using precision: fp32.
2024-08-27 14:43:06,198 - speechbrain.core - INFO - 28.2M trainable parameters in Tacotron2Brain
2024-08-27 14:43:06,200 - speechbrain.utils.checkpoints - INFO - Would load a checkpoint here, but none found yet.
2024-08-27 14:43:06,200 - speechbrain.utils.epoch_loop - INFO - Going into epoch 1
2024-08-27 14:44:00,358 - speechbrain.core - ERROR - Exception:
Traceback (most recent call last):
File "/public/home/changhl/tacotron2_pytorch/speechbrain/recipes/LJSpeech/TTS/tacotron2/train.py", line 379, in <module>
tacotron2_brain.fit(
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1555, in fit
self._fit_train(train_set=train_set, epoch=epoch, enable=enable)
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1384, in _fit_train
loss = self.fit_batch(batch)
File "/usr/local/lib/python3.10/site-packages/speechbrain/core.py", line 1185, in fit_batch
scaled_loss.backward()
File "/usr/local/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/usr/local/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
KeyboardInterrupt
# -*- coding: utf-8 -*-
"""
Recipe for training the Tacotron Text-To-Speech model, an end-to-end
neural text-to-speech (TTS) system
To run this recipe, do the following:
# python train.py --device=cuda:0 --max_grad_norm=1.0 --data_folder=/your_folder/LJSpeech-1.1 hparams/train.yaml
To infer, simply load the saved model and call
savemodel.infer(text_sequence, len(text_sequence))
where text_sequence is the output of the text_to_sequence function from
textToSequence.py (from textToSequence import text_to_sequence)
Authors
* Georges Abous-Rjeili 2021
* Artem Ploujnikov 2021
* Yingzhi Wang 2022
"""
import logging
import sys
import torch
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
from speechbrain.utils.data_utils import scalarize
from speechbrain.utils.text_to_sequence import text_to_sequence
logger = logging.getLogger(__name__)
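# The module docstring above suggests running inference by loading a trained
# model and calling model.infer() on a text sequence. A minimal sketch of that
# usage (not part of the training recipe; the helper name `synthesize_mel` is
# hypothetical, and a restored, trained model is assumed):
def synthesize_mel(model, text, text_cleaners=("english_cleaners",)):
    """Converts one sentence to a mel spectrogram with a trained Tacotron2."""
    seq = text_to_sequence(text, list(text_cleaners))
    inputs = torch.LongTensor(seq).unsqueeze(0)  # shape [1, T_text]
    lengths = torch.LongTensor([len(seq)])       # shape [1]
    # Tacotron2.infer returns (mel_outputs_postnet, mel_lengths, alignments),
    # the same unpacking used in run_inference_sample below.
    mel_out, mel_lengths, alignments = model.infer(inputs, lengths)
    return mel_out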
class Tacotron2Brain(sb.Brain):
"""The Brain implementation for Tacotron2"""
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and backend is ddp and initializes statistics
"""
self.hparams.progress_sample_logger.reset()
self.last_epoch = 0
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def compute_forward(self, batch, stage):
"""Computes the forward pass
Arguments
---------
batch: tuple
a single batch, as produced by the TextMelCollate collate function
stage: speechbrain.Stage
the training stage
Returns
-------
the model output
"""
effective_batch = self.batch_to_device(batch)
inputs, y, num_items, _, _ = effective_batch
_, input_lengths, _, _, _ = inputs
max_input_length = input_lengths.max().item()
return self.modules.model(inputs, alignments_dim=max_input_length)
def on_fit_batch_end(self, batch, outputs, loss, should_step):
"""At the end of the optimizer step, apply noam annealing."""
if should_step:
self.hparams.lr_annealing(self.optimizer)
def compute_objectives(self, predictions, batch, stage):
"""Computes the loss given the predicted and targeted outputs.
Arguments
---------
predictions : tuple
The model-generated spectrograms, gate outputs, and alignments from `compute_forward`.
batch : PaddedBatch
This batch object contains all the relevant tensors for computation.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss : torch.Tensor
A one-element tensor used for backpropagating the gradient.
"""
effective_batch = self.batch_to_device(batch)
# Hold on to the batch for the inference sample. This is needed because
# the inference sample is run from on_stage_end only, where
# batch information is not available
self.last_batch = effective_batch
# Hold on to a sample (for logging)
self._remember_sample(effective_batch, predictions)
# Compute the loss
loss = self._compute_loss(predictions, effective_batch, stage)
return loss
def _compute_loss(self, predictions, batch, stage):
"""Computes the value of the loss function and updates stats
Arguments
---------
predictions: tuple
model predictions
batch: PaddedBatch
Inputs for this training iteration.
stage: sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss: torch.Tensor
the loss value
"""
inputs, targets, num_items, labels, wavs = batch
text_padded, input_lengths, _, max_len, output_lengths = inputs
loss_stats = self.hparams.criterion(
predictions, targets, input_lengths, output_lengths, self.last_epoch
)
self.last_loss_stats[stage] = scalarize(loss_stats)
return loss_stats.loss
def _remember_sample(self, batch, predictions):
"""Remembers samples of spectrograms and the batch for logging purposes
Arguments
---------
batch: tuple
a training batch
predictions: tuple
predictions (raw output of the Tacotron model)
"""
inputs, targets, num_items, labels, wavs = batch
text_padded, input_lengths, _, max_len, output_lengths = inputs
mel_target, _ = targets
mel_out, mel_out_postnet, gate_out, alignments = predictions
alignments_max = (
alignments[0]
.max(dim=-1)
.values.max(dim=-1)
.values.unsqueeze(-1)
.unsqueeze(-1)
)
alignments_output = alignments[0].T.flip(dims=(1,)) / alignments_max
self.hparams.progress_sample_logger.remember(
target=self._get_spectrogram_sample(mel_target),
output=self._get_spectrogram_sample(mel_out),
output_postnet=self._get_spectrogram_sample(mel_out_postnet),
alignments=alignments_output,
raw_batch=self.hparams.progress_sample_logger.get_batch_sample(
{
"text_padded": text_padded,
"input_lengths": input_lengths,
"mel_target": mel_target,
"mel_out": mel_out,
"mel_out_postnet": mel_out_postnet,
"max_len": max_len,
"output_lengths": output_lengths,
"gate_out": gate_out,
"alignments": alignments,
"labels": labels,
"wavs": wavs,
}
),
)
def batch_to_device(self, batch):
"""Transfers the batch to the target device
Arguments
---------
batch: tuple
the batch to use
Returns
-------
batch: tuple
the batch on the correct device
"""
(
text_padded,
input_lengths,
mel_padded,
gate_padded,
output_lengths,
len_x,
labels,
wavs,
) = batch
text_padded = text_padded.to(self.device, non_blocking=True).long()
input_lengths = input_lengths.to(self.device, non_blocking=True).long()
max_len = torch.max(input_lengths.data).item()
mel_padded = mel_padded.to(self.device, non_blocking=True).float()
gate_padded = gate_padded.to(self.device, non_blocking=True).float()
output_lengths = output_lengths.to(
self.device, non_blocking=True
).long()
x = (text_padded, input_lengths, mel_padded, max_len, output_lengths)
y = (mel_padded, gate_padded)
len_x = torch.sum(output_lengths)
return (x, y, len_x, labels, wavs)
def _get_spectrogram_sample(self, raw):
"""Converts a raw spectrogram to one that can be saved as an image
sample = sqrt(exp(raw))
Arguments
---------
raw: torch.Tensor
the raw spectrogram (as used in the model)
Returns
-------
sample: torch.Tensor
the spectrogram, for image saving purposes
"""
sample = raw[0]
return torch.sqrt(torch.exp(sample))
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of an epoch.
Arguments
---------
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
stage_loss : float
The average loss for all of the data processed in this stage.
epoch : int
The currently-starting epoch. This is passed
`None` during the test stage.
"""
# Store the train loss until the validation stage.
# At the end of validation, we can write
if stage == sb.Stage.VALID:
# Update learning rate
lr = self.optimizer.param_groups[-1]["lr"]
self.last_epoch = epoch
# The train_logger writes a summary to stdout and to the logfile.
self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr": lr},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
output_progress_sample = (
self.hparams.progress_samples
and epoch % self.hparams.progress_samples_interval == 0
)
if output_progress_sample:
self.run_inference_sample()
self.hparams.progress_sample_logger.save(epoch)
# We also write statistics about test data to stdout and to the logfile.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.progress_samples:
self.run_inference_sample()
self.hparams.progress_sample_logger.save("test")
def run_inference_sample(self):
"""Produces a sample in inference mode. This is called when producing
samples and can be useful because"""
if self.last_batch is None:
return
inputs, _, _, _, _ = self.last_batch
text_padded, input_lengths, _, _, _ = inputs
mel_out, _, _ = self.hparams.model.infer(
text_padded[:1], input_lengths[:1]
)
self.hparams.progress_sample_logger.remember(
inference_mel_out=self._get_spectrogram_sample(mel_out)
)
def dataio_prepare(hparams):
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav", "label")
@sb.utils.data_pipeline.provides("mel_text_pair")
def audio_pipeline(wav, label):
text_seq = torch.IntTensor(
text_to_sequence(label, hparams["text_cleaners"])
)
audio = sb.dataio.dataio.read_audio(wav)
mel = hparams["mel_spectogram"](audio=audio)
len_text = len(text_seq)
return text_seq, mel, len_text
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["mel_text_pair", "wav", "label"],
)
return datasets
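# Note: with splits: [train, valid] in the hparams file, no "test" split is
# prepared, so the evaluate() call at the bottom of this script is skipped.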
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# create ddp_group with the right communication protocol
sb.utils.distributed.ddp_init_group(run_opts)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
datasets = dataio_prepare(hparams)
# Brain class initialization
tacotron2_brain = Tacotron2Brain(
modules=hparams["modules"],
opt_class=hparams["opt_class"],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
# Training
tacotron2_brain.fit(
tacotron2_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Test
if "test" in datasets:
tacotron2_brain.evaluate(
datasets["test"],
test_loader_kwargs=hparams["test_dataloader_opts"],
)
# ################################################
# Basic training parameters for a diffwave vocoder
#
# Author:
# * Yingzhi Wang 2022
# ################################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
data_folder: !PLACEHOLDER
output_folder: !ref ./results/diffwave/<seed>
save_folder: !ref <output_folder>/save
progress_sample_path: !ref <output_folder>/samples
train_log: !ref <output_folder>/train_log.txt
progress_samples_interval: 10
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid"]
split_ratio: [90, 10]
skip_prep: False
# The train logger writes training statistics to a file, as well as stdout.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
keep_checkpoint_interval: 100
# conditional training length
segment_size: 15872
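# 15872 = 62 x 256 (hop_length), i.e. roughly 0.72 s of audio at 22050 Hz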
# Training Parameters
sample_rate: 22050
number_of_epochs: 500
batch_size: 16
num_workers: 8
lr: 0.0002
# diffusion parameters
train_timesteps: 50
beta_start: 0.0001
beta_end: 0.05
fast_sampling: True
fast_sampling_noise_schedule: [0.0001, 0.001, 0.01, 0.05, 0.2, 0.5]
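# When fast_sampling is enabled, inference is expected to use the short
# 6-value noise schedule above instead of all 50 training timesteps.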
loss_l2_steps: 0
adam_beta1: 0.95
adam_beta2: 0.999
adam_weight_decay: 0.000001
adam_epsilon: 0.00000001
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: False
num_workers: !ref <num_workers>
valid_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
test_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
use_tensorboard: False
tensorboard_logs: !ref <output_folder>/logs/
residual_layers: 30
residual_channels: 64
dilation_cycle_length: 10
unconditional: False
# Spectrogram Parameters
spec_n_fft: 1024
spec_f_min: 0
spec_f_max: 8000
mel_normalized: False
spec_n_mels: 80
spec_power: 1
spec_hop_length: 256
spec_win_length: 1024
spec_norm: "slaney"
spec_mel_scale: "slaney"
dynamic_range_compression: True
# Feature extraction
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
sample_rate: !ref <sample_rate>
hop_length: !ref <spec_hop_length>
win_length: !ref <spec_win_length>
n_fft: !ref <spec_n_fft>
n_mels: !ref <spec_n_mels>
f_min: !ref <spec_f_min>
f_max: !ref <spec_f_max>
power: !ref <spec_power>
normalized: !ref <mel_normalized>
norm: !ref <spec_norm>
mel_scale: !ref <spec_mel_scale>
compression: !ref <dynamic_range_compression>
compute_cost: !new:speechbrain.nnet.schedulers.ScheduledLoss
schedule:
- loss_fn: !name:speechbrain.nnet.losses.mse_loss
steps: !ref <loss_l2_steps>
- loss_fn: !name:speechbrain.nnet.losses.l1_loss
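# With loss_l2_steps set to 0, the MSE entry above is effectively skipped and
# the L1 loss is used from the first training step onwards.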
# To use a custom model, replace this `!new` call with one pointing
# to a model class you've defined in a different file.
diffwave: !new:speechbrain.lobes.models.DiffWave.DiffWave
input_channels: !ref <spec_n_mels>
residual_layers: !ref <residual_layers>
residual_channels: !ref <residual_channels>
dilation_cycle_length: !ref <dilation_cycle_length>
total_steps: !ref <train_timesteps>
unconditional: !ref <unconditional>
noise: !new:speechbrain.nnet.diffusion.GaussianNoise
diffusion: !new:speechbrain.lobes.models.DiffWave.DiffWaveDiffusion
model: !ref <diffwave.diffusion_forward>
beta_start: !ref <beta_start>
beta_end: !ref <beta_end>
timesteps: !ref <train_timesteps>
noise: !ref <noise>
# The first object passed to the Brain class is this "Epoch Counter"
# which is saved by the Checkpointer so that training can be resumed
# if it gets interrupted at any point.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <number_of_epochs>
# Objects in "modules" dict will have their parameters moved to the correct
# device, as well as having train()/eval() called on them by the Brain class.
modules:
diffwave: !ref <diffwave>
diffusion: !ref <diffusion>
# This optimizer will be constructed by the Brain class after all parameters
# are moved to the correct device. Then it will be added to the checkpointer.
opt_class: !name:torch.optim.AdamW
lr: !ref <lr>
betas: !ref (<adam_beta1>, <adam_beta2>)
weight_decay: !ref <adam_weight_decay>
eps: !ref <adam_epsilon>
# This scheduler would manage learning rate annealing over the epochs
# (a warmup/cooldown decay schedule); it is left commented out below,
# so the optimizer keeps the fixed lr defined above.
# lr_annealing: !new:speechbrain.nnet.schedulers.WarmCoolDecayLRSchedule
# lr: !ref <lr>
# warmup: !ref <lr_warmup_steps>
# cooldown: !ref <lr_cooldown_steps>
# total_steps: !ref <lr_total_steps>
# This object is used for saving the state of training both so that it
# can be resumed if it gets interrupted, and also so that the best checkpoint
# can be later loaded for evaluation or inference.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
diffwave: !ref <diffwave>
counter: !ref <epoch_counter>
../../../ljspeech_prepare.py
#!/usr/bin/env python3
"""script to train a diffwave vocoder
See https://arxiv.org/pdf/2009.09761.pdf for more details
Authors
* Yingzhi Wang 2022
"""
import logging
import os
import sys
import torch
import torchaudio
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
logger = logging.getLogger(__name__)
class DiffWaveBrain(sb.Brain):
"""Class that manages the training loop. See speechbrain.core.Brain."""
def compute_forward(self, batch, stage):
"""Runs all the computation of that transforms the input into the
output probabilities over the N classes.
Arguments
---------
batch : PaddedBatch
This batch object contains all the relevant tensors for computation.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
predictions : tuple
(noise prediction, target noise, noisy sample, lengths placeholder).
"""
# We first move the batch to the appropriate device.
batch = batch.to(self.device)
x, _ = batch.mel
y, _ = batch.sig
pred, noise, noisy_sample = self.modules.diffusion.train_sample(
y,
timesteps=None,
condition=x,
)
return pred, noise, noisy_sample, None
def compute_objectives(self, predictions, batch, stage):
"""Computes the loss given the predicted and targeted outputs.
Arguments
---------
predictions : tuple
The (noise prediction, target noise, noisy sample, lengths) tuple returned by `compute_forward`.
batch : PaddedBatch
This batch object contains all the relevant tensors for computation.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss : torch.Tensor
A one-element tensor used for backpropagating the gradient.
"""
batch = batch.to(self.device)
x, _ = batch.mel
y, _ = batch.sig
self.last_batch = (x, y)
self._remember_sample(self.last_batch, predictions)
preds, noise, noisy_sample, lens = predictions
loss = self.hparams.compute_cost(
preds.squeeze(1), noise.squeeze(1), length=lens
)
self.last_loss_stats[stage] = {"loss": loss}
return loss
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and the backend is DDP, and initializes statistics.
"""
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def _remember_sample(self, batch, predictions):
"""Remembers samples of spectrograms and the batch for logging purposes
Arguments
---------
batch: tuple
a training batch
predictions: tuple
predictions (raw output of the DiffWave model)
"""
mel, sig = batch
pred, noise, noisy_sample, steps = predictions
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of a stage (TRAIN, VALID, Or TEST)"""
if stage == sb.Stage.VALID:
lr = self.optimizer.param_groups[0]["lr"]
self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr": lr},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# The tensorboard_logger writes a summary to stdout and to the logfile.
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
stats_meta={"Epoch": epoch, "lr": lr},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
end_of_epoch=True,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
if epoch % self.hparams.progress_samples_interval == 0:
self.run_inference_sample("Valid")
# We also write statistics about test data to stdout and to the TensorboardLogger.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
self.run_inference_sample("Test")
def run_inference_sample(self, name):
"""Produces a sample in inference mode. This is called when producing
samples.
"""
with torch.no_grad():
if self.last_batch is None:
return
x, y = self.last_batch
sig_out = self.modules.diffusion.inference(
unconditional=self.hparams.unconditional,
scale=self.hparams.spec_hop_length,
condition=x,
fast_sampling=self.hparams.fast_sampling,
fast_sampling_noise_schedule=self.hparams.fast_sampling_noise_schedule,
)
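# Note (assumption based on the hparam name and the hop-length/mel relation in
# dataio_prepare): the scale argument above tells the sampler how many waveform
# samples to generate per conditioning mel frame.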
spec_out = self.hparams.mel_spectogram(
audio=sig_out.squeeze(1).cpu()
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_audio(
f"{name}/audio_target", y.squeeze(0), self.hparams.sample_rate
)
self.tensorboard_logger.log_audio(
f"{name}/audio_pred",
sig_out.squeeze(0),
self.hparams.sample_rate,
)
self.tensorboard_logger.log_figure(f"{name}/mel_target", x)
self.tensorboard_logger.log_figure(f"{name}/mel_pred", spec_out)
else:
# folder name is the current epoch for validation and "test" for test
folder = (
self.hparams.epoch_counter.current
if name == "Valid"
else "test"
)
self.save_audio("target", y.squeeze(1), folder)
self.save_audio("synthesized", sig_out, folder)
def save_audio(self, name, data, epoch):
"""Saves a single wav
Arguments
---------
name: str
the name of the saved audio
data: torch.Tensor
the wave data to save
epoch: int or str
the epoch number (used in file path calculations)
or "test" for test stage
"""
target_path = os.path.join(
self.hparams.progress_sample_path, str(epoch)
)
if not os.path.exists(target_path):
os.makedirs(target_path)
file_name = f"{name}.wav"
effective_file_name = os.path.join(target_path, file_name)
torchaudio.save(effective_file_name, data.cpu(), 22050)
def dataio_prepare(hparams):
"""This function prepares the datasets to be used in the brain class.
It also defines the data processing pipeline through user-defined functions.
"""
segment_size = hparams["segment_size"]
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav", "segment")
@sb.utils.data_pipeline.provides("mel", "sig")
def audio_pipeline(wav, segment):
audio = sb.dataio.dataio.read_audio(wav)
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0)
if segment:
if audio.size(1) >= segment_size:
max_audio_start = audio.size(1) - segment_size
audio_start = torch.randint(0, max_audio_start, (1,))
audio = audio[:, audio_start : audio_start + segment_size]
else:
audio = torch.nn.functional.pad(
audio, (0, segment_size - audio.size(1)), "constant"
)
mel = hparams["mel_spectogram"](audio=audio.squeeze(0))
# for diffwave the audio length needs to be hop_length * mel_length
audio_length = mel.shape[-1] * hparams["spec_hop_length"]
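# e.g. (illustrative numbers): a 32-frame mel with hop_length 256 gives a target
# length of 8192 samples; the padding below keeps mel and waveform lengths aligned.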
audio = torch.nn.functional.pad(
audio, (0, audio_length - audio.size(1)), "constant"
)
return mel, audio
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["id", "mel", "sig"],
)
return datasets
def check_tensorboard(hparams):
"""Checks whether Tensorboard is enabled and initializes the logger if it is
Arguments
---------
hparams: dict
the hyperparameter dictionary
"""
if hparams["use_tensorboard"]:
try:
from speechbrain.utils.train_logger import TensorboardLogger
hparams["tensorboard_train_logger"] = TensorboardLogger(
hparams["tensorboard_logs"]
)
except ImportError:
logger.warning(
"Could not enable torch.TensorBoard logging - torch.TensorBoard is not available"
)
hparams["use_tensorboard"] = False
# Recipe begins!
if __name__ == "__main__":
# Reading command line arguments.
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
# Initialize ddp (useful only for multi-GPU DDP training).
sb.utils.distributed.ddp_init_group(run_opts)
# Load hyperparameters file with command-line overrides.
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# Check whether Tensorboard is available and enabled
check_tensorboard(hparams)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
# Create dataset objects "train", "valid", and "test".
sys.path.append("../../")
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
datasets = dataio_prepare(hparams)
# Initialize the Brain object to prepare for mask training.
diffusion_brain = DiffWaveBrain(
modules=hparams["modules"],
opt_class=hparams["opt_class"],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
# The `fit()` method iterates the training loop, calling the methods
# necessary to update the parameters of the model. Since all objects
# with changing state are managed by the Checkpointer, training can be
# stopped at any point, and will be resumed on next call.
diffusion_brain.fit(
epoch_counter=diffusion_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Load the best checkpoint for evaluation
if "test" in datasets:
test_stats = diffusion_brain.evaluate(
test_set=datasets["test"],
min_key="error",
test_loader_kwargs=hparams["test_dataloader_opts"],
)
###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref ./results/hifi_gan/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
progress_sample_path: !ref <output_folder>/samples
epochs: 500
keep_checkpoint_interval: 50
use_tensorboard: False
#################################
# Data files and pre-processing #
#################################
data_folder: !PLACEHOLDER # e.g, /datasets/ljspeech
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid"]
split_ratio: [90, 10]
################################
# Audio Parameters #
################################
skip_prep: False
segment_size: 8192
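# Note: 8192 samples / hop_length 256 = 32 mel frames per training segment.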
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000
mel_normalized: False
power: 1
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True
################################
# Optimization Hyperparameters #
################################
learning_rate: 0.0002
weight_decay: 0.9999 # used below as the ExponentialLR gamma (lr decay), not as an optimizer weight decay
adam_b1: 0.8
adam_b2: 0.99
batch_size: 32 #minimum 2
num_workers: 8
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: False
num_workers: !ref <num_workers>
valid_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
test_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
################################
# Model Parameters and model #
################################
# generator params
in_channels: 80
out_channels: 1
###########################################################################################################################################################
# version | resblock_type | upsample_kernel_sizes | upsample_factors | resblock_kernel_sizes | upsample_initial_channel | resblock_dilation_sizes
# 1 | "1" | [16,16,4,4] | [8, 8, 2, 2] | [3, 7, 11] | 512 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
# 2 | "1" | [16,16,4,4] | [8, 8, 2, 2] | [3, 7, 11] | 128 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
# 3 | "2" | [16,16,8] | [8,8,4] | [3,5,7] | 256 | [[1,2], [2,6], [3,12]]
###########################################################################################################################################################
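# The values below correspond to version 1 in the table above; for instance,
# switching to version 2 would only change upsample_initial_channel to 128.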
resblock_type: "1"
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
resblock_kernel_sizes: [3, 7, 11]
upsample_kernel_sizes: [16, 16, 4, 4]
upsample_initial_channel: 512
upsample_factors: [8, 8, 2, 2]
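# Sanity check: the product of upsample_factors (8*8*2*2 = 256) matches hop_length,
# so each input mel frame is upsampled to exactly one hop of audio.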
inference_padding: 5
cond_channels: 0
conv_post_bias: True
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
sample_rate: !ref <sample_rate>
hop_length: !ref <hop_length>
win_length: !ref <win_length>
n_fft: !ref <n_fft>
n_mels: !ref <n_mel_channels>
f_min: !ref <mel_fmin>
f_max: !ref <mel_fmax>
power: !ref <power>
normalized: !ref <mel_normalized>
norm: !ref <norm>
mel_scale: !ref <mel_scale>
compression: !ref <dynamic_range_compression>
generator: !new:speechbrain.lobes.models.HifiGAN.HifiganGenerator
in_channels: !ref <in_channels>
out_channels: !ref <out_channels>
resblock_type: !ref <resblock_type>
resblock_dilation_sizes: !ref <resblock_dilation_sizes>
resblock_kernel_sizes: !ref <resblock_kernel_sizes>
upsample_kernel_sizes: !ref <upsample_kernel_sizes>
upsample_initial_channel: !ref <upsample_initial_channel>
upsample_factors: !ref <upsample_factors>
inference_padding: !ref <inference_padding>
cond_channels: !ref <cond_channels>
conv_post_bias: !ref <conv_post_bias>
discriminator: !new:speechbrain.lobes.models.HifiGAN.HifiganDiscriminator
modules:
generator: !ref <generator>
discriminator: !ref <discriminator>
#generator loss
stft_loss: null
mseg_loss: !new:speechbrain.lobes.models.HifiGAN.MSEGLoss
feat_match_loss: !new:speechbrain.lobes.models.HifiGAN.MelganFeatureLoss
l1_spec_loss: !new:speechbrain.lobes.models.HifiGAN.L1SpecLoss
sample_rate: !ref <sample_rate>
hop_length: !ref <hop_length>
win_length: !ref <win_length>
n_mel_channels: !ref <n_mel_channels>
n_fft: !ref <n_fft>
n_stft: !ref <n_fft> // 2 + 1
mel_fmin: !ref <mel_fmin>
mel_fmax: null
mel_normalized: !ref <mel_normalized>
power: !ref <power>
dynamic_range_compression: !ref <dynamic_range_compression>
generator_loss: !new:speechbrain.lobes.models.HifiGAN.GeneratorLoss
stft_loss: !ref <stft_loss>
stft_loss_weight: 0
mseg_loss: !ref <mseg_loss>
mseg_loss_weight: 1
feat_match_loss: !ref <feat_match_loss>
feat_match_loss_weight: 10
l1_spec_loss: !ref <l1_spec_loss>
l1_spec_loss_weight: 45
#discriminator loss
msed_loss: !new:speechbrain.lobes.models.HifiGAN.MSEDLoss
discriminator_loss: !new:speechbrain.lobes.models.HifiGAN.DiscriminatorLoss
msed_loss: !ref <msed_loss>
#optimizer
opt_class_generator: !name:torch.optim.AdamW
lr: !ref <learning_rate>
betas: [!ref <adam_b1>, !ref <adam_b2>]
opt_class_discriminator: !name:torch.optim.AdamW
lr: !ref <learning_rate>
betas: [!ref <adam_b1>, !ref <adam_b2>]
sch_class_generator: !name:torch.optim.lr_scheduler.ExponentialLR
gamma: !ref <weight_decay>
last_epoch: -1
sch_class_discriminator: !name:torch.optim.lr_scheduler.ExponentialLR
gamma: !ref <weight_decay>
last_epoch: -1
#epoch object
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <epochs>
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
generator: !ref <generator>
discriminator: !ref <discriminator>
counter: !ref <epoch_counter>
../../../ljspeech_prepare.py
#!/usr/bin/env python3
"""Recipe for training a hifi-gan vocoder.
For more details about hifi-gan: https://arxiv.org/pdf/2010.05646.pdf
To run this recipe, do the following:
> python train.py hparams/train.yaml --data_folder /path/to/LJspeech
Authors
* Duret Jarod 2021
* Yingzhi WANG 2022
"""
import copy
import os
import sys
import torch
import torchaudio
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
from speechbrain.utils.data_utils import scalarize
class HifiGanBrain(sb.Brain):
def compute_forward(self, batch, stage):
"""The forward function, generates synthesized waveforms,
calculates the scores and the features of the discriminator
for synthesized waveforms and real waveforms.
Arguments
---------
batch: PaddedBatch
a single batch
stage: speechbrain.Stage
the training stage
Returns
-------
y_g_hat : torch.Tensor
scores_fake : torch.Tensor
feats_fake : torch.Tensor
scores_real : torch.Tensor
feats_real : torch.Tensor
"""
batch = batch.to(self.device)
x, _ = batch.mel
y, _ = batch.sig
# generate synthesized waveforms
y_g_hat = self.modules.generator(x)[:, :, : y.size(2)]
# get scores and features from discriminator for real and synthesized waveforms
scores_fake, feats_fake = self.modules.discriminator(y_g_hat.detach())
scores_real, feats_real = self.modules.discriminator(y)
return (y_g_hat, scores_fake, feats_fake, scores_real, feats_real)
def compute_objectives(self, predictions, batch, stage):
"""Computes and combines generator and discriminator losses"""
batch = batch.to(self.device)
x, _ = batch.mel
y, _ = batch.sig
# Hold on to the batch for the inference sample. This is needed because
# the inference sample is run from on_stage_end only, where
# batch information is not available
self.last_batch = (x, y)
# Hold on to a sample (for logging)
self._remember_sample(self.last_batch, predictions)
y_hat, scores_fake, feats_fake, scores_real, feats_real = predictions
loss_g = self.hparams.generator_loss(
stage, y_hat, y, scores_fake, feats_fake, feats_real
)
loss_d = self.hparams.discriminator_loss(scores_fake, scores_real)
loss = {**loss_g, **loss_d}
self.last_loss_stats[stage] = scalarize(loss)
return loss
def fit_batch(self, batch):
"""Train discriminator and generator adversarially"""
batch = batch.to(self.device)
y, _ = batch.sig
outputs = self.compute_forward(batch, sb.core.Stage.TRAIN)
(y_g_hat, scores_fake, feats_fake, scores_real, feats_real) = outputs
# calculate discriminator loss with the latest updated generator
loss_d = self.compute_objectives(outputs, batch, sb.core.Stage.TRAIN)[
"D_loss"
]
# First train the discriminator
self.optimizer_d.zero_grad()
loss_d.backward()
self.optimizer_d.step()
# calculate generator loss with the latest updated discriminator
scores_fake, feats_fake = self.modules.discriminator(y_g_hat)
scores_real, feats_real = self.modules.discriminator(y)
outputs = (y_g_hat, scores_fake, feats_fake, scores_real, feats_real)
loss_g = self.compute_objectives(outputs, batch, sb.core.Stage.TRAIN)[
"G_loss"
]
# Then train the generator
self.optimizer_g.zero_grad()
loss_g.backward()
self.optimizer_g.step()
return loss_g.detach().cpu()
def evaluate_batch(self, batch, stage):
"""Evaluate one batch"""
out = self.compute_forward(batch, stage=stage)
loss = self.compute_objectives(out, batch, stage=stage)
loss_g = loss["G_loss"]
return loss_g.detach().cpu()
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and the backend is DDP, and initializes statistics.
"""
self.last_epoch = 0
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def init_optimizers(self):
"""Called during ``on_fit_start()``, initialize optimizers
after parameters are fully configured (e.g. DDP, jit).
"""
if self.opt_class is not None:
(
opt_g_class,
opt_d_class,
sch_g_class,
sch_d_class,
) = self.opt_class
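# The order of this 4-tuple must match the list passed as opt_class in __main__:
# generator optimizer, discriminator optimizer, then their respective schedulers.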
self.optimizer_g = opt_g_class(self.modules.generator.parameters())
self.optimizer_d = opt_d_class(
self.modules.discriminator.parameters()
)
self.scheduler_g = sch_g_class(self.optimizer_g)
self.scheduler_d = sch_d_class(self.optimizer_d)
if self.checkpointer is not None:
self.checkpointer.add_recoverable(
"optimizer_g", self.optimizer_g
)
self.checkpointer.add_recoverable(
"optimizer_d", self.optimizer_d
)
self.checkpointer.add_recoverable(
"scheduler_g", self.scheduler_d
)
self.checkpointer.add_recoverable(
"scheduler_d", self.scheduler_d
)
def zero_grad(self, set_to_none=False):
if self.opt_class is not None:
self.optimizer_g.zero_grad(set_to_none)
self.optimizer_d.zero_grad(set_to_none)
def _remember_sample(self, batch, predictions):
"""Remembers samples of spectrograms and the batch for logging purposes
Arguments
---------
batch: tuple
a training batch
predictions: tuple
predictions (raw output of the HiFi-GAN model)
"""
mel, sig = batch
y_hat, scores_fake, feats_fake, scores_real, feats_real = predictions
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of a stage (TRAIN, VALID, Or TEST)"""
if stage == sb.Stage.VALID:
# Update learning rate
self.scheduler_g.step()
self.scheduler_d.step()
lr_g = self.optimizer_g.param_groups[-1]["lr"]
lr_d = self.optimizer_d.param_groups[-1]["lr"]
self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr_g": lr_g, "lr_d": lr_d},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# The tensorboard_logger writes a summary to stdout and to the logfile.
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
stats_meta={"Epoch": epoch, "lr_g": lr_g, "lr_d": lr_d},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=self.last_loss_stats[sb.Stage.VALID],
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
end_of_epoch=True,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
self.run_inference_sample("Valid")
# We also write statistics about test data to stdout and to the TensorboardLogger.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
self.run_inference_sample("Test")
def run_inference_sample(self, name):
"""Produces a sample in inference mode. This is called when producing
samples.
"""
with torch.no_grad():
if self.last_batch is None:
return
x, y = self.last_batch
# Preparing model for inference by removing weight norm
inference_generator = copy.deepcopy(self.hparams.generator)
inference_generator.remove_weight_norm()
sig_out = inference_generator.inference(x)
spec_out = self.hparams.mel_spectogram(
audio=sig_out.squeeze(0).cpu()
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_audio(
f"{name}/audio_target", y.squeeze(0), self.hparams.sample_rate
)
self.tensorboard_logger.log_audio(
f"{name}/audio_pred",
sig_out.squeeze(0),
self.hparams.sample_rate,
)
self.tensorboard_logger.log_figure(f"{name}/mel_target", x)
self.tensorboard_logger.log_figure(f"{name}/mel_pred", spec_out)
else:
# folder name is the current epoch for validation and "test" for test
folder = (
self.hparams.epoch_counter.current
if name == "Valid"
else "test"
)
self.save_audio("target", y.squeeze(0), folder)
self.save_audio("synthesized", sig_out.squeeze(0), folder)
def save_audio(self, name, data, epoch):
"""Saves a single wav
Arguments
---------
name: str
the name of the saved audio
data: torch.Tensor
the wave data to save
epoch: int or str
the epoch number (used in file path calculations)
or "test" for test stage
"""
target_path = os.path.join(
self.hparams.progress_sample_path, str(epoch)
)
if not os.path.exists(target_path):
os.makedirs(target_path)
file_name = f"{name}.wav"
effective_file_name = os.path.join(target_path, file_name)
torchaudio.save(effective_file_name, data.cpu(), 22050)
def dataio_prepare(hparams):
"""This function prepares the datasets to be used in the brain class.
It also defines the data processing pipeline through user-defined functions.
"""
segment_size = hparams["segment_size"]
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav", "segment")
@sb.utils.data_pipeline.provides("mel", "sig")
def audio_pipeline(wav, segment):
audio = sb.dataio.dataio.read_audio(wav)
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0)
if segment:
if audio.size(1) >= segment_size:
max_audio_start = audio.size(1) - segment_size
audio_start = torch.randint(0, max_audio_start, (1,))
audio = audio[:, audio_start : audio_start + segment_size]
else:
audio = torch.nn.functional.pad(
audio, (0, segment_size - audio.size(1)), "constant"
)
mel = hparams["mel_spectogram"](audio=audio.squeeze(0))
return mel, audio
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["id", "mel", "sig"],
)
return datasets
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# create ddp_group with the right communication protocol
sb.utils.distributed.ddp_init_group(run_opts)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
datasets = dataio_prepare(hparams)
# Brain class initialization
hifi_gan_brain = HifiGanBrain(
modules=hparams["modules"],
opt_class=[
hparams["opt_class_generator"],
hparams["opt_class_discriminator"],
hparams["sch_class_generator"],
hparams["sch_class_discriminator"],
],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
if hparams["use_tensorboard"]:
hifi_gan_brain.tensorboard_logger = (
sb.utils.train_logger.TensorboardLogger(
save_dir=hparams["output_folder"] + "/tensorboard"
)
)
# Training
hifi_gan_brain.fit(
hifi_gan_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Test
if "test" in datasets:
hifi_gan_brain.evaluate(
datasets["test"],
test_loader_kwargs=hparams["test_dataloader_opts"],
)
"""
Apply K-means clustering over acoustic features to extract speech units for HiFi-GAN training.
Authors
* Jarod Duret 2023
"""
import json
import logging
import pathlib as pl
import numpy as np
import torch
import torchaudio
from tqdm import tqdm
import speechbrain as sb
from speechbrain.dataio.dataio import load_pkl, save_pkl
from speechbrain.lobes.models.huggingface_transformers import (
hubert,
wav2vec2,
wavlm,
)
from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import (
DiscreteSSL,
)
OPT_FILE = "opt_ljspeech_extract_code.pkl"
TRAIN_JSON = "train.json"
VALID_JSON = "valid.json"
TEST_JSON = "test.json"
ENCODER_CLASSES = {
"HuBERT": hubert.HuBERT,
"Wav2Vec2": wav2vec2.Wav2Vec2,
"WavLM": wavlm.WavLM,
}
def setup_logger():
"""Set up a logger with a log format and logging level."""
log_format = "[%(asctime)s] [%(levelname)s]: %(message)s"
logging.basicConfig(format=log_format, level=logging.INFO)
logger = logging.getLogger(__name__)
return logger
def get_device(use_cuda):
"""Determine and return the appropriate device for computation."""
use_cuda = use_cuda and torch.cuda.is_available()
print("\n" + "=" * 30)
print("USE_CUDA SET TO: {}".format(use_cuda))
print("CUDA AVAILABLE?: {}".format(torch.cuda.is_available()))
print("=" * 30 + "\n")
return torch.device("cuda" if use_cuda else "cpu")
def np_array(tensor):
"""Convert a Pytorch tensor to a Numpy array."""
tensor = tensor.squeeze(0)
tensor = tensor.detach().cpu()
return tensor.numpy()
def skip(splits, save_folder, conf):
"""
Detects if the ljspeech data extraction has already been done.
If the extraction has been done, we can skip it.
Returns
-------
bool
if True, the preparation phase can be skipped.
if False, it must be done.
"""
# Checking json files
skip = True
split_files = {
"train": TRAIN_JSON,
"valid": VALID_JSON,
"test": TEST_JSON,
}
for split in splits:
if not (save_folder / split_files[split]).exists():
skip = False
# Checking saved options
save_opt = save_folder / OPT_FILE
if skip is True:
if save_opt.is_file():
opts_old = load_pkl(save_opt.as_posix())
if opts_old == conf:
skip = True
else:
skip = False
else:
skip = False
return skip
def extract_ljspeech(
data_folder,
splits,
kmeans_folder,
kmeans_dataset,
num_clusters,
encoder_type,
encoder_source,
layer,
encoder_save_folder,
codes_save_folder,
sample_rate=16000,
skip_extract=False,
):
"""
Extract speech units for HiFi-GAN training on the LJspeech datasets.
Arguments
---------
data_folder : str
Path to the folder where the original LJspeech dataset is stored.
splits : list
List of splits to prepare.
kmeans_folder : str
Huggingface repository that contains the pretrained k-means model.
kmeans_dataset : str
Name of the dataset on which the k-means model in the HF repo was trained.
num_clusters : int
Number of clusters of the targeted k-means model to be downloaded.
encoder_type : str
Name of the model used as feature extractor.
encoder_source : str
HuggingFace hub id or URL of the model used as feature extractor.
layer : List[int] (default: [7])
Which SSL layers to use for feature extraction.
encoder_save_folder : str
Path to the folder where the SSL encoder is stored.
codes_save_folder : str
Path to the folder where the extracted tokens are stored.
sample_rate : int
Target sample rate for feature extraction; the audio is resampled to it.
skip_extract : bool
If True, skip extraction.
Example
-------
>>> from recipes.LJSpeech.TTS.vocoder.hifi_gan_unit.extract_code import extract_ljspeech
>>> data_folder = 'data/LJspeech/'
>>> splits = ['train', 'valid']
>>> kmeans_folder = 'speechbrain/SSL_Quantization'
>>> kmeans_dataset = 'LibriSpeech-100-360-500'
>>> num_clusters = 1000
>>> encoder_type = 'HuBERT'
>>> encoder_source = 'facebook/hubert-large-ll60k'
>>> layer = [7]
>>> encoder_save_folder = 'ssl_encoder/'
>>> codes_save_folder = 'codes/'
>>> extract_ljspeech(data_folder, splits, kmeans_folder, kmeans_dataset, num_clusters, encoder_type, encoder_source, layer, encoder_save_folder, codes_save_folder)
"""
logger = setup_logger()
if skip_extract:
return
# Create configuration for easily skipping code extraction stage
conf = {
"data_folder": data_folder,
"splits": splits,
"save_folder": codes_save_folder,
"kmeans_folder": kmeans_folder,
"encoder_type": encoder_type,
"encoder_source": encoder_source,
"layer": layer,
}
codes_save_folder = pl.Path(codes_save_folder)
# Check if this phase is already done (if so, skip it)
if skip(splits, codes_save_folder, conf):
logger.info("Skipping code extraction, completed in previous run.")
return
# Fetch device
device = get_device(use_cuda=True)
save_opt = codes_save_folder / OPT_FILE
data_folder = pl.Path(data_folder)
encoder_save_folder = pl.Path(encoder_save_folder)
codes_save_folder.mkdir(parents=True, exist_ok=True)
logger.info(f"Loading encoder: {encoder_source} ...")
if encoder_type not in ENCODER_CLASSES:
raise TypeError("Not a supported Encoder")
encoder_class = ENCODER_CLASSES[encoder_type]
encoder = encoder_class(
source=encoder_source,
save_path=encoder_save_folder.as_posix(),
output_norm=False,
freeze=True,
freeze_feature_extractor=True,
apply_spec_augment=False,
output_all_hiddens=True,
).to(device)
discrete_encoder = DiscreteSSL(
save_path=encoder_save_folder.as_posix(),
ssl_model=encoder,
kmeans_dataset=kmeans_dataset,
kmeans_repo_id=kmeans_folder,
num_clusters=num_clusters,
)
for split in splits:
dataset_path = data_folder / f"{split}.json"
logger.info(f"Reading dataset from {dataset_path} ...")
meta_json = json.load(open(dataset_path))
for key in tqdm(meta_json.keys()):
item = meta_json[key]
wav = item["wav"]
with torch.no_grad():
info = torchaudio.info(wav)
audio = sb.dataio.dataio.read_audio(wav)
audio = torchaudio.transforms.Resample(
info.sample_rate,
sample_rate,
)(audio)
audio = audio.unsqueeze(0).to(device)
deduplicates = [False for _ in layer]
bpe_tokenizers = [None for _ in layer]
tokens, _, _ = discrete_encoder(
audio,
SSL_layers=layer,
deduplicates=deduplicates,
bpe_tokenizers=bpe_tokenizers,
)
tokens = np_array(tokens.squeeze(0))
np.save(codes_save_folder / f"{key}.npy", tokens)
logger.info("Extraction completed.")
save_pkl(conf, save_opt)
############################################################################
# Model: Unit HiFi-GAN
# Tokens: discrete speech units (K-means)
# Training: LJSpeech (English)
# Authors: Jarod Duret, Yingzhi Wang
# ############################################################################
###################################
# Experiment Parameters and setup #
###################################
seed: 4321
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref ./results/hifi_gan/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
progress_sample_path: !ref <output_folder>/samples
epochs: 200
keep_checkpoint_interval: 50
use_tensorboard: False
#################################
# Data files and pre-processing #
#################################
data_folder: !PLACEHOLDER # e.g, /datasets/ljspeech
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid", "test"]
split_ratio: [80, 10, 10]
skip_prep: False
########################################################
# Encoder | HF model #
#------------------------------------------------------#
# HuBERT | facebook/hubert-large-ll60k #
# Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self #
# WavLM | microsoft/wavlm-large #
########################################################
kmeans_folder: speechbrain/SSL_Quantization
kmeans_dataset: LibriSpeech-100-360-500
codes_save_folder: !ref <save_folder>/codes
encoder_type: HuBERT
encoder_hub: facebook/hubert-large-ll60k
encoder_save_folder: !ref <save_folder>/ssl_encoder
layer: [1, 3, 7, 12, 18, 23]
num_clusters: 1000
skip_extract: False
################################
# Audio Parameters #
################################
segment_size: 8960
code_hop_size: 320
sample_rate: 16000
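# Note: at 16 kHz, a code_hop_size of 320 samples is one discrete unit every 20 ms;
# segment_size 8960 therefore covers exactly 28 units (8960 / 320).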
layer_drop: True
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000
mel_normalized: False
power: 1
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True
################################
# Optimization Hyperparameters #
################################
learning_rate: 0.0002
weight_decay: 0.9999 # used below as the ExponentialLR gamma (lr decay), not as an optimizer weight decay
adam_b1: 0.8
adam_b2: 0.99
batch_size: 32 #minimum 32
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: False
num_workers: 8
valid_dataloader_opts:
batch_size: 1
num_workers: 8
test_dataloader_opts:
batch_size: 1
num_workers: 8
################################
# Model Parameters and model #
################################
duration_predictor: False
# embedding params
vocab_size: 6001 # K-means size * num layer + 1 for padding 1000x6+1
embedding_dim: 128
# generator params
in_channels: 128
out_channels: 1
var_pred_hidden_dim: 128
var_pred_kernel_size: 3
var_pred_dropout: 0.5
###########################################################################################################################################################
# version | resblock_type | upsample_kernel_sizes | upsample_factors | resblock_kernel_sizes | upsample_initial_channel | resblock_dilation_sizes
# 1 | "1" | [16,16,4,4] | [8, 8, 2, 2] | [3, 7, 11] | 512 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
# 2 | "1" | [16,16,4,4] | [8, 8, 2, 2] | [3, 7, 11] | 128 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
# 3 | "2" | [16,16,8] | [8,8,4] | [3,5,7] | 256 | [[1,2], [2,6], [3,12]]
###########################################################################################################################################################
resblock_type: "1"
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
resblock_kernel_sizes: [3, 7, 11]
upsample_kernel_sizes: [11, 8, 8, 4, 4]
upsample_initial_channel: 512
upsample_factors: [5, 4, 4, 2, 2]
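# Sanity check: the product of upsample_factors (5*4*4*2*2 = 320) matches code_hop_size,
# so each discrete unit is expanded to exactly one code hop of audio.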
inference_padding: 5
cond_channels: 0
conv_post_bias: True
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
sample_rate: !ref <sample_rate>
hop_length: !ref <hop_length>
win_length: !ref <win_length>
n_fft: !ref <n_fft>
n_mels: !ref <n_mel_channels>
f_min: !ref <mel_fmin>
f_max: !ref <mel_fmax>
power: !ref <power>
normalized: !ref <mel_normalized>
norm: !ref <norm>
mel_scale: !ref <mel_scale>
compression: !ref <dynamic_range_compression>
generator: !new:speechbrain.lobes.models.HifiGAN.UnitHifiganGenerator
in_channels: !ref <in_channels>
out_channels: !ref <out_channels>
resblock_type: !ref <resblock_type>
resblock_dilation_sizes: !ref <resblock_dilation_sizes>
resblock_kernel_sizes: !ref <resblock_kernel_sizes>
upsample_kernel_sizes: !ref <upsample_kernel_sizes>
upsample_initial_channel: !ref <upsample_initial_channel>
upsample_factors: !ref <upsample_factors>
inference_padding: !ref <inference_padding>
cond_channels: !ref <cond_channels>
conv_post_bias: !ref <conv_post_bias>
vocab_size: !ref <vocab_size>
embedding_dim: !ref <embedding_dim>
duration_predictor: !ref <duration_predictor>
var_pred_hidden_dim: !ref <var_pred_hidden_dim>
var_pred_kernel_size: !ref <var_pred_kernel_size>
var_pred_dropout: !ref <var_pred_dropout>
discriminator: !new:speechbrain.lobes.models.HifiGAN.HifiganDiscriminator
modules:
generator: !ref <generator>
discriminator: !ref <discriminator>
#generator loss
stft_loss: null
mseg_loss: !new:speechbrain.lobes.models.HifiGAN.MSEGLoss
feat_match_loss: !new:speechbrain.lobes.models.HifiGAN.MelganFeatureLoss
l1_spec_loss: !new:speechbrain.lobes.models.HifiGAN.L1SpecLoss
sample_rate: !ref <sample_rate>
hop_length: !ref <hop_length>
win_length: !ref <win_length>
n_mel_channels: !ref <n_mel_channels>
n_fft: !ref <n_fft>
n_stft: !ref <n_fft> // 2 + 1
mel_fmin: !ref <mel_fmin>
mel_fmax: null
mel_normalized: !ref <mel_normalized>
power: !ref <power>
dynamic_range_compression: !ref <dynamic_range_compression>
mseg_dur_loss: False
generator_loss: !new:speechbrain.lobes.models.HifiGAN.GeneratorLoss
stft_loss: !ref <stft_loss>
stft_loss_weight: 0
mseg_loss: !ref <mseg_loss>
mseg_loss_weight: 1
feat_match_loss: !ref <feat_match_loss>
feat_match_loss_weight: 10
l1_spec_loss: !ref <l1_spec_loss>
l1_spec_loss_weight: 45
mseg_dur_loss: !ref <mseg_dur_loss>
mseg_dur_loss_weight: 1
#discriminator loss
msed_loss: !new:speechbrain.lobes.models.HifiGAN.MSEDLoss
discriminator_loss: !new:speechbrain.lobes.models.HifiGAN.DiscriminatorLoss
msed_loss: !ref <msed_loss>
#optimizer
opt_class_generator: !name:torch.optim.AdamW
lr: !ref <learning_rate>
betas: [!ref <adam_b1>, !ref <adam_b2>]
opt_class_discriminator: !name:torch.optim.AdamW
lr: !ref <learning_rate>
betas: [!ref <adam_b1>, !ref <adam_b2>]
sch_class_generator: !name:torch.optim.lr_scheduler.ExponentialLR
gamma: !ref <weight_decay>
last_epoch: -1
sch_class_discriminator: !name:torch.optim.lr_scheduler.ExponentialLR
gamma: !ref <weight_decay>
last_epoch: -1
#epoch object
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <epochs>
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
generator: !ref <generator>
discriminator: !ref <discriminator>
counter: !ref <epoch_counter>
../../../ljspeech_prepare.py
#!/usr/bin/env python3
"""Recipe for training a hifi-gan vocoder on self-supervised representations.
For more details about hifi-gan: https://arxiv.org/pdf/2010.05646.pdf
For more details about speech synthesis using self-supervised representations: https://arxiv.org/pdf/2104.00355.pdf
To run this recipe, do the following:
> python train.py hparams/train.yaml --data_folder=/path/to/LJspeech
Authors
* Jarod Duret 2023
* Yingzhi WANG 2022
"""
import copy
import pathlib as pl
import random
import sys
import numpy as np
import torch
import torchaudio
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
from speechbrain.utils.data_utils import scalarize
class HifiGanBrain(sb.Brain):
def compute_forward(self, batch, stage):
"""The forward function, generates synthesized waveforms,
calculates the scores and the features of the discriminator
for synthesized waveforms and real waveforms.
Arguments
---------
batch : torch.Tensor or tensors
An element from the dataloader, including inputs for processing.
stage : Stage
The stage of the experiment: Stage.TRAIN, Stage.VALID, Stage.TEST
"""
batch = batch.to(self.device)
x, _ = batch.code
y, _ = batch.sig
# generate synthesized waveforms
y_g_hat, (log_dur_pred, log_dur) = self.modules.generator(x)
y_g_hat = y_g_hat[:, :, : y.size(2)]
# get scores and features from discriminator for real and synthesized waveforms
scores_fake, feats_fake = self.modules.discriminator(y_g_hat.detach())
scores_real, feats_real = self.modules.discriminator(y)
return (
y_g_hat,
scores_fake,
feats_fake,
scores_real,
feats_real,
log_dur_pred,
log_dur,
)
def compute_objectives(self, predictions, batch, stage):
"""Computes the loss given the predicted and targeted outputs.
Arguments
---------
predictions : tuple
The generated waveforms, discriminator scores/features, and duration terms returned by `compute_forward`.
batch : PaddedBatch
This batch object contains all the relevant tensors for computation.
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
Returns
-------
loss : torch.Tensor
A one-element tensor used for backpropagating the gradient.
"""
batch = batch.to(self.device)
x, _ = batch.code
y, y_lens = batch.sig
# Hold on to the batch for the inference sample. This is needed because
# the inference sample is run from on_stage_end only, where
# batch information is not available
self.last_batch = (x, y)
(
y_hat,
scores_fake,
feats_fake,
scores_real,
feats_real,
log_dur_pred,
log_dur,
) = predictions
loss_g = self.hparams.generator_loss(
stage,
y_hat,
y,
scores_fake,
feats_fake,
feats_real,
log_dur_pred,
log_dur,
)
loss_d = self.hparams.discriminator_loss(scores_fake, scores_real)
loss = {**loss_g, **loss_d}
self.last_loss_stats[stage] = scalarize(loss)
return loss
def fit_batch(self, batch):
"""Fits a single batch.
Arguments
---------
batch: tuple
a training batch
Returns
-------
loss: torch.Tensor
detached loss
"""
batch = batch.to(self.device)
y, _ = batch.sig
outputs = self.compute_forward(batch, sb.core.Stage.TRAIN)
(
y_g_hat,
scores_fake,
feats_fake,
scores_real,
feats_real,
log_dur_pred,
log_dur,
) = outputs
# calculate discriminator loss with the latest updated generator
loss_d = self.compute_objectives(outputs, batch, sb.core.Stage.TRAIN)[
"D_loss"
]
# First train the discriminator
self.optimizer_d.zero_grad()
loss_d.backward()
self.optimizer_d.step()
# calculate generator loss with the latest updated discriminator
scores_fake, feats_fake = self.modules.discriminator(y_g_hat)
scores_real, feats_real = self.modules.discriminator(y)
outputs = (
y_g_hat,
scores_fake,
feats_fake,
scores_real,
feats_real,
log_dur_pred,
log_dur,
)
loss_g = self.compute_objectives(outputs, batch, sb.core.Stage.TRAIN)[
"G_loss"
]
# Then train the generator
self.optimizer_g.zero_grad()
loss_g.backward()
self.optimizer_g.step()
return loss_g.detach().cpu()
def evaluate_batch(self, batch, stage):
"""Evaluate one batch.
Arguments
---------
batch : list of torch.Tensors
Batch of data to use for evaluation. Default implementation assumes
this batch has two elements: inputs and targets.
stage : Stage
The stage of the experiment: Stage.VALID, Stage.TEST
Returns
-------
detached loss
"""
out = self.compute_forward(batch, stage=stage)
loss = self.compute_objectives(out, batch, stage=stage)
loss_g = loss["G_loss"]
return loss_g.detach().cpu()
def on_fit_start(self):
"""Gets called at the beginning of ``fit()``, on multiple processes
if ``distributed_count > 0`` and the backend is DDP, and initializes statistics.
"""
self.last_epoch = 0
self.last_batch = None
self.last_loss_stats = {}
return super().on_fit_start()
def init_optimizers(self):
"""Called during ``on_fit_start()``, initialize optimizers
after parameters are fully configured (e.g. DDP, jit).
"""
if self.opt_class is not None:
(
opt_g_class,
opt_d_class,
sch_g_class,
sch_d_class,
) = self.opt_class
self.optimizer_g = opt_g_class(self.modules.generator.parameters())
self.optimizer_d = opt_d_class(
self.modules.discriminator.parameters()
)
self.optimizers_dict = {
"optimizer_g": self.optimizer_g,
"optimizer_d": self.optimizer_d,
}
self.scheduler_g = sch_g_class(self.optimizer_g)
self.scheduler_d = sch_d_class(self.optimizer_d)
if self.checkpointer is not None:
self.checkpointer.add_recoverable(
"optimizer_g", self.optimizer_g
)
self.checkpointer.add_recoverable(
"optimizer_d", self.optimizer_d
)
self.checkpointer.add_recoverable(
"scheduler_g", self.scheduler_d
)
self.checkpointer.add_recoverable(
"scheduler_d", self.scheduler_d
)
def on_stage_end(self, stage, stage_loss, epoch):
"""Gets called at the end of an epoch.
Arguments
---------
stage : sb.Stage
One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
stage_loss : float
The average loss for all of the data processed in this stage.
epoch : int
The currently-starting epoch. This is passed
`None` during the test stage.
"""
if stage == sb.Stage.VALID:
# Update learning rate
self.scheduler_g.step()
self.scheduler_d.step()
lr_g = self.optimizer_g.param_groups[-1]["lr"]
lr_d = self.optimizer_d.param_groups[-1]["lr"]
stats = {
**self.last_loss_stats[sb.Stage.VALID],
}
self.hparams.train_logger.log_stats(
stats_meta={"Epoch": epoch, "lr_g": lr_g, "lr_d": lr_d},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=stats,
)
# The tensorboard_logger writes a summary to stdout and to the logfile.
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
stats_meta={"Epoch": epoch, "lr_g": lr_g, "lr_d": lr_d},
train_stats=self.last_loss_stats[sb.Stage.TRAIN],
valid_stats=stats,
)
# Save the current checkpoint and delete previous checkpoints.
epoch_metadata = {
**{"epoch": epoch},
**self.last_loss_stats[sb.Stage.VALID],
}
if self.checkpointer is not None:
self.checkpointer.save_and_keep_only(
meta=epoch_metadata,
end_of_epoch=True,
min_keys=["loss"],
ckpt_predicate=(
(
lambda ckpt: (
ckpt.meta["epoch"]
% self.hparams.keep_checkpoint_interval
!= 0
)
)
if self.hparams.keep_checkpoint_interval is not None
else None
),
)
self.run_inference_sample("Valid", epoch)
# We also write statistics about test data to stdout and to the TensorboardLogger.
if stage == sb.Stage.TEST:
self.hparams.train_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_stats(
{"Epoch loaded": self.hparams.epoch_counter.current},
test_stats=self.last_loss_stats[sb.Stage.TEST],
)
self.run_inference_sample("Test", epoch)
def run_inference_sample(self, name, epoch):
"""Produces a sample in inference mode.
This is called when producing samples.
Arguments
---------
name: str
the name of the saved audio folder
epoch: int or str
the epoch number (used in file path calculations)
or "test" for test stage
"""
with torch.no_grad():
if self.last_batch is None:
return
x, y = self.last_batch
# Preparing model for inference by removing weight norm
inference_generator = copy.deepcopy(self.hparams.generator)
inference_generator.remove_weight_norm()
if inference_generator.duration_predictor:
x = torch.unique_consecutive(x, dim=1)
sig_out = inference_generator.inference(x)
spec_out = self.hparams.mel_spectogram(
audio=sig_out.squeeze(0).cpu()
)
if self.hparams.use_tensorboard:
self.tensorboard_logger.log_audio(
f"{name}/audio_target", y.squeeze(0), self.hparams.sample_rate
)
self.tensorboard_logger.log_audio(
f"{name}/audio_pred",
sig_out.squeeze(0),
self.hparams.sample_rate,
)
self.tensorboard_logger.log_figure(f"{name}/mel_target", x)
self.tensorboard_logger.log_figure(f"{name}/mel_pred", spec_out)
else:
# folder name is the current epoch for validation and "test" for test
folder = (
self.hparams.epoch_counter.current
if name == "Valid"
else "test"
)
self.save_audio("target", y.squeeze(0), folder)
self.save_audio("synthesized", sig_out.squeeze(0), folder)
def save_audio(self, name, data, epoch):
"""Saves a single wav file.
Arguments
---------
name: str
the name of the saved audio
data: torch.Tensor
the wave data to save
epoch: int or str
the epoch number (used in file path calculations)
or "test" for test stage
"""
target_path = pl.Path(self.hparams.progress_sample_path) / str(epoch)
target_path.mkdir(parents=True, exist_ok=True)
file_name = target_path / f"{name}.wav"
torchaudio.save(file_name.as_posix(), data.cpu(), 16000)
def sample_interval(seqs, segment_size):
"This function sample an interval of audio and code according to segment size."
N = max([v.shape[-1] for v in seqs])
seq_len = segment_size if segment_size > 0 else N
hops = [N // v.shape[-1] for v in seqs]
lcm = np.lcm.reduce(hops)
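# Explanatory note: hops[i] is how many elements of the longest sequence correspond
# to one element of sequence i; aligning start indices on multiples of their lcm
# keeps the sampled audio window and the sampled code window in sync.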
interval_start = 0
interval_end = N // lcm - seq_len // lcm
start_step = random.randint(interval_start, interval_end)
new_seqs = []
for i, v in enumerate(seqs):
start = start_step * (lcm // hops[i])
end = (start_step + seq_len // lcm) * (lcm // hops[i])
new_seqs += [v[..., start:end]]
return new_seqs
def dataio_prepare(hparams):
"""This function prepares the datasets to be used in the brain class.
It also defines the data processing pipeline through user-defined functions.
"""
segment_size = hparams["segment_size"]
code_hop_size = hparams["code_hop_size"]
codes_folder = pl.Path(hparams["codes_save_folder"])
# Define audio pipeline:
@sb.utils.data_pipeline.takes("id", "wav", "segment")
@sb.utils.data_pipeline.provides("code", "sig")
def audio_pipeline(utt_id, wav, segment):
info = torchaudio.info(wav)
audio = sb.dataio.dataio.read_audio(wav)
audio = torchaudio.transforms.Resample(
info.sample_rate,
hparams["sample_rate"],
)(audio)
code = np.load(codes_folder / f"{utt_id}.npy")
num_layer = len(hparams["layer"])
offsets = np.arange(num_layer) * hparams["num_clusters"]
code = code + offsets + 1
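# Each SSL layer gets its own token range: layer j's cluster ids are shifted by
# j * num_clusters, and the +1 reserves index 0 for padding/dropped layers, which
# is why vocab_size is num_clusters * len(layer) + 1 in the hparams.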
if hparams["layer_drop"]:
num_layers_to_drop = np.random.randint(0, code.shape[1])
if num_layers_to_drop > 0:
layers_to_drop = np.random.choice(
code.shape[1], size=num_layers_to_drop, replace=False
)
code[:, layers_to_drop] = 0
code = torch.IntTensor(code)
# Trim end of audio
code_length = min(audio.shape[0] // code_hop_size, code.shape[0])
code = code[:code_length]
audio = audio[: code_length * code_hop_size]
while audio.shape[0] < segment_size:
audio = torch.hstack([audio, audio])
code = torch.hstack([code, code])
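# The loop above tiles utterances shorter than segment_size (audio and codes
# together) so that a full training segment can always be sampled.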
audio = audio.unsqueeze(0)
if segment:
code = code.swapdims(0, 1)
audio, code = sample_interval([audio, code], segment_size)
code = code.swapdims(0, 1)
return code, audio
datasets = {}
data_info = {
"train": hparams["train_json"],
"valid": hparams["valid_json"],
"test": hparams["test_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["id", "code", "sig"],
)
return datasets
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# If --distributed_launch then
# create ddp_group with the right communication protocol
sb.utils.distributed.ddp_init_group(run_opts)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
from ljspeech_prepare import prepare_ljspeech
sb.utils.distributed.run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
from extract_code import extract_ljspeech
sb.utils.distributed.run_on_main(
extract_ljspeech,
kwargs={
"data_folder": hparams["save_folder"],
"splits": hparams["splits"],
"kmeans_folder": hparams["kmeans_folder"],
"kmeans_dataset": hparams["kmeans_dataset"],
"num_clusters": hparams["num_clusters"],
"encoder_type": hparams["encoder_type"],
"encoder_source": hparams["encoder_hub"],
"layer": hparams["layer"],
"encoder_save_folder": hparams["encoder_save_folder"],
"codes_save_folder": hparams["codes_save_folder"],
"sample_rate": hparams["sample_rate"],
"skip_extract": hparams["skip_extract"],
},
)
datasets = dataio_prepare(hparams)
# Brain class initialization
hifi_gan_brain = HifiGanBrain(
modules=hparams["modules"],
opt_class=[
hparams["opt_class_generator"],
hparams["opt_class_discriminator"],
hparams["sch_class_generator"],
hparams["sch_class_discriminator"],
],
hparams=hparams,
run_opts=run_opts,
checkpointer=hparams["checkpointer"],
)
if hparams["use_tensorboard"]:
hifi_gan_brain.tensorboard_logger = (
sb.utils.train_logger.TensorboardLogger(
save_dir=hparams["output_folder"] + "/tensorboard"
)
)
# Training
hifi_gan_brain.fit(
hifi_gan_brain.hparams.epoch_counter,
train_set=datasets["train"],
valid_set=datasets["valid"],
train_loader_kwargs=hparams["train_dataloader_opts"],
valid_loader_kwargs=hparams["valid_dataloader_opts"],
)
# Test
if "test" in datasets:
hifi_gan_brain.evaluate(
datasets["test"],
test_loader_kwargs=hparams["test_dataloader_opts"],
)
"""
LJspeech data preparation.
Download: https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Authors
* Yingzhi WANG 2022
* Sathvik Udupa 2022
* Pradnya Kandarkar 2023
"""
import csv
import json
import logging
import os
import random
import re
import numpy as np
import tgt
import torch
import torchaudio
from tqdm import tqdm
from unidecode import unidecode
from speechbrain.dataio.dataio import load_pkl, save_pkl
from speechbrain.inference.text import GraphemeToPhoneme
from speechbrain.utils.data_utils import download_file
from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations
logger = logging.getLogger(__name__)
OPT_FILE = "opt_ljspeech_prepare.pkl"
METADATA_CSV = "metadata.csv"
TRAIN_JSON = "train.json"
VALID_JSON = "valid.json"
TEST_JSON = "test.json"
WAVS = "wavs"
DURATIONS = "durations"
def prepare_ljspeech(
data_folder,
save_folder,
splits=["train", "valid"],
split_ratio=[90, 10],
model_name=None,
seed=1234,
pitch_n_fft=1024,
pitch_hop_length=256,
pitch_min_f0=65,
pitch_max_f0=400,
skip_prep=False,
use_custom_cleaner=False,
device="cpu",
):
"""
Prepares the csv files for the LJspeech datasets.
Arguments
---------
data_folder : str
Path to the folder where the original LJspeech dataset is stored
save_folder : str
The directory where to store the csv/json files
splits : list
List of dataset splits to prepare
split_ratio : list
Proportion for dataset splits
model_name : str
Model name (used to prepare additional model specific data)
seed : int
Random seed
pitch_n_fft : int
Number of fft points for pitch computation
pitch_hop_length : int
Hop length for pitch computation
pitch_min_f0 : int
Minimum f0 for pitch computation
pitch_max_f0 : int
Max f0 for pitch computation
skip_prep : bool
If True, skip preparation
use_custom_cleaner : bool
If True, uses custom cleaner defined for this recipe
device : str
Device for to be used for computation (used as required)
Returns
-------
None
Example
-------
>>> from recipes.LJSpeech.TTS.ljspeech_prepare import prepare_ljspeech
>>> data_folder = 'data/LJspeech/'
>>> save_folder = 'save/'
>>> splits = ['train', 'valid']
>>> split_ratio = [90, 10]
>>> seed = 1234
>>> prepare_ljspeech(data_folder, save_folder, splits, split_ratio, seed)
"""
# Sets seeds for reproducible code
random.seed(seed)
if skip_prep:
return
# Creating configuration for easily skipping data_preparation stage
conf = {
"data_folder": data_folder,
"splits": splits,
"split_ratio": split_ratio,
"save_folder": save_folder,
"seed": seed,
}
if not os.path.exists(save_folder):
os.makedirs(save_folder)
# Setting output files
meta_csv = os.path.join(data_folder, METADATA_CSV)
wavs_folder = os.path.join(data_folder, WAVS)
save_opt = os.path.join(save_folder, OPT_FILE)
save_json_train = os.path.join(save_folder, TRAIN_JSON)
save_json_valid = os.path.join(save_folder, VALID_JSON)
save_json_test = os.path.join(save_folder, TEST_JSON)
phoneme_alignments_folder = None
duration_folder = None
pitch_folder = None
# Setting up additional folders required for FastSpeech2
if model_name is not None and "FastSpeech2" in model_name:
# This step requires phoneme alignments to be present in the data_folder.
# They are downloaded automatically from https://www.dropbox.com/s/v28x5ldqqa288pu/LJSpeech.zip
# (alternatively, download and unzip the LJSpeech phoneme alignments manually from https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4)
alignment_URL = (
"https://www.dropbox.com/s/v28x5ldqqa288pu/LJSpeech.zip?dl=1"
)
phoneme_alignments_folder = os.path.join(
data_folder, "TextGrid", "LJSpeech"
)
download_file(
alignment_URL, data_folder + "/alignments.zip", unpack=True
)
duration_folder = os.path.join(data_folder, "durations")
if not os.path.exists(duration_folder):
os.makedirs(duration_folder)
# extract pitch for both Fastspeech2 and FastSpeech2WithAligner models
pitch_folder = os.path.join(data_folder, "pitch")
if not os.path.exists(pitch_folder):
os.makedirs(pitch_folder)
# Check if this phase is already done (if so, skip it)
if skip(splits, save_folder, conf):
logger.info("Skipping preparation, completed in previous run.")
return
# Additional check to make sure metadata.csv and wavs folder exists
assert os.path.exists(meta_csv), "metadata.csv does not exist"
assert os.path.exists(wavs_folder), "wavs/ folder does not exist"
# Prepare data splits
msg = "Creating json file for ljspeech Dataset.."
logger.info(msg)
data_split, meta_csv = split_sets(data_folder, splits, split_ratio)
if "train" in splits:
prepare_json(
model_name,
data_split["train"],
save_json_train,
wavs_folder,
meta_csv,
phoneme_alignments_folder,
duration_folder,
pitch_folder,
pitch_n_fft,
pitch_hop_length,
pitch_min_f0,
pitch_max_f0,
use_custom_cleaner,
device,
)
if "valid" in splits:
prepare_json(
model_name,
data_split["valid"],
save_json_valid,
wavs_folder,
meta_csv,
phoneme_alignments_folder,
duration_folder,
pitch_folder,
pitch_n_fft,
pitch_hop_length,
pitch_min_f0,
pitch_max_f0,
use_custom_cleaner,
device,
)
if "test" in splits:
prepare_json(
model_name,
data_split["test"],
save_json_test,
wavs_folder,
meta_csv,
phoneme_alignments_folder,
duration_folder,
pitch_folder,
pitch_n_fft,
pitch_hop_length,
pitch_min_f0,
pitch_max_f0,
use_custom_cleaner,
device,
)
save_pkl(conf, save_opt)
def skip(splits, save_folder, conf):
"""
Detects whether the LJSpeech data preparation has already been done.
If the preparation has been done, we can skip it.
Arguments
---------
splits : list
The portions of data to review.
save_folder : str
The path to the directory containing prepared files.
conf : dict
Configuration to match against saved config.
Returns
-------
bool
If True, the preparation phase can be skipped;
if False, it must be done.
"""
# Checking json files
skip = True
split_files = {
"train": TRAIN_JSON,
"valid": VALID_JSON,
"test": TEST_JSON,
}
for split in splits:
if not os.path.isfile(os.path.join(save_folder, split_files[split])):
skip = False
# Checking saved options
save_opt = os.path.join(save_folder, OPT_FILE)
if skip is True:
if os.path.isfile(save_opt):
opts_old = load_pkl(save_opt)
if opts_old == conf:
skip = True
else:
skip = False
else:
skip = False
return skip
def split_sets(data_folder, splits, split_ratio):
"""Randomly splits the wav list into training, validation, and test lists.
Note that a better approach is to make sure that all the classes have the
same proportion of samples for each session.
Arguments
---------
data_folder : str
The path to the directory containing the data.
splits : list
The list of the selected splits.
split_ratio : list
List of integers that sets the split ratios for the selected splits.
For instance, split_ratio=[80, 10, 10] will assign 80% of the sentences
to training, 10% to validation, and 10% to test.
Returns
-------
(data_split, meta_csv) : tuple
A dictionary containing the selected splits, together with the parsed metadata rows.
"""
meta_csv = os.path.join(data_folder, METADATA_CSV)
csv_reader = csv.reader(
open(meta_csv), delimiter="|", quoting=csv.QUOTE_NONE
)
meta_csv = list(csv_reader)
index_for_sessions = []
session_id_start = "LJ001"
index_this_session = []
for i in range(len(meta_csv)):
session_id = meta_csv[i][0].split("-")[0]
if session_id == session_id_start:
index_this_session.append(i)
if i == len(meta_csv) - 1:
index_for_sessions.append(index_this_session)
else:
index_for_sessions.append(index_this_session)
session_id_start = session_id
index_this_session = [i]
session_len = [len(session) for session in index_for_sessions]
data_split = {}
for i, split in enumerate(splits):
data_split[split] = []
for j in range(len(index_for_sessions)):
if split == "train":
random.shuffle(index_for_sessions[j])
n_snts = int(session_len[j] * split_ratio[i] / sum(split_ratio))
data_split[split].extend(index_for_sessions[j][0:n_snts])
del index_for_sessions[j][0:n_snts]
if split == "valid":
if "test" in splits:
random.shuffle(index_for_sessions[j])
n_snts = int(
session_len[j] * split_ratio[i] / sum(split_ratio)
)
data_split[split].extend(index_for_sessions[j][0:n_snts])
del index_for_sessions[j][0:n_snts]
else:
data_split[split].extend(index_for_sessions[j])
if split == "test":
data_split[split].extend(index_for_sessions[j])
return data_split, meta_csv
def prepare_json(
model_name,
seg_lst,
json_file,
wavs_folder,
csv_reader,
phoneme_alignments_folder,
durations_folder,
pitch_folder,
pitch_n_fft,
pitch_hop_length,
pitch_min_f0,
pitch_max_f0,
use_custom_cleaner=False,
device="cpu",
):
"""
Creates json file given a list of indexes.
Arguments
---------
model_name : str
Model name (used to prepare additional model specific data)
seg_lst : list
The list of json indexes of a given data split
json_file : str
Output json path
wavs_folder : str
LJspeech wavs folder
csv_reader : list
Parsed LJspeech metadata rows (one row per utterance)
phoneme_alignments_folder : path
Path where the phoneme alignments are stored
durations_folder : path
Folder where to store the duration values of each audio
pitch_folder : path
Folder where to store the pitch of each audio
pitch_n_fft : int
Number of fft points for pitch computation
pitch_hop_length : int
Hop length for pitch computation
pitch_min_f0 : int
Minimum f0 for pitch computation
pitch_max_f0 : int
Max f0 for pitch computation
use_custom_cleaner : bool
If True, uses custom cleaner defined for this recipe
device : str
Device to be used for computation (used as required)
"""
logger.info(f"preparing {json_file}.")
if model_name in ["Tacotron2", "FastSpeech2WithAlignment"]:
logger.info(
"Computing phonemes for LJSpeech labels using SpeechBrain G2P. This may take a while."
)
g2p = GraphemeToPhoneme.from_hparams(
"speechbrain/soundchoice-g2p", run_opts={"device": device}
)
if model_name is not None and "FastSpeech2" in model_name:
logger.info(
"Computing pitch as required for FastSpeech2. This may take a while."
)
json_dict = {}
for index in tqdm(seg_lst):
# Common data preparation
row = csv_reader[index]
id = row[0]
wav = os.path.join(wavs_folder, f"{id}.wav")
label = row[2]
if use_custom_cleaner:
label = custom_clean(label, model_name)
json_dict[id] = {
"uttid": id,
"wav": wav,
"label": label,
"segment": True if "train" in json_file else False,
}
# FastSpeech2 specific data preparation
if model_name == "FastSpeech2":
audio, fs = torchaudio.load(wav)
# Parses phoneme alignments
textgrid_path = os.path.join(
phoneme_alignments_folder, f"{id}.TextGrid"
)
textgrid = tgt.io.read_textgrid(
textgrid_path, include_empty_intervals=True
)
last_phoneme_flags = get_last_phoneme_info(
textgrid.get_tier_by_name("words"),
textgrid.get_tier_by_name("phones"),
)
(
phonemes,
duration,
start,
end,
trimmed_last_phoneme_flags,
) = get_alignment(
textgrid.get_tier_by_name("phones"),
fs,
pitch_hop_length,
last_phoneme_flags,
)
# Gets label phonemes
label_phoneme = " ".join(phonemes)
spn_labels = [0] * len(phonemes)
for i in range(1, len(phonemes)):
if phonemes[i] == "spn":
spn_labels[i - 1] = 1
if start >= end:
print(f"Skipping {id}")
continue
# Saves durations
duration_file_path = os.path.join(durations_folder, f"{id}.npy")
np.save(duration_file_path, duration)
# Computes pitch
audio = audio[:, int(fs * start) : int(fs * end)]
pitch_file = wav.replace(".wav", ".npy").replace(
wavs_folder, pitch_folder
)
if not os.path.isfile(pitch_file):
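# frame_time is set to hop_length / fs so that roughly one pitch estimate is
# produced per spectrogram frame; the result is padded and trimmed below to
# match the phoneme durations.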
pitch = torchaudio.functional.detect_pitch_frequency(
waveform=audio,
sample_rate=fs,
frame_time=(pitch_hop_length / fs),
win_length=3,
freq_low=pitch_min_f0,
freq_high=pitch_max_f0,
).squeeze(0)
# Concatenate last element to match duration.
pitch = torch.cat([pitch, pitch[-1].unsqueeze(0)])
# Mean and Variance Normalization
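# NOTE: the mean/std constants below appear to be fixed, corpus-level pitch
# statistics (presumably precomputed over LJSpeech).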
mean = 256.1732939688805
std = 328.319759158607
pitch = (pitch - mean) / std
pitch = pitch[: sum(duration)]
np.save(pitch_file, pitch)
# Updates data for the utterance
json_dict[id].update({"label_phoneme": label_phoneme})
json_dict[id].update({"spn_labels": spn_labels})
json_dict[id].update({"start": start})
json_dict[id].update({"end": end})
json_dict[id].update({"durations": duration_file_path})
json_dict[id].update({"pitch": pitch_file})
json_dict[id].update(
{"last_phoneme_flags": trimmed_last_phoneme_flags}
)
# FastSpeech2WithAlignment specific data preparation
if model_name == "FastSpeech2WithAlignment":
audio, fs = torchaudio.load(wav)
# Computes pitch
pitch_file = wav.replace(".wav", ".npy").replace(
wavs_folder, pitch_folder
)
if not os.path.isfile(pitch_file):
if torchaudio.__version__ < "2.1":
pitch = torchaudio.functional.compute_kaldi_pitch(
waveform=audio,
sample_rate=fs,
frame_length=(pitch_n_fft / fs * 1000),
frame_shift=(pitch_hop_length / fs * 1000),
min_f0=pitch_min_f0,
max_f0=pitch_max_f0,
)[0, :, 0]
else:
pitch = torchaudio.functional.detect_pitch_frequency(
waveform=audio,
sample_rate=fs,
frame_time=(pitch_hop_length / fs),
win_length=3,
freq_low=pitch_min_f0,
freq_high=pitch_max_f0,
).squeeze(0)
# Concatenate last element to match duration.
pitch = torch.cat([pitch, pitch[-1].unsqueeze(0)])
# Mean and Variance Normalization
mean = 256.1732939688805
std = 328.319759158607
pitch = (pitch - mean) / std
np.save(pitch_file, pitch)
phonemes = _g2p_keep_punctuations(g2p, label)
# Updates data for the utterance
json_dict[id].update({"phonemes": phonemes})
json_dict[id].update({"pitch": pitch_file})
# Writing the dictionary to the json file
with open(json_file, mode="w") as json_f:
json.dump(json_dict, json_f, indent=2)
logger.info(f"{json_file} successfully created!")
def get_alignment(tier, sampling_rate, hop_length, last_phoneme_flags):
"""
Returns phonemes, phoneme durations (in frames), start time (in seconds), end time (in seconds), and per-phoneme last-phoneme flags.
This function is adapted from https://github.com/ming024/FastSpeech2/blob/master/preprocessor/preprocessor.py
Arguments
---------
tier : tgt.core.IntervalTier
For an utterance, contains Interval objects for phonemes and their start time and end time in seconds
sampling_rate : int
Sample rate of the audio signal
hop_length : int
Hop length for duration computation
last_phoneme_flags : list
List of (phoneme, flag) tuples with flag=1 if the phoneme is the last phoneme else flag=0
Returns
-------
(phonemes, durations, start_time, end_time, trimmed_last_phoneme_flags) : tuple
The phonemes, durations, start time, end time, and last-phoneme flags for an utterance
"""
sil_phones = ["sil", "sp", "spn", ""]
phonemes = []
durations = []
start_time = 0
end_time = 0
end_idx = 0
trimmed_last_phoneme_flags = []
flag_iter = iter(last_phoneme_flags)
for t in tier._objects:
s, e, p = t.start_time, t.end_time, t.text
current_flag = next(flag_iter)
# Trims leading silences
if phonemes == []:
if p in sil_phones:
continue
else:
start_time = s
if p not in sil_phones:
# For ordinary phones
# Removes stress indicators
if p[-1].isdigit():
phonemes.append(p[:-1])
else:
phonemes.append(p)
trimmed_last_phoneme_flags.append(current_flag[1])
end_time = e
end_idx = len(phonemes)
else:
# Uses a unique token for all silent phones
phonemes.append("spn")
trimmed_last_phoneme_flags.append(current_flag[1])
durations.append(
int(
np.round(e * sampling_rate / hop_length)
- np.round(s * sampling_rate / hop_length)
)
)
# Trims trailing silences
phonemes = phonemes[:end_idx]
durations = durations[:end_idx]
return phonemes, durations, start_time, end_time, trimmed_last_phoneme_flags
def get_last_phoneme_info(words_seq, phones_seq):
"""This function takes word and phoneme tiers from a TextGrid file as input
and provides a list of tuples for the phoneme sequence indicating whether
each of the phonemes is the last phoneme of a word or not.
Each tuple of the returned list has this format: (phoneme, flag)
Arguments
---------
words_seq : tier
word tier from a TextGrid file
phones_seq : tier
phoneme tier from a TextGrid file
Returns
-------
last_phoneme_flags : list
each tuple of the returned list has this format: (phoneme, flag)
"""
# Gets all phoneme objects for the entire sequence
phoneme_objects = phones_seq._objects
phoneme_iter = iter(phoneme_objects)
# Stores flags to show if an element (phoneme) is the last phoneme of a word
last_phoneme_flags = list()
# Matches the end times of the phoneme and word objects to get the last phoneme information
for word_obj in words_seq._objects:
word_end_time = word_obj.end_time
current_phoneme = next(phoneme_iter, None)
while current_phoneme:
phoneme_end_time = current_phoneme.end_time
if phoneme_end_time == word_end_time:
last_phoneme_flags.append((current_phoneme.text, 1))
break
else:
last_phoneme_flags.append((current_phoneme.text, 0))
current_phoneme = next(phoneme_iter, None)
return last_phoneme_flags
def custom_clean(text, model_name):
"""
Uses custom criteria to clean text.
Arguments
---------
text : str
Input text to be cleaned
model_name : str
Model name; punctuation substitutions are skipped for FastSpeech2WithAlignment
Returns
-------
text : str
Cleaned text
"""
_abbreviations = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "missus"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
]
text = unidecode(text.lower())
if model_name != "FastSpeech2WithAlignment":
text = re.sub("[:;]", " - ", text)
text = re.sub(r'[)(\[\]"]', " ", text)
text = text.strip().strip("-")
text = re.sub(" +", " ", text)
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
# K-means (Quantization)
This folder contains recipes for training a K-means clustering model on the LJSpeech dataset.
The model quantizes self-supervised representations into discrete tokens, which can then be used as discrete audio input for various tasks, including classification, ASR, and speech generation.
It supports K-means models built on features from HuBERT, WavLM, or Wav2Vec2.
You can download LJSpeech at https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
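As a rough illustration of how the trained model is used, the sketch below (not part of this recipe; it assumes a scikit-learn `MiniBatchKMeans` backend and uses random features in place of real SSL hidden states) maps frame-level features to discrete unit IDs:
```python
# Minimal sketch: quantizing frame-level features into discrete units.
# Random features stand in for real SSL hidden states (e.g. one HuBERT layer).
import numpy as np
from sklearn.cluster import MiniBatchKMeans

features = np.random.randn(500, 1024)       # [frames, feature_dim]
kmeans = MiniBatchKMeans(n_clusters=128, batch_size=1000, n_init=20)
kmeans.fit(features)                        # in the recipe, fitted over the whole training set
units = kmeans.predict(features)            # [frames] -> discrete IDs in [0, 127]
print(units[:10])
```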
## Installing Extra Dependencies
Before proceeding, ensure you have installed the necessary additional dependencies. To do this, simply run the following command in your terminal:
```
pip install -r extra_requirements.txt
```
# How to run:
To configure the SSL model type and corresponding Hub in your YAML configuration file, follow these steps:
1. Locate the `model_config` section in your YAML file.
2. Modify the `ssl_model_type` field to specify one of the SSL models: "Hubert", "WavLM", or "Wav2Vec2".
3. Update the `ssl_hub` field with the specific name of the SSL Hub associated with your chosen model type.
Here are the supported SSL models along with their corresponding SSL Hubs:
```
ssl_model_type: hubert, wavlm, wav2vec2
ssl_hub:
- facebook/hubert-large-ll60k
- microsoft/wavlm-large
- facebook/wav2vec2-large
```
4. Set the output folder according to the experiments you are running (e.g., `output_folder: !ref results/LJSpeech/clustering/wavlm/<seed>`)
To initiate training using a specific SSL model, execute the following command:
```shell
python train.py hparams/train_discrete_ssl.yaml
```
This command will start the training process using the configurations specified in 'train_discrete_ssl.yaml'.
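If you prefer not to edit the YAML, SpeechBrain also accepts hyperparameter overrides on the command line, so the same switch can be made at launch time (a sketch; adjust the hparams filename to the one in this folder):
```shell
python train.py hparams/train_discrete_ssl.yaml \
    --ssl_model_type=wavlm \
    --ssl_hub=microsoft/wavlm-large \
    --output_folder=results/LJSpeech/clustering/wavlm/1986
```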
# Results
The checkpoints can be found at [this](https://huggingface.co/speechbrain/SSL_Quantization) HuggingFace repository.
# **About SpeechBrain**
- Website: https://speechbrain.github.io/
- Code: https://github.com/speechbrain/speechbrain/
- HuggingFace: https://huggingface.co/speechbrain/
# **Citing SpeechBrain**
Please, cite SpeechBrain if you use it for your research or business.
```bibtex
@misc{ravanelli2024opensourceconversationalaispeechbrain,
title={Open-Source Conversational AI with SpeechBrain 1.0},
author={Mirco Ravanelli and Titouan Parcollet and Adel Moumen and Sylvain de Langen and Cem Subakan and Peter Plantinga and Yingzhi Wang and Pooneh Mousavi and Luca Della Libera and Artem Ploujnikov and Francesco Paissan and Davide Borra and Salah Zaiem and Zeyu Zhao and Shucong Zhang and Georgios Karakasidis and Sung-Lin Yeh and Pierre Champion and Aku Rouhe and Rudolf Braun and Florian Mai and Juan Zuluaga-Gomez and Seyed Mahed Mousavi and Andreas Nautsch and Xuechen Liu and Sangeet Sagar and Jarod Duret and Salima Mdhaffar and Gaelle Laperriere and Mickael Rouvier and Renato De Mori and Yannick Esteve},
year={2024},
eprint={2407.00463},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2407.00463},
}
@misc{speechbrain,
title={{SpeechBrain}: A General-Purpose Speech Toolkit},
author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
year={2021},
eprint={2106.04624},
archivePrefix={arXiv},
primaryClass={eess.AS},
note={arXiv:2106.04624}
}
```
################################
# Recipe for Training K-Means Clustering on LJSpeech Data
# Using Self-Supervised Model-Based Representations
#
# It is used for creating discrete audio representations from LJSpeech data.
#
# Author: Pooneh Mousavi (2023)
################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/LJSpeech/clustering/hubert/<seed>
save_folder: !ref <output_folder>/save
# Data files
data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech-1.1
train_json: !ref <save_folder>/train.json
splits: ["train"]
split_ratio: [80]
skip_prep: False
sample_rate: 16000
# model_config
# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large
ssl_model_type: hubert # hubert, wavlm or wav2vec2
ssl_hub: facebook/hubert-large-ll60k
freeze_feature_extractor: True
freeze_ssl: True
ssl_folder: !ref <save_folder>/hubert_checkpoint
ssl_layer_num: 7
batch_size: 128 # batch_size for loading and extracting features. It is different from kmeans_batch_size.
checkpoint_interval: 100
# Dataloader options
train_dataloader_opts:
batch_size: !ref <batch_size>
drop_last: True
ssl_model: !apply:speechbrain.utils.hparams.choice
value: !ref <ssl_model_type>
choices:
wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
source: !ref <ssl_hub>
output_norm: False
freeze: !ref <freeze_ssl>
freeze_feature_extractor: !ref <freeze_feature_extractor>
output_all_hiddens: True
save_path: !ref <ssl_folder>
hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
source: !ref <ssl_hub>
output_norm: False
freeze: !ref <freeze_ssl>
freeze_feature_extractor: !ref <freeze_feature_extractor>
output_all_hiddens: True
save_path: !ref <ssl_folder>
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
source: !ref <ssl_hub>
output_norm: False
freeze: !ref <freeze_ssl>
freeze_feature_extractor: !ref <freeze_feature_extractor>
output_all_hiddens: True
save_path: !ref <ssl_folder>
####################
# Model Parameters #
####################
num_clusters: 128
init: k-means++
max_iter: 100
kmeans_batch_size: 1000 # should be >= num_clusters
tol: 0.0
max_no_improvement: 100
n_init: 20
reassignment_ratio: 0.0
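# The clustering settings above correspond to scikit-learn MiniBatchKMeans
# arguments (a sketch, assuming sklearn is the backend behind
# speechbrain.utils.kmeans), roughly:
#   MiniBatchKMeans(n_clusters=128, init="k-means++", max_iter=100,
#                   tol=0.0, max_no_improvement=100, n_init=20,
#                   reassignment_ratio=0.0)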
../ljspeech_prepare.py
"""
Recipe to train K-means clustering model on self-supervised representations.
To run this recipe, do the following:
> python train.py hparams/train_with_[SSL-model].yaml --data_folder=/path/to/LJSpeech
Author
* Pooneh Mousavi 2023
"""
import logging
import os
import sys
import torchaudio
from hyperpyyaml import load_hyperpyyaml
from torch.utils.data import DataLoader
import speechbrain as sb
from speechbrain.dataio.dataloader import LoopedLoader
from speechbrain.utils.distributed import run_on_main
from speechbrain.utils.kmeans import fetch_kmeans_model, save_model, train
logger = logging.getLogger(__name__)
def dataio_prepare(hparams):
# Define audio pipeline:
@sb.utils.data_pipeline.takes("wav")
@sb.utils.data_pipeline.provides("sig")
def audio_pipeline(wav):
sig = sb.dataio.dataio.read_audio(wav)
info = torchaudio.info(wav)
resampled = torchaudio.transforms.Resample(
info.sample_rate,
hparams["sample_rate"],
)(sig)
return resampled
datasets = {}
data_info = {
"train": hparams["train_json"],
}
for dataset in hparams["splits"]:
datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
json_path=data_info[dataset],
replacements={"data_root": hparams["data_folder"]},
dynamic_items=[audio_pipeline],
output_keys=["id", "sig"],
)
return datasets
if __name__ == "__main__":
# Load hyperparameters file with command-line overrides
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)
# Create experiment directory
sb.create_experiment_directory(
experiment_directory=hparams["output_folder"],
hyperparams_to_save=hparams_file,
overrides=overrides,
)
# Dataset prep (parsing LJSpeech)
from ljspeech_prepare import prepare_ljspeech # noqa
# multi-gpu (ddp) save data preparation
run_on_main(
prepare_ljspeech,
kwargs={
"data_folder": hparams["data_folder"],
"save_folder": hparams["save_folder"],
"splits": hparams["splits"],
"split_ratio": hparams["split_ratio"],
"seed": hparams["seed"],
"skip_prep": hparams["skip_prep"],
},
)
# Load SSL model
hparams["ssl_model"] = hparams["ssl_model"].to(run_opts["device"])
# Make training Dataloader
train_set = dataio_prepare(hparams)["train"]
if not (
isinstance(train_set, DataLoader) or isinstance(train_set, LoopedLoader)
):
train_set = sb.dataio.dataloader.make_dataloader(
train_set, **hparams["train_dataloader_opts"]
)
os.makedirs(hparams["save_folder"], exist_ok=True)
# If you use dataloader checkpoints, make sure to keep all the settings as in the previous run and keep the dataset ordering the same.
dataloader_path = os.path.join(
hparams["save_folder"], "dataloader-TRAIN.ckpt"
)
if os.path.exists(dataloader_path):
logger.info(
f"The dataloader checkpoint is loaded from {dataloader_path}."
)
train_set._speechbrain_load(dataloader_path, False)
# Load pretrained KMeans model if it exists. Otherwise, create new one.
checkpoint_path = os.path.join(
hparams["save_folder"],
f"kmeans-cluster-{hparams['num_clusters']}-layer-{hparams['ssl_layer_num']}.pt",
)
kmeans_model = fetch_kmeans_model(
n_clusters=hparams["num_clusters"],
init=hparams["init"],
max_iter=hparams["max_iter"],
batch_size=hparams["batch_size"],
tol=hparams["tol"],
max_no_improvement=hparams["max_no_improvement"],
n_init=hparams["n_init"],
reassignment_ratio=hparams["reassignment_ratio"],
random_state=hparams["seed"],
checkpoint_path=checkpoint_path,
)
# Train and save Kmeans model
train(
kmeans_model,
train_set,
hparams["ssl_model"],
hparams["save_folder"],
hparams["ssl_layer_num"],
kmeans_batch_size=hparams["kmeans_batch_size"],
device=run_opts["device"],
checkpoint_interval=hparams["checkpoint_interval"],
)
logger.info(f"Saving kmeans model at {checkpoint_path}.")
save_model(kmeans_model, checkpoint_path)
train_set._speechbrain_save(dataloader_path)