############################################################################
# Model: FastSpeech2
# Tokens: Raw characters (English text)
# Training: LJSpeech
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar
############################################################################

###################################
# Experiment Parameters and setup #
###################################
seed: 1234
# Seeds torch at load time so runs are reproducible.
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/fastspeech2/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
epochs: 500
train_spn_predictor_epochs: 8
progress_samples: True
progress_sample_path: !ref <output_folder>/samples
progress_samples_min_run: 10
progress_samples_interval: 10
progress_batch_sample_size: 4

#################################
# Data files and pre-processing #
#################################
# Must be overridden (CLI or override YAML); loading fails loudly otherwise.
data_folder: !PLACEHOLDER # e.g., /data/Database/LJSpeech-1.1
train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json
splits: ["train", "valid"]
split_ratio: [90, 10]
skip_prep: False

################################
# Audio Parameters             #
################################
sample_rate: 22050
hop_length: 256
win_length: null
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
power: 1
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True
mel_normalized: False
min_max_energy_norm: True
min_f0: 65 # (torchaudio pyin values)
max_f0: 2093 # (torchaudio pyin values)

################################
# Optimization Hyperparameters #
################################
learning_rate: 0.0001
weight_decay: 0.000001
max_grad_norm: 1.0
batch_size: 32 # minimum 2
num_workers_train: 16
num_workers_valid: 4
betas: [0.9, 0.98]

################################
# Model Parameters and model   #
################################
# Input parameters
lexicon:
    - AA
    - AE
    - AH
    - AO
    - AW
    - AY
    - B
    - CH
    - D
    - DH
    - EH
    - ER
    - EY
    - F
    - G
    - HH
    - IH
    - IY
    - JH
    - K
    - L
    - M
    - N
    - NG
    - OW
    - OY
    - P
    - R
    - S
    - SH
    - T
    - TH
    - UH
    - UW
    - V
    - W
    - Y
    - Z
    - ZH
    - spn
n_symbols: 42 # fixed depending on symbols in the lexicon +1 for a dummy symbol used for padding
padding_idx: 0

# Encoder parameters
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 384
enc_ffn_dim: 1024
enc_k_dim: 384
enc_v_dim: 384
enc_dropout: 0.2

# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 384
dec_ffn_dim: 1024
dec_k_dim: 384
dec_v_dim: 384
dec_dropout: 0.2

# Postnet parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5

# common
normalize_before: True
ffn_type: 1dcnn # 1dcnn or ffn
ffn_cnn_kernel_size_list: [9, 1]

# variance predictor
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# silent phoneme token predictor
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    padding_idx: !ref <padding_idx>

# model
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    n_mels: !ref <n_mel_channels>
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>

mel_spectogram: !name:speechbrain.lobes.models.FastSpeech2.mel_spectogram
    sample_rate: !ref <sample_rate>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mel_channels>
    f_min: !ref <mel_fmin>
    f_max: !ref <mel_fmax>
    power: !ref <power>
    normalized: !ref <mel_normalized>
    min_max_energy_norm: !ref <min_max_energy_norm>
    norm: !ref <norm>
    mel_scale: !ref <mel_scale>
    compression: !ref <dynamic_range_compression>

criterion: !new:speechbrain.lobes.models.FastSpeech2.Loss
    log_scale_durations: True
    duration_loss_weight: 1.0
    pitch_loss_weight: 1.0
    energy_loss_weight: 1.0
    ssim_loss_weight: 1.0
    mel_loss_weight: 1.0
    postnet_mel_loss_weight: 1.0
    spn_loss_weight: 1.0
    spn_loss_max_epochs: !ref <train_spn_predictor_epochs>

# Vocoder used only for generating progress audio samples.
vocoder: "hifi-gan"
pretrained_vocoder: True
vocoder_source: speechbrain/tts-hifigan-ljspeech
vocoder_download_path: tmpdir_vocoder

modules:
    spn_predictor: !ref <spn_predictor>
    model: !ref <model>

train_dataloader_opts:
    batch_size: !ref <batch_size>
    drop_last: False
    num_workers: !ref <num_workers_train>
    shuffle: True
    collate_fn: !new:speechbrain.lobes.models.FastSpeech2.TextMelCollate

valid_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers_valid>
    shuffle: False
    collate_fn: !new:speechbrain.lobes.models.FastSpeech2.TextMelCollate

# optimizer
opt_class: !name:torch.optim.Adam
    lr: !ref <learning_rate>
    weight_decay: !ref <weight_decay>
    betas: !ref <betas>

noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
    lr_initial: !ref <learning_rate>
    n_warmup_steps: 4000

# epoch object
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <epochs>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

# checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        spn_predictor: !ref <spn_predictor>
        model: !ref <model>
        lr_annealing: !ref <noam_annealing>
        counter: !ref <epoch_counter>

input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
    output_path: !ref <progress_sample_path>
    batch_sample_size: !ref <progress_batch_sample_size>
    formats:
        raw_batch: raw