############################################################################
# Model: Tacotron2
# Tokens: Raw characters (English text)
# losses: Transducer
# Training: LJSpeech
# Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
# ############################################################################

# NOTE(review): this file had its newlines collapsed and every HyperPyYAML
# reference target (`!ref <key>`) stripped of its angle-bracket payload.
# Structure and reference targets below are reconstructed from the key names
# present in the damaged file — confirm against the upstream SpeechBrain
# LJSpeech/TTS/tacotron2 recipe before training.

###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref ./results/tacotron2/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
epochs: 750
keep_checkpoint_interval: 50

###################################
# Progress Samples                #
###################################
# Progress samples are used to monitor the progress
# of an ongoing training session by outputting samples
# of spectrograms, alignments, etc at regular intervals

# Whether to enable progress samples
progress_samples: True

# The path where the samples will be stored
progress_sample_path: !ref <output_folder>/samples

# The interval, in epochs. For instance, if it is set to 5,
# progress samples will be output every 5 epochs
progress_samples_interval: 1

# The sample size for raw batch samples saved in batch.pth
# (useful mostly for model debugging)
progress_batch_sample_size: 3

#################################
# Data files and pre-processing #
#################################
data_folder: !PLACEHOLDER # e.g, /localscratch/ljspeech

train_json: !ref <save_folder>/train.json
valid_json: !ref <save_folder>/valid.json
test_json: !ref <save_folder>/test.json

splits: ["train", "valid"]
split_ratio: [90, 10]

skip_prep: False

# Use the original preprocessing from nvidia
# The cleaners to be used (applicable to nvidia only)
text_cleaners: ['english_cleaners']

################################
# Audio Parameters             #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: False
power: 1
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True

################################
# Optimization Hyperparameters #
################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 64 #minimum 2
num_workers: 8
mask_padding: True
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0

train_dataloader_opts:
  batch_size: !ref <batch_size>
  drop_last: False  #True #False
  num_workers: !ref <num_workers>
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

valid_dataloader_opts:
  batch_size: !ref <batch_size>
  num_workers: !ref <num_workers>
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

test_dataloader_opts:
  batch_size: !ref <batch_size>
  num_workers: !ref <num_workers>
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

################################
# Model Parameters and model   #
################################
n_symbols: 148 #fixed depending on symbols in textToSequence
symbols_embedding_dim: 512

# Encoder parameters
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512

# Decoder parameters
# The number of frames in the target per encoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: False

# Attention parameters
attention_rnn_dim: 1024
attention_dim: 128

# Location Layer parameters
attention_location_n_filters: 32
attention_location_kernel_size: 31

# Mel-post processing network parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5

# NOTE(review): "spectogram" (sic) matches the function name exported by
# speechbrain.lobes.models.Tacotron2 — do not "fix" the spelling here.
mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
  sample_rate: !ref <sample_rate>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>
  n_fft: !ref <n_fft>
  n_mels: !ref <n_mel_channels>
  f_min: !ref <mel_fmin>
  f_max: !ref <mel_fmax>
  power: !ref <power>
  normalized: !ref <mel_normalized>
  norm: !ref <norm>
  mel_scale: !ref <mel_scale>
  compression: !ref <dynamic_range_compression>

#model
model: !new:speechbrain.lobes.models.Tacotron2.Tacotron2
  mask_padding: !ref <mask_padding>
  n_mel_channels: !ref <n_mel_channels>
  # symbols
  n_symbols: !ref <n_symbols>
  symbols_embedding_dim: !ref <symbols_embedding_dim>
  # encoder
  encoder_kernel_size: !ref <encoder_kernel_size>
  encoder_n_convolutions: !ref <encoder_n_convolutions>
  encoder_embedding_dim: !ref <encoder_embedding_dim>
  # attention
  attention_rnn_dim: !ref <attention_rnn_dim>
  attention_dim: !ref <attention_dim>
  # attention location
  attention_location_n_filters: !ref <attention_location_n_filters>
  attention_location_kernel_size: !ref <attention_location_kernel_size>
  # decoder
  n_frames_per_step: !ref <n_frames_per_step>
  decoder_rnn_dim: !ref <decoder_rnn_dim>
  prenet_dim: !ref <prenet_dim>
  max_decoder_steps: !ref <max_decoder_steps>
  gate_threshold: !ref <gate_threshold>
  p_attention_dropout: !ref <p_attention_dropout>
  p_decoder_dropout: !ref <p_decoder_dropout>
  # postnet
  postnet_embedding_dim: !ref <postnet_embedding_dim>
  postnet_kernel_size: !ref <postnet_kernel_size>
  postnet_n_convolutions: !ref <postnet_n_convolutions>
  decoder_no_early_stopping: !ref <decoder_no_early_stopping>

guided_attention_scheduler: !new:speechbrain.nnet.schedulers.StepScheduler
  initial_value: !ref <guided_attention_weight>
  half_life: !ref <guided_attention_weight_half_life>

criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
  gate_loss_weight: !ref <gate_loss_weight>
  guided_attention_weight: !ref <guided_attention_weight>
  guided_attention_sigma: !ref <guided_attention_sigma>
  guided_attention_scheduler: !ref <guided_attention_scheduler>
  guided_attention_hard_stop: !ref <guided_attention_hard_stop>

modules:
  model: !ref <model>

#optimizer
opt_class: !name:torch.optim.Adam
  lr: !ref <learning_rate>
  weight_decay: !ref <weight_decay>

#epoch object
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <epochs>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>

#annealing_function
lr_annealing: !new:speechbrain.nnet.schedulers.IntervalScheduler
  intervals:
    - steps: 6000
      lr: 0.0005
    - steps: 8000
      lr: 0.0003
    - steps: 10000
      lr: 0.0001

#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    model: !ref <model>
    counter: !ref <epoch_counter>
    scheduler: !ref <lr_annealing>

#infer: !name:speechbrain.lobes.models.Tacotron2.infer

progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
  output_path: !ref <progress_sample_path>
  batch_sample_size: !ref <progress_batch_sample_size>
  formats:
    raw_batch: raw