# TODO(Oktai15): update this config in 1.8.0 version

# Tacotron2 training configuration for 44.1 kHz audio.
# Top-level scalars below are shared settings that the `model`, `trainer`
# and `exp_manager` sections reference through ${...} interpolation.

name: Tacotron2
sample_rate: 44100

# Character vocabulary. The padding, BOS and EOS symbols are not listed here;
# they will be added by the tacotron2.py script.
# 'n' and 'y' are quoted so YAML 1.1 loaders do not read them as booleans.
labels:
  - ' '
  - '!'
  - '"'
  - ''''
  - '('
  - ')'
  - ','
  - '-'
  - '.'
  - ':'
  - ';'
  - '?'
  - a
  - b
  - c
  - d
  - e
  - f
  - g
  - h
  - i
  - j
  - k
  - l
  - m
  - 'n'
  - o
  - p
  - q
  - r
  - s
  - t
  - u
  - v
  - w
  - x
  - 'y'
  - z

# Spectrogram settings consumed by model.preprocessor (and n_mels by the
# decoder and postnet).
n_fft: 2048
n_mels: 80
fmax: null
n_stride: 512
pad_value: -11.52

# `???` is the OmegaConf marker for a mandatory value: these must be
# supplied by the user (e.g. on the command line).
train_dataset: ???
validation_datasets: ???

model:
  labels: ${labels}

  train_ds:
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharDataset
      manifest_filepath: ${train_dataset}
      max_duration: null
      min_duration: 0.1
      trim: false
      int_values: false
      normalize: true
      sample_rate: ${sample_rate}
      # bos_id (66), eos_id (67) and pad_id (68) are not set here:
      # these parameters are added automatically in Tacotron2.
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 48
      num_workers: 4
      pin_memory: true

  validation_ds:
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharDataset
      manifest_filepath: ${validation_datasets}
      max_duration: null
      min_duration: 0.1
      int_values: false
      normalize: true
      sample_rate: ${sample_rate}
      trim: false
      # bos_id (66), eos_id (67) and pad_id (68) are not set here:
      # these parameters are added automatically in Tacotron2.
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 48
      num_workers: 8
      pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    dither: 0.0
    nfilt: ${n_mels}
    frame_splicing: 1
    highfreq: ${fmax}
    log: true
    log_zero_guard_type: clamp
    # Exponent floats carry an explicit mantissa dot so that YAML 1.1
    # loaders (e.g. PyYAML) resolve them as floats rather than strings.
    log_zero_guard_value: 1.0e-05
    lowfreq: 0
    mag_power: 1.0
    n_fft: ${n_fft}
    n_window_size: 2048
    n_window_stride: ${n_stride}
    normalize: null
    pad_to: 16
    pad_value: ${pad_value}
    preemph: null
    sample_rate: ${sample_rate}
    window: hann

  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_kernel_size: 5
    encoder_n_convolutions: 3
    encoder_embedding_dim: 512

  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    decoder_rnn_dim: 1024
    # Kept in sync with the encoder through interpolation.
    encoder_embedding_dim: ${model.encoder.encoder_embedding_dim}
    gate_threshold: 0.5
    max_decoder_steps: 1000
    n_frames_per_step: 1  # currently only 1 is supported
    n_mel_channels: ${n_mels}
    p_attention_dropout: 0.1
    p_decoder_dropout: 0.1
    prenet_dim: 256
    prenet_p_dropout: 0.5
    # Attention parameters
    attention_dim: 128
    attention_rnn_dim: 1024
    # AttentionLocation Layer parameters
    attention_location_kernel_size: 31
    attention_location_n_filters: 32
    early_stopping: true

  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${n_mels}
    p_dropout: 0.5
    postnet_embedding_dim: 512
    postnet_kernel_size: 5
    postnet_n_convolutions: 5

  optim:
    name: adam
    # Explicit mantissa dot: see note on log_zero_guard_value style; a bare
    # `1e-3` is a string under YAML 1.1 float resolution.
    lr: 1.0e-3
    weight_decay: 1.0e-6

    # scheduler setup
    sched:
      name: CosineAnnealing
      min_lr: 1.0e-5

trainer:
  devices: 1  # number of gpus
  max_epochs: ???  # mandatory: must be supplied by the user
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  gradient_clip_val: 1.0
  log_every_n_steps: 60
  check_val_every_n_epoch: 2
  benchmark: false

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true