tacotron2.yaml

# This config contains the default values for training Tacotron2 model on LJSpeech dataset.
# If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.

name: Tacotron2

train_dataset: ???
validation_datasets: ???
sup_data_path: null
sup_data_types: null

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"

model:
  pitch_fmin: 65.40639132514966
  pitch_fmax: 2093.004522404789

  sample_rate: 22050
  n_mel_channels: 80
  n_window_size: 1024
  n_window_stride: 256
  n_fft: 1024
  lowfreq: 0
  highfreq: 8000
  window: hann
  pad_value: -11.52

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 48
      num_workers: 4
      pin_memory: true

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 24
      num_workers: 8
      pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${model.n_mel_channels}
    highfreq: ${model.highfreq}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1e-05
    lowfreq: ${model.lowfreq}
    n_fft: ${model.n_fft}
    n_window_size: ${model.n_window_size}
    n_window_stride: ${model.n_window_stride}
    pad_to: 16
    pad_value: ${model.pad_value}
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob : 0
    mag_power: 1.0
    exact_pad: true
    use_grads: false

  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_kernel_size: 5
    encoder_n_convolutions: 3
    encoder_embedding_dim: 512

  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    decoder_rnn_dim: 1024
    encoder_embedding_dim: ${model.encoder.encoder_embedding_dim}
    gate_threshold: 0.5
    max_decoder_steps: 1000
    n_frames_per_step: 1  # currently only 1 is supported
    n_mel_channels: ${model.n_mel_channels}
    p_attention_dropout: 0.1
    p_decoder_dropout: 0.1
    prenet_dim: 256
    prenet_p_dropout: 0.5
    # Attention parameters
    attention_dim: 128
    attention_rnn_dim: 1024
    # AttentionLocation Layer parameters
    attention_location_kernel_size: 31
    attention_location_n_filters: 32
    early_stopping: true

  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${model.n_mel_channels}
    p_dropout: 0.5
    postnet_embedding_dim: 512
    postnet_kernel_size: 5
    postnet_n_convolutions: 5

  optim:
    name: adam
    lr: 1e-3
    weight_decay: 1e-6

    # scheduler setup
    sched:
      name: CosineAnnealing
      min_lr: 1e-5

trainer:
  devices: 1 # number of gpus
  max_epochs: ???
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: False  # Provided by exp_manager
  logger: False  # Provided by exp_manager
  gradient_clip_val: 1.0
  log_every_n_steps: 60
  check_val_every_n_epoch: 2
  benchmark: false

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    mode: min