# rad-tts_feature_pred.yaml
name: RadTTS
sample_rate: 22050

train_dataset: ???
validation_datasets: ???
ckpt_path: ???
export_dir: ???
sup_data_path: ???
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]
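
# All ??? fields are mandatory and are normally supplied as Hydra overrides on the
# command line. A minimal sketch, assuming NeMo's stock RadTTS training script
# (examples/tts/radtts.py in recent releases; adjust the script and --config-path
# for your checkout):
#   python examples/tts/radtts.py --config-name=rad-tts_feature_pred \
#       train_dataset=<train_manifest.json> validation_datasets=<val_manifest.json> \
#       sup_data_path=<sup_data_dir> ckpt_path=<pretrained_radtts.ckpt> \
#       export_dir=<output_dir> pitch_mean=<value> pitch_std=<value>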

# these frame-wise pitch statistics depend on pitch_fmin and pitch_fmax; you can
# obtain them by running `scripts/dataset_processing/tts/extract_sup_data.py`
# (see the example invocation below)
pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
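
# A hedged sketch of that invocation (the dataset config path and name are
# assumptions; point them at the ds_conf file matching your corpus). The script
# computes and reports the pitch mean/std for the dataset when it finishes:
#   python scripts/dataset_processing/tts/extract_sup_data.py \
#       --config-path <ds_conf_dir> --config-name <ds_config.yaml> \
#       manifest_filepath=<train_manifest.json> sup_data_path=<sup_data_dir>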

# default values from librosa.pyin
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789
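# these values correspond to musical notes C2 (~65.4 Hz) and C7 (~2093 Hz), the
# range librosa's documentation recommends for pyin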

# default values for sample_rate=22050
n_mels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: 8000
window: "hann"
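# with these settings one spectrogram frame covers 256/22050 ≈ 11.6 ms
# (~86.1 frames per second), each analysis window spans 1024/22050 ≈ 46.4 ms,
# and the 513 linear FFT bins are mapped onto 80 mel bands between 0 and 8000 Hz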

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
mapping_file_path: ""

model:
  target: nemo.collections.tts.models.RadTTSModel
  bin_loss_start_ratio: 0.2
  bin_loss_warmup_epochs: 100

  symbols_embedding_dim: 384
  n_mel_channels: ${n_mels}

  pitch_mean: ${pitch_mean}
  pitch_std: ${pitch_std}

  text_normalizer:
      _target_: nemo_text_processing.text_normalization.normalize.Normalizer
      lang: en
      input_case: cased

  text_normalizer_call_kwargs:
      verbose: false
      punct_pre_process: true
      punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.5
      mapping_file: ${mapping_file_path}
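
  # Note (assumption): phoneme_probability: 0.5 here and in the dataset tokenizers
  # below is understood to mean that each word is phonemized with 50% probability
  # during training, so the model sees a mix of grapheme and phoneme input.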

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}

      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: True
        stresses: True
        chars: True
        space: ' '
        silence: null
        apostrophe: True
        sep: '|'
        add_blank_at: null
        pad_with_space: True
        g2p:
          _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 32
      num_workers: 8
      pin_memory: True

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}

      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: True
        stresses: True
        chars: True
        space: ' '
        silence: null
        apostrophe: True
        sep: '|'
        add_blank_at: null
        pad_with_space: True
        g2p:
          _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 32
      num_workers: 8
      pin_memory: True

  optim:
    name: RAdam
    lr: 0.001
    betas: [0.9, 0.98]
    weight_decay: 0.000001

    sched:
      name: exp_decay
      warmup_steps: 40000
      last_epoch: -1
      d_model: 1  # Disable scaling based on model dim

  trainerConfig:
    sigma: 1
    iters_per_checkpoint: 3000
    seed: null
    ignore_layers: []
    finetune_layers: []
    include_layers: []
    with_tensorboard: true
    dur_loss_weight: 1
    ctc_loss_weight: 1
    mask_unvoiced_f0: false
    log_step: 1
    binarization_start_iter: 1000000
    kl_loss_start_iter: 1000000
    loss_weights:
        ctc_loss_weight: 0.1
        dur_loss_weight: 1.0
        f0_loss_weight: 1.0
        energy_loss_weight: 1.0
        vpred_loss_weight: 1.0
    unfreeze_modules: "durf0energyvpred"

  load_from_checkpoint: True
  init_from_ptl_ckpt: ${ckpt_path}
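  # This "feature_pred" stage presumably initializes from a pretrained RadTTS
  # checkpoint (${ckpt_path}) and fine-tunes only the attribute predictors
  # (duration, F0, energy, voicedness), per unfreeze_modules above.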
  modelConfig:
        _target_: "nemo.collections.tts.modules.radtts.RadTTSModule"
        n_speakers: 1
        n_speaker_dim: 16
        n_text: 384 #185
        n_text_dim: 512
        n_flows: 8
        n_conv_layers_per_step: 4
        n_mel_channels: 80
        n_hidden: 1024
        mel_encoder_n_hidden: 512
        n_components: 0
        mean_scale: 0
        fixed_gaussian: true
        dummy_speaker_embedding: false
        use_positional_embedding: false
        n_early_size: 2
        n_early_every: 2
        n_group_size: 2
        use_feature_gating: false
        affine_model: wavenet
        include_modules: "decatnunvbiasdpmvpredapm"
        what_to_train: decatnunvbias
        scaling_fn: tanh
        reduction_norm: ""
        matrix_decomposition: LUS
        learn_alignments: true
        use_query_proj: true
        align_query_enc_type: 3xconv
        lstm_applicable_steps: []
        use_context_lstm: true
        context_lstm_norm: spectral
        context_lstm_w_f0_and_energy: true
        text_encoder_lstm_norm: spectral
        use_text_conditional_priors: false
        zero_out_context: false
        n_aug_dims: 6
        n_f0_dims: 1
        n_energy_avg_dims: 1
        use_first_order_features: false
        unvoiced_bias_activation: "relu"
        decoder_use_partial_padding: false
        decoder_use_unvoiced_bias: true
        ap_pred_log_f0: true
        ap_use_unvoiced_bias: true
        ap_use_voiced_embeddings: true
        p_dropout: 0.1
        noise_to_unvoiced_in_f0: 0
        noise_to_pvoiced: 0
        dur_model_config:
            name: dap
            hparams:
                n_speaker_dim: 16
                bottleneck_hparams:
                    in_dim: 512
                    reduction_factor: 16
                    norm: weightnorm
                    non_linearity: relu
                take_log_of_input: true
                arch_hparams:
                    out_dim: 1
                    n_layers: 2
                    n_channels: 256
                    kernel_size: 3
                    p_dropout: 0.1
        f0_model_config:
            name: dap
            hparams:
                n_speaker_dim: 16
                bottleneck_hparams:
                    in_dim: 512
                    reduction_factor: 16
                    norm: weightnorm
                    non_linearity: relu
                take_log_of_input: false
                arch_hparams:
                    out_dim: 1
                    n_layers: 2
                    n_channels: 256
                    kernel_size: 11
                    p_dropout: 0.5

        energy_model_config:
            name: dap
            hparams:
                n_speaker_dim: 16
                bottleneck_hparams:
                    in_dim: 512
                    reduction_factor: 16
                    norm: weightnorm
                    non_linearity: relu
                take_log_of_input: false
                arch_hparams:
                    out_dim: 1
                    n_layers: 2
                    n_channels: 256
                    kernel_size: 3
                    p_dropout: 0.5
        v_model_config:
            name: dap
            hparams:
                n_speaker_dim: 16
                take_log_of_input: false
                bottleneck_hparams:
                    in_dim: 512
                    reduction_factor: 16
                    norm: weightnorm
                    non_linearity: relu
                arch_hparams:
                    out_dim: 1
                    n_layers: 2
                    n_channels: 256
                    kernel_size: 3
                    p_dropout: 0.5
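
        # The four predictors above all use the "dap" variant (presumably the
        # deterministic attribute predictor) with the same bottleneck + ConvNet
        # layout; they differ only in kernel_size, dropout, and whether the
        # input is log-transformed.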

trainer:
  devices: 8
  precision: 16
  max_epochs: 1000
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: False
  logger: False
  gradient_clip_val: 1
  flush_logs_every_n_steps: 1000
  log_every_n_steps: 100
  check_val_every_n_epoch: 2

exp_manager:
  exp_dir: ${export_dir}
  name: ${name}
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val/loss_energy
    mode: min
    dirpath: ${export_dir}
    filename: model_checkpoint