vits.yaml

# This config contains the default values for training VITS model on LJSpeech dataset.
# If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.

# TODO: remove unnecessary arguments, refactoring

name: VITS

train_dataset: ???
validation_datasets: ???
sup_data_path: null
sup_data_types: null

phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"

# Default values from librosa.pyin
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

sample_rate: 22050
n_mel_channels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: null
window: hann

model:
  pitch_fmin: ${pitch_fmin}
  pitch_fmax: ${pitch_fmax}

  sample_rate: ${sample_rate}
  n_mel_channels: ${n_mel_channels}
  n_window_size: ${n_window_size}
  n_window_stride: ${n_window_stride}
  n_fft: ${n_fft}
  lowfreq: ${lowfreq}
  highfreq: ${highfreq}
  window: ${window}
  mel_fmin: 0.0
  mel_fmax: null

  n_speakers: 0
  segment_size: 8192
  c_mel: 45
  c_kl: 1.
  use_spectral_norm: false

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: false
    g2p:
      _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.8
      # Relies on the heteronyms list for anything that needs to be disambiguated
      ignore_ambiguous_words: false
      use_chars: true
      use_stresses: true

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    
    dataloader_params:
      num_workers: 8
      pin_memory: false

    batch_sampler:
      batch_size: 32
      boundaries: [32,300,400,500,600,700,800,900,1000]
      num_replicas: ${trainer.devices}
      shuffle: true

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}

    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 16
      num_workers: 4
      pin_memory: false

  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${model.n_mel_channels}
    highfreq: ${model.highfreq}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1e-05
    lowfreq: ${model.lowfreq}
    n_fft: ${model.n_fft}
    n_window_size: ${model.n_window_size}
    n_window_stride: ${model.n_window_stride}
    pad_to: 1
    pad_value: 0
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob : 0
    mag_power: 1.0
    exact_pad: true
    use_grads: true

  synthesizer:
    _target_: nemo.collections.tts.modules.vits_modules.SynthesizerTrn
    inter_channels: 192
    hidden_channels: 192
    filter_channels: 768
    n_heads: 2 
    n_layers: 6
    kernel_size: 3
    p_dropout: 0.1
    resblock: "1"
    resblock_kernel_sizes: [3,7,11]
    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
    upsample_rates: [8,8,2,2]
    upsample_initial_channel: 512
    upsample_kernel_sizes: [16,16,4,4]
    n_speakers: ${model.n_speakers}
    gin_channels: 256 # for multi-speaker

  optim:
    _target_: torch.optim.AdamW
    lr: 2e-4
    betas: [0.9, 0.99]
    eps: 1e-9
  
    sched:
      name: ExponentialLR
      lr_decay: 0.999875

trainer:
  num_nodes: 1
  devices: 2
  accelerator: gpu
  strategy: ddp
  precision: 32
  # amp_backend: 'apex'
  # amp_level: 'O2'
  # benchmark: true
  max_epochs: -1
  accumulate_grad_batches: 1
  enable_checkpointing: false # Provided by exp_manager
  logger: false # Provided by exp_manager
  log_every_n_steps: 50
  check_val_every_n_epoch: 1

exp_manager:
  exp_dir: ???
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: loss_gen_all
    mode: min
  resume_if_exists: false
  resume_ignore_no_checkpoint: false