# fastpitch_align_44100.yaml

# This config contains the default values for training a FastPitch model with an aligner on audio sampled
# at 44.1 kHz. To train on a different dataset, adjust the config values to match that dataset.
# Most dataset-specific arguments are at the top of the config file; see below.

# Experiment name.
name: FastPitch

# `???` is the OmegaConf marker for a mandatory value: each of these must be
# supplied (e.g. on the command line) before the config can be used.
train_dataset: ???
validation_datasets: ???
sup_data_path: ???
sup_data_types:
  - align_prior_matrix
  - pitch

# Pitch search range. Default values from librosa.pyin.
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Frame-wise pitch statistics. They depend on pitch_fmin and pitch_fmax; obtain
# them by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
pitch_std: ???  # e.g. 68.52806091308594 for LJSpeech

# Audio / spectrogram settings; the `model` section picks these up through
# `${...}` interpolation.
sample_rate: 44100
n_mel_channels: 80
n_window_size: 2048
n_window_stride: 512
n_fft: 2048
lowfreq: 0
highfreq: null
window: hann

# Grapheme-to-phoneme resources used by the text tokenizer.
phoneme_dict_path: 'scripts/tts_dataset_files/cmudict-0.7b_nv22.10'
heteronyms_path: 'scripts/tts_dataset_files/heteronyms-052722'

model:
  # Alignment learning.
  learn_alignment: true
  bin_loss_warmup_epochs: 100

  # Model-size hyperparameters. `symbols_embedding_dim` is reused by the
  # sub-module sections of this config through `${model.symbols_embedding_dim}`.
  n_speakers: 1
  max_token_duration: 75
  symbols_embedding_dim: 384
  pitch_embedding_kernel_size: 3

  # Everything below mirrors the top-level value of the same name, so it only
  # needs to be changed in one place.
  pitch_fmin: ${pitch_fmin}
  pitch_fmax: ${pitch_fmax}
  pitch_mean: ${pitch_mean}
  pitch_std: ${pitch_std}

  sample_rate: ${sample_rate}
  n_mel_channels: ${n_mel_channels}
  n_window_size: ${n_window_size}
  n_window_stride: ${n_window_stride}
  n_fft: ${n_fft}
  lowfreq: ${lowfreq}
  highfreq: ${highfreq}
  window: ${window}

  # Text normalizer for English, configured for cased input.
  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  # Keyword arguments for the normalizer call.
  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  # English phoneme tokenizer with a nested G2P (grapheme-to-phoneme) component.
  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p
      # Both paths resolve to the top-level *_path settings of this file.
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.5

  # Training data: dataset definition plus its dataloader settings.
  train_ds:
    dataset:
      _target_: nemo.collections.tts.data.dataset.TTSDataset
      manifest_filepath: ${train_dataset}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}

      # Audio / spectrogram settings, taken from the `model` section.
      sample_rate: ${model.sample_rate}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}

      # Sample filtering.
      min_duration: 0.1
      max_duration: null
      ignore_file: null
      trim: false

      # Pitch settings, taken from the `model` section.
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
      pitch_norm: true
      pitch_mean: ${model.pitch_mean}
      pitch_std: ${model.pitch_std}

      use_beta_binomial_interpolator: true
    dataloader_params:
      batch_size: 32
      num_workers: 12
      shuffle: true
      drop_last: false
      pin_memory: true


  # Validation data: dataset definition plus its dataloader settings.
  validation_ds:
    dataset:
      _target_: nemo.collections.tts.data.dataset.TTSDataset
      manifest_filepath: ${validation_datasets}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}

      # Audio / spectrogram settings, taken from the `model` section.
      sample_rate: ${model.sample_rate}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}

      # Sample filtering: no duration limits are set here.
      min_duration: null
      max_duration: null
      ignore_file: null
      trim: false

      # Pitch settings, taken from the `model` section.
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
      pitch_norm: true
      pitch_mean: ${model.pitch_mean}
      pitch_std: ${model.pitch_std}

      use_beta_binomial_interpolator: true
    dataloader_params:
      batch_size: 32
      num_workers: 8
      shuffle: false
      drop_last: false
      pin_memory: true

  # Mel-spectrogram feature extractor. All audio / spectrogram settings are
  # taken from the `model` section so they stay in sync with the datasets.
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    features: ${model.n_mel_channels}
    lowfreq: ${model.lowfreq}
    highfreq: ${model.highfreq}
    n_fft: ${model.n_fft}
    # window_size / window_stride are switched off (false); the frame geometry
    # is given through n_window_size / n_window_stride instead.
    # NOTE(review): presumably the former are expressed in seconds and the
    # latter in samples - confirm against the preprocessor's signature.
    n_window_size: ${model.n_window_size}
    window_size: false
    n_window_stride: ${model.n_window_stride}
    window_stride: false
    pad_to: 1
    pad_value: 0
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    log: true
    log_zero_guard_type: add
    # Written with an explicit decimal point: YAML 1.1 loaders (e.g. plain
    # PyYAML) resolve `1e-05` as a string, while `1.0e-05` is a float everywhere.
    log_zero_guard_value: 1.0e-05
    mag_power: 1.0

  # Encoder-side feed-forward Transformer.
  # n_embed and padding_idx are added by the model.
  input_fft:
    _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
    # Both widths follow the symbol embedding size.
    d_model: ${model.symbols_embedding_dim}
    d_embed: ${model.symbols_embedding_dim}
    n_layer: 6
    n_head: 1
    d_head: 64
    d_inner: 1536
    kernel_size: 3
    dropout: 0.1
    dropatt: 0.1
    dropemb: 0.0

  # Decoder-side feed-forward Transformer.
  output_fft:
    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
    # Width follows the symbol embedding size.
    d_model: ${model.symbols_embedding_dim}
    n_layer: 6
    n_head: 1
    d_head: 64
    d_inner: 1536
    kernel_size: 3
    dropout: 0.1
    dropatt: 0.1
    dropemb: 0.0

  # Alignment encoder; its text channel count follows the symbol embedding size.
  alignment_module:
    _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
    n_text_channels: ${model.symbols_embedding_dim}

  # Duration and pitch predictors use the same module class with identical
  # settings; their input size follows the symbol embedding size.
  duration_predictor:
    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
    input_size: ${model.symbols_embedding_dim}
    n_layers: 2
    filter_size: 256
    kernel_size: 3
    dropout: 0.1

  pitch_predictor:
    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
    input_size: ${model.symbols_embedding_dim}
    n_layers: 2
    filter_size: 256
    kernel_size: 3
    dropout: 0.1

  # Optimizer and learning-rate schedule.
  optim:
    name: adamw
    # Scientific-notation values carry an explicit decimal point: YAML 1.1
    # loaders (e.g. plain PyYAML) resolve `1e-3` as a string, while `1.0e-3`
    # is a float under every loader.
    lr: 1.0e-3
    betas: [0.9, 0.999]
    weight_decay: 1.0e-6

    sched:
      name: NoamAnnealing
      warmup_steps: 1000
      last_epoch: -1
      d_model: 1  # Disable scaling based on model dim

# Trainer settings. Booleans are written in canonical lowercase form, matching
# the rest of this file.
trainer:
  num_nodes: 1
  devices: -1  # number of gpus
  accelerator: gpu
  strategy: ddp
  precision: 16
  max_epochs: 1500
  accumulate_grad_batches: 1
  gradient_clip_val: 1000.0
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 100
  check_val_every_n_epoch: 5
  benchmark: false

# Experiment manager settings. Booleans are written in canonical lowercase
# form, matching the rest of this file.
exp_manager:
  exp_dir: null
  name: ${name}  # resolves to the top-level `name`
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
  resume_if_exists: false
  resume_ignore_no_checkpoint: false