# fastpitch_ssl.yaml

# This config contains the default values for training the SSL-based FastPitch model (with a learned aligner).
# The reference pitch statistics given in the comments below come from the LJSpeech dataset.
# To train the model on another dataset, change the config values to match that dataset.
# Most dataset-specific arguments are at the head of the config file; see below.

# Experiment name; reused as exp_manager.name through ${name}.
name: FastPitch

# Required inputs: `???` is the OmegaConf marker for a mandatory value with no default,
# so each of these must be supplied (for example as a command-line override).
# train_dataset / validation_datasets become the manifest_filepath of the train / validation datasets.
train_dataset: ???
validation_datasets: ???
# Forwarded to model.ssl_model_ckpt_path.
ssl_model_ckpt_path: ???
# NOTE(review): hifi_ckpt_path is not referenced by any other key visible in this config;
# presumably it is read directly by the training code — confirm.
hifi_ckpt_path: ???
# Forwarded to the sup_data_dir argument of both datasets.
sup_data_dir: null

# LJSpeech stats (per frame); the reference values are kept in the trailing comments.
# Ignored if pitch_normalization: speaker_wise.
# NOTE(review): both keys are mandatory (`???`) and are interpolated into the dataset sections,
# even though pitch_normalization defaults to speaker_wise in this file — confirm whether they
# really have to be supplied in that mode.
pitch_mean: ???  # 212.35873413085938
pitch_std: ???  # 68.52806091308594

# Default values for dataset with sample_rate=22050
# Each key is forwarded to the same-named key under `model` through ${...} interpolation.
sample_rate: 22050
n_mel_channels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
# presumably the lower/upper frequency bounds (Hz) of the mel filterbank — TODO confirm
lowfreq: 0
highfreq: 8000
window: hann


# SSL and speaker-conditioning options; most are interpolated into the model / dataset sections below.
ssl_content_emb_type: "embedding_and_probs"
speaker_stats_pitch_fp: null
pitch_normalization: speaker_wise
# Also drives model.use_duration_predictor.
use_unique_tokens: true
speaker_conditioning_type: per_sample
# NOTE(review): segment_speaker_embedding is not referenced by any other key visible in this
# config; presumably it is consumed by the training code — confirm.
segment_speaker_embedding: true
ssl_downsampling_factor: 4  # How many mel-spectrogram frames map to one content embedding in the SSL model

# Model hyperparameters. Values written as ${...} are interpolated from the top-level keys of this file.
model:
  ssl_model_ckpt_path: ${ssl_model_ckpt_path}
  ssl_downsampling_factor: ${ssl_downsampling_factor}
  use_encoder: true
  # Follows the top-level use_unique_tokens flag.
  use_duration_predictor: ${use_unique_tokens}
  pitch_conditioning: true
  pitch_loss_scale: 1.0
  learn_alignment: true
  bin_loss_warmup_epochs: 100

  n_speakers: 1
  n_datasets: 1
  max_token_duration: 75
  # Also used as d_model of encoder / output_fft and as input_size of both predictors.
  symbols_embedding_dim: 384
  pitch_embedding_kernel_size: 3

  # Audio / mel-spectrogram settings mirrored from the top-level defaults.
  sample_rate: ${sample_rate}
  n_mel_channels: ${n_mel_channels}
  n_window_size: ${n_window_size}
  n_window_stride: ${n_window_stride}
  n_fft: ${n_fft}
  lowfreq: ${lowfreq}
  highfreq: ${highfreq}
  window: ${window}

  # Input / output sizes for the content and speaker embeddings.
  # NOTE(review): 192 + 192 equals symbols_embedding_dim (384); presumably the two projected
  # embeddings are concatenated — confirm against the model code.
  content_emb_indim: 174
  speaker_emb_indim: 256
  content_emb_outdim: 192
  speaker_emb_outdim: 192
  
  # Training data: dataset constructor arguments plus dataloader settings.
  train_ds:
    dataset:
      # `_target_` names the class to instantiate; the remaining keys are its arguments.
      _target_: nemo.collections.tts.data.dataset.FastPitchSSLDataset
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      ssl_content_emb_type: ${ssl_content_emb_type}
      pitch_conditioning: true
      pitch_normalization: ${pitch_normalization}
      pitch_mean: ${pitch_mean}
      pitch_std: ${pitch_std}
      speaker_stats_pitch_fp: ${speaker_stats_pitch_fp}
      # presumably clip-length bounds in seconds — TODO confirm
      min_duration: 0.5
      max_duration: 16.0
      pad_multiple: 1024
      speaker_conditioning_type: ${speaker_conditioning_type}
      sup_data_dir: ${sup_data_dir}

    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 2
      num_workers: 8
      pin_memory: true

  # Validation data: same dataset arguments as train_ds except for the manifest;
  # the dataloader differs in shuffle (false) and num_workers (0).
  validation_ds:
    dataset:
      # `_target_` names the class to instantiate; the remaining keys are its arguments.
      _target_: nemo.collections.tts.data.dataset.FastPitchSSLDataset
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      ssl_content_emb_type: ${ssl_content_emb_type}
      pitch_conditioning: true
      pitch_normalization: ${pitch_normalization}
      pitch_mean: ${pitch_mean}
      pitch_std: ${pitch_std}
      speaker_stats_pitch_fp: ${speaker_stats_pitch_fp}
      # presumably clip-length bounds in seconds — TODO confirm
      min_duration: 0.5
      max_duration: 16.0
      pad_multiple: 1024
      speaker_conditioning_type: ${speaker_conditioning_type}
      sup_data_dir: ${sup_data_dir}

    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 2
      num_workers: 0
      pin_memory: true

  # Both encoder and decoder (output_fft) have the same architecture, FFTransformerDecoder.
  # n_embed and padding_idx are added by the model.
  encoder:
    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
    n_layer: 6
    n_head: 1
    # Tied to model.symbols_embedding_dim (384 by default).
    d_model: ${model.symbols_embedding_dim}
    d_head: 64
    d_inner: 1536
    kernel_size: 3
    dropout: 0.1
    dropatt: 0.1
    dropemb: 0.0

  # Decoder stack; uses the same class and the same hyperparameter values as `encoder`.
  output_fft:
    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
    n_layer: 6
    n_head: 1
    # Tied to model.symbols_embedding_dim (384 by default).
    d_model: ${model.symbols_embedding_dim}
    d_head: 64
    d_inner: 1536
    kernel_size: 3
    dropout: 0.1
    dropatt: 0.1
    dropemb: 0.0

  # Duration predictor; shares the TemporalPredictor class and settings with pitch_predictor.
  # NOTE(review): presumably only used when model.use_duration_predictor is true — confirm.
  duration_predictor:
    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
    # Tied to model.symbols_embedding_dim (384 by default).
    input_size: ${model.symbols_embedding_dim}
    kernel_size: 3
    filter_size: 256
    dropout: 0.1
    n_layers: 2

  # Pitch predictor; shares the TemporalPredictor class and settings with duration_predictor.
  pitch_predictor:
    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
    # Tied to model.symbols_embedding_dim (384 by default).
    input_size: ${model.symbols_embedding_dim}
    kernel_size: 3
    filter_size: 256
    dropout: 0.1
    n_layers: 2

  # Optimizer: AdamW with explicit learning rate and betas; other arguments are not set here.
  optim:
    _target_: torch.optim.AdamW
    lr: 0.0002
    betas: [0.8, 0.99]

# Trainer settings (PyTorch Lightning style keys).
trainer:
  num_nodes: 1
  # -1 selects all available devices of the chosen accelerator.
  devices: -1
  accelerator: gpu
  strategy: ddp
  precision: 32
  max_epochs: 1000
  accumulate_grad_batches: 1
  gradient_clip_val: 1000.0
  # Checkpointing and logging are disabled here because exp_manager sets them up instead.
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 100
  # Validation runs once every 5 training epochs.
  check_val_every_n_epoch: 5
  benchmark: false

# Experiment manager: creates the TensorBoard logger and checkpoint callback that the
# trainer section leaves disabled.
exp_manager:
  exp_dir: null
  # Reuses the top-level experiment name.
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # NOTE(review): must match the metric name logged by the model during validation — confirm.
    monitor: v_loss
  resume_if_exists: false
  resume_ignore_no_checkpoint: false