# tacotron2_44100.yaml

# TODO(Oktai15): update this config in 1.8.0 version

# Model/experiment name; exp_manager.name picks this up via ${name}.
name: Tacotron2
# Audio sampling rate; shared with both datasets and the preprocessor through
# ${sample_rate}.
sample_rate: 44100
# Character set referenced by model.labels. <PAD>, <BOS>, <EOS> are not listed
# here: they will be added by the tacotron2.py script.
# Every entry is quoted so that no symbol can be re-typed by the YAML parser.
# NOTE(review): entry order presumably determines each symbol's index - keep
# the sequence order stable and confirm with the consumer before reordering.
labels:
- ' '
- '!'
- '"'
- "'"
- '('
- ')'
- ','
- '-'
- '.'
- ':'
- ';'
- '?'
- 'a'
- 'b'
- 'c'
- 'd'
- 'e'
- 'f'
- 'g'
- 'h'
- 'i'
- 'j'
- 'k'
- 'l'
- 'm'
- 'n'
- 'o'
- 'p'
- 'q'
- 'r'
- 's'
- 't'
- 'u'
- 'v'
- 'w'
- 'x'
- 'y'
- 'z'
# Shared spectrogram settings, pulled into the model sections via ${...}
# interpolation (n_mels, n_fft, n_stride, fmax and pad_value are all referenced
# by model.preprocessor; n_mels also by the decoder and postnet).
n_mels: 80
n_fft: 2048
n_stride: 512
fmax: null
# NOTE(review): -11.52 is close to ln(1e-05), the preprocessor's
# log_zero_guard_value - presumably chosen to match; confirm.
pad_value: -11.52

# Dataset manifests. `???` is OmegaConf's marker for a mandatory value that
# must be supplied before the config is used.
train_dataset: ???
validation_datasets: ???

# Model definition; the dataset, module and optimizer sections are nested
# under this key.
model:
  # Reuses the top-level label list.
  labels: ${labels}
  # Training data: the dataset section (with its _target_ class) and the
  # dataloader_params section.
  train_ds:
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharDataset
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      min_duration: 0.1
      max_duration: null
      trim: false
      normalize: true
      int_values: false
      # The ids below are intentionally left commented out: these parameters
      # are added automatically in Tacotron2.
      # bos_id: 66
      # eos_id: 67
      # pad_id: 68
    dataloader_params:
      batch_size: 48
      shuffle: true
      drop_last: false
      num_workers: 4
      pin_memory: true


  # Validation data: same dataset class as training, read from
  # ${validation_datasets} and iterated without shuffling.
  validation_ds:
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharDataset
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      min_duration: 0.1
      max_duration: null
      trim: false
      normalize: true
      int_values: false
      # The ids below are intentionally left commented out: these parameters
      # are added automatically in Tacotron2.
      # bos_id: 66
      # eos_id: 67
      # pad_id: 68
    dataloader_params:
      batch_size: 48
      shuffle: false
      drop_last: false
      num_workers: 8
      pin_memory: true

  # Feature extraction. Shared top-level values (sample_rate, n_fft, n_stride,
  # n_mels, fmax, pad_value) are pulled in via ${...} interpolation.
  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    sample_rate: ${sample_rate}
    # Windowing. n_window_size is a literal that currently equals the
    # top-level n_fft (2048); it does not follow n_fft automatically.
    n_fft: ${n_fft}
    n_window_size: 2048
    n_window_stride: ${n_stride}
    window: hann
    preemph: null
    dither: 0.0
    # Filterbank range and size.
    nfilt: ${n_mels}
    lowfreq: 0
    highfreq: ${fmax}
    mag_power: 1.0
    # Log scaling. The guard value is written with an explicit decimal point
    # because YAML 1.1 loaders (e.g. plain PyYAML) read a dot-less exponent
    # such as 1e-05 as a string rather than a float.
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1.0e-05
    # Normalization, splicing and padding.
    normalize: null
    frame_splicing: 1
    pad_to: 16
    pad_value: ${pad_value}

  # Encoder module. The decoder section reads encoder_embedding_dim from here
  # via ${model.encoder.encoder_embedding_dim}.
  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_embedding_dim: 512
    encoder_n_convolutions: 3
    encoder_kernel_size: 5

  # Decoder module.
  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    # Values tied to other sections through interpolation.
    n_mel_channels: ${n_mels}
    encoder_embedding_dim: ${model.encoder.encoder_embedding_dim}
    n_frames_per_step: 1  # currently only 1 is supported
    # RNN / prenet sizes and dropout probabilities.
    decoder_rnn_dim: 1024
    prenet_dim: 256
    prenet_p_dropout: 0.5
    p_decoder_dropout: 0.1
    p_attention_dropout: 0.1
    # Attention parameters
    attention_rnn_dim: 1024
    attention_dim: 128
    # AttentionLocation Layer parameters
    attention_location_n_filters: 32
    attention_location_kernel_size: 31
    # Gate threshold and decoding-length settings.
    gate_threshold: 0.5
    max_decoder_steps: 1000
    early_stopping: true

  # Postnet module; its channel count follows the top-level n_mels.
  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${n_mels}
    postnet_embedding_dim: 512
    postnet_n_convolutions: 5
    postnet_kernel_size: 5
    p_dropout: 0.5

  # Optimizer and learning-rate schedule.
  # Floats are written with an explicit decimal point (1.0e-3, not 1e-3):
  # YAML 1.1 loaders such as plain PyYAML read a dot-less exponent as a string,
  # while this spelling is a float under every loader.
  optim:
    name: adam
    lr: 1.0e-3
    weight_decay: 1.0e-6

    # Scheduler configuration, nested under the optimizer.
    sched:
      name: CosineAnnealing
      min_lr: 1.0e-5


# Trainer settings. Checkpointing and the logger are switched off here because
# exp_manager provides them. max_epochs is mandatory (???) and must be
# supplied by the user.
trainer:
  devices: 1  # number of gpus
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  max_epochs: ???
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 60
  check_val_every_n_epoch: 2
  benchmark: false

# Experiment manager: creates the TensorBoard logger and checkpoint callback
# that the trainer section leaves disabled. The experiment name follows the
# top-level `name`.
exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true