# waveglow.yaml
# This config contains the default values for training the WaveGlow model on the LJSpeech dataset.
# To train the model on a different dataset, adjust the config values to match that dataset.
# Most dataset-specific arguments are at the top of the config file; see below.

# Experiment name; reused further down through the ${name} interpolation.
name: WaveGlow

# Dataset manifests. `???` is presumably the OmegaConf "mandatory value" marker,
# meaning both entries must be supplied by the caller — TODO confirm.
train_dataset: ???
validation_datasets: ???

# Default values for a dataset with sample_rate=22050.
# The sections below pull these in through ${...} interpolations.
sample_rate: 22050

# STFT / window settings.
n_fft: 1024
n_window_size: 1024
n_window_stride: 256
window: hann

# Mel filterbank settings.
n_mel_channels: 80
lowfreq: 0
highfreq: 8000

# Model section: datasets, feature preprocessor, WaveGlow module and optimizer.
model:
  # NOTE(review): presumably the standard deviation of the Gaussian prior used
  # by WaveGlow — confirm against the model class.
  sigma: 1.0

  # Training data, read from the manifest given by the top-level `train_dataset`.
  train_ds:
    dataset:
      _target_: nemo.collections.tts.data.dataset.VocoderDataset
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      max_duration: null
      min_duration: 0.1
      # Only set for training; validation_ds below does not define n_segments.
      # NOTE(review): presumably the number of audio samples per training crop —
      # verify against VocoderDataset.
      n_segments: 16000
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 12
      num_workers: 4
      pin_memory: true

  # Validation data, read from the manifest given by `validation_datasets`.
  validation_ds:
    dataset:
      _target_: nemo.collections.tts.data.dataset.VocoderDataset
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      max_duration: null
      min_duration: 0.1
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 8
      num_workers: 4
      pin_memory: true

  # Audio-to-mel feature extractor; most values come from the top-level defaults.
  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${n_mel_channels}
    lowfreq: ${lowfreq}
    highfreq: ${highfreq}
    n_fft: ${n_fft}
    # Changing these parameters is not recommended, because WaveGlow is currently hardcoded to these values
    n_window_size: ${n_window_size}
    n_window_stride: ${n_window_stride}
    pad_to: 16
    pad_value: -11.52
    sample_rate: ${sample_rate}
    window: ${window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    log: true
    log_zero_guard_type: clamp
    # Written with an explicit decimal point so that YAML 1.1 loaders (e.g. plain
    # PyYAML) resolve it as a float instead of a string.
    log_zero_guard_value: 1.0e-5
    mag_power: 1.0

  # WaveGlow network hyper-parameters.
  waveglow:
    _target_: nemo.collections.tts.modules.waveglow.WaveGlowModule
    n_early_every: 4
    n_early_size: 2
    n_flows: 12
    n_group: 8
    n_mel_channels: ${n_mel_channels}
    n_wn_channels: 256
    n_wn_layers: 8
    wn_kernel_size: 3

  optim:
    name: adam
    # Explicit decimal point keeps this a float under YAML 1.1 loaders as well.
    lr: 1.0e-4

# Trainer settings. Checkpointing and logging are switched off here because,
# per the inline notes, they are provided by exp_manager instead.
trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  strategy: ddp
  precision: 16
  # No default: `???` is presumably the mandatory-value marker, so this must be
  # supplied by the caller — TODO confirm.
  max_epochs: ???
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 200
  check_val_every_n_epoch: 25
  benchmark: false

# Experiment-manager settings. The trainer section disables its own logger and
# checkpointing and marks both as "Provided by exp_manager".
exp_manager:
  # Identity / output location; the name follows the top-level `name` key.
  name: ${name}
  exp_dir: null

  # Resume behaviour (both disabled by default).
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # Callbacks and loggers.
  create_checkpoint_callback: true
  create_tensorboard_logger: true
  create_wandb_logger: false
  # Only relevant when create_wandb_logger is enabled — presumably forwarded to
  # the W&B logger; verify against exp_manager.
  wandb_logger_kwargs:
    entity: null
    project: null
    name: null