# rad-tts_feature_pred.yaml
name: RadTTS
sample_rate: 22050

train_dataset: ???
validation_datasets: ???
ckpt_path: ???
export_dir: ???
sup_data_path: ???
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]
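
# All ??? fields are mandatory and are normally supplied as Hydra overrides on the
# command line. A minimal sketch, assuming NeMo's stock RadTTS training script
# (examples/tts/radtts.py in recent releases; adjust the script and --config-path
# for your checkout):
#   python examples/tts/radtts.py --config-name=rad-tts_feature_pred \
#       train_dataset=<train_manifest.json> validation_datasets=<val_manifest.json> \
#       sup_data_path=<sup_data_dir> ckpt_path=<pretrained_radtts.ckpt> \
#       export_dir=<output_dir> pitch_mean=<value> pitch_std=<value>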

# these frame-wise pitch statistics depend on pitch_fmin and pitch_fmax; you can
# obtain them by running `scripts/dataset_processing/tts/extract_sup_data.py`
# (see the example invocation below)
pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
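
# A hedged sketch of that invocation (the dataset config path and name are
# assumptions; point them at the ds_conf file matching your corpus). The script
# computes and reports the pitch mean/std for the dataset when it finishes:
#   python scripts/dataset_processing/tts/extract_sup_data.py \
#       --config-path <ds_conf_dir> --config-name <ds_config.yaml> \
#       manifest_filepath=<train_manifest.json> sup_data_path=<sup_data_dir>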

# default values from librosa.pyin
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789
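# these values correspond to musical notes C2 (~65.4 Hz) and C7 (~2093 Hz), the
# range librosa's documentation recommends for pyin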

# default values for sample_rate=22050
n_mels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: 8000
window: "hann"
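# with these settings one spectrogram frame covers 256/22050 ≈ 11.6 ms
# (~86.1 frames per second), each analysis window spans 1024/22050 ≈ 46.4 ms,
# and the 513 linear FFT bins are mapped onto 80 mel bands between 0 and 8000 Hz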

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
mapping_file_path: ""

model:
  target: nemo.collections.tts.models.RadTTSModel
  bin_loss_start_ratio: 0.2
  bin_loss_warmup_epochs: 100

  symbols_embedding_dim: 384
  n_mel_channels: ${n_mels}

  pitch_mean: ${pitch_mean}
  pitch_std: ${pitch_std}

  text_normalizer:
      _target_: nemo_text_processing.text_normalization.normalize.Normalizer
      lang: en
      input_case: cased

  text_normalizer_call_kwargs:
      verbose: false
      punct_pre_process: true
      punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.5
      mapping_file: ${mapping_file_path}
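
  # Note (assumption): phoneme_probability: 0.5 here and in the dataset tokenizers
  # below is understood to mean that each word is phonemized with 50% probability
  # during training, so the model sees a mix of grapheme and phoneme input.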

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}

      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: True
        stresses: True
        chars: True
        space: ' '
        silence: null
        apostrophe: True
        sep: '|'
        add_blank_at: null
        pad_with_space: True
        g2p:
          _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 32
      num_workers: 8
      pin_memory: True

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}

      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: True
        stresses: True
        chars: True
        space: ' '
        silence: null
        apostrophe: True
        sep: '|'
        add_blank_at: null
        pad_with_space: True
        g2p:
          _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 32
      num_workers: 8
      pin_memory: True

  optim:
    name: RAdam
    lr: 0.001
    betas: [0.9, 0.98]
    weight_decay: 0.000001

    sched:
      name: exp_decay
      warmup_steps: 40000
      last_epoch: -1
      d_model: 1  # Disable scaling based on model dim

  trainerConfig:
    sigma: 1
    iters_per_checkpoint: 3000
    seed: null
    ignore_layers: []
    finetune_layers: []
    include_layers: []
    with_tensorboard: true
    dur_loss_weight: 1
    ctc_loss_weight: 1
    mask_unvoiced_f0: false
    log_step: 1
    binarization_start_iter: 1000000
    kl_loss_start_iter: 1000000
    loss_weights:
        ctc_loss_weight: 0.1
        dur_loss_weight: 1.0
        f0_loss_weight: 1.0
        energy_loss_weight: 1.0
        vpred_loss_weight: 1.0
    unfreeze_modules: "durf0energyvpred"

  load_from_checkpoint: True
  init_from_ptl_ckpt: ${ckpt_path}
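  # This "feature_pred" stage presumably initializes from a pretrained RadTTS
  # checkpoint (${ckpt_path}) and fine-tunes only the attribute predictors
  # (duration, F0, energy, voicedness), per unfreeze_modules above.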
  modelConfig:
        _target_: "nemo.collections.tts.modules.radtts.RadTTSModule"
        n_speakers: 1
        n_speaker_dim: 16
        n_text: 384 #185
        n_text_dim: 512
        n_flows: 8
        n_conv_layers_per_step: 4
        n_mel_channels: 80
        n_hidden: 1024
        mel_encoder_n_hidden: 512
        n_components: 0
        mean_scale: 0
        fixed_gaussian: true
        dummy_speaker_embedding: false
        use_positional_embedding: false
        n_early_size: 2
        n_early_every: 2
        n_group_size: 2
        use_feature_gating: false
        affine_model: wavenet
        include_modules: "decatnunvbiasdpmvpredapm"
        what_to_train: decatnunvbias
        scaling_fn: tanh
        reduction_norm: ""
        matrix_decomposition: LUS
        learn_alignments: true
        use_query_proj: true
        align_query_enc_type: 3xconv
        lstm_applicable_steps: []
        use_context_lstm: true
        context_lstm_norm: spectral
        context_lstm_w_f0_and_energy: true
        text_encoder_lstm_norm: spectral
        use_text_conditional_priors: false
        zero_out_context: false
        n_aug_dims: 6
        n_f0_dims: 1
        n_energy_avg_dims: 1
        use_first_order_features: false
        unvoiced_bias_activation: "relu"
        decoder_use_partial_padding: false
        decoder_use_unvoiced_bias: true
        ap_pred_log_f0: true
        ap_use_unvoiced_bias: true
        ap_use_voiced_embeddings: true
        p_dropout: 0.1
        noise_to_unvoiced_in_f0: 0
        noise_to_pvoiced: 0
        dur_model_config:
            name: dap
            hparams:
                n_speaker_dim: 16
                bottleneck_hparams:
                    in_dim: 512
                    reduction_factor: 16
                    norm: weightnorm
                    non_linearity: relu
                take_log_of_input: true
                arch_hparams:
                    out_dim: 1
                    n_layers: 2
                    n_channels: 256
                    kernel_size: 3
                    p_dropout: 0.1
        f0_model_config:
            name: dap
            hparams:
                n_speaker_dim: 16
                bottleneck_hparams:
                    in_dim: 512
                    reduction_factor: 16
                    norm: weightnorm
                    non_linearity: relu
                take_log_of_input: false
                arch_hparams:
                    out_dim: 1
                    n_layers: 2
                    n_channels: 256
                    kernel_size: 11
                    p_dropout: 0.5

        energy_model_config:
            name: dap
            hparams:
                n_speaker_dim: 16
                bottleneck_hparams:
                    in_dim: 512
                    reduction_factor: 16
                    norm: weightnorm
                    non_linearity: relu
                take_log_of_input: false
                arch_hparams:
                    out_dim: 1
                    n_layers: 2
                    n_channels: 256
                    kernel_size: 3
                    p_dropout: 0.5
        v_model_config:
            name: dap
            hparams:
                n_speaker_dim: 16
                take_log_of_input: false
                bottleneck_hparams:
                    in_dim: 512
                    reduction_factor: 16
                    norm: weightnorm
                    non_linearity: relu
                arch_hparams:
                    out_dim: 1
                    n_layers: 2
                    n_channels: 256
                    kernel_size: 3
                    p_dropout: 0.5
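
        # The four predictors above all use the "dap" variant (presumably the
        # deterministic attribute predictor) with the same bottleneck + ConvNet
        # layout; they differ only in kernel_size, dropout, and whether the
        # input is log-transformed.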

trainer:
  devices: 8
  precision: 16
  max_epochs: 1000
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: False
  logger: False
  gradient_clip_val: 1
  flush_logs_every_n_steps: 1000
  log_every_n_steps: 100
  check_val_every_n_epoch: 2

exp_manager:
  exp_dir: ${export_dir}
  name: ${name}
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val/loss_energy
    mode: min
    dirpath: ${export_dir}
    filename: model_checkpoint