# hifigan_v1_48k.yaml — HiFi-GAN v1 vocoder training configuration for 48 kHz audio.
model_type: hifigan
Model:
  ###########################################################
  #         GENERATOR NETWORK ARCHITECTURE SETTING          #
  ###########################################################
  Generator:
    params:
      in_channels: 128                   # Input feature channels (matches num_mels in mel_loss).
      out_channels: 1                    # Mono waveform output.
      channels: 512                      # Initial channel count of the upsampling stack.
      kernel_size: 7                     # Kernel size of the initial and final conv layers.
      # Product of upsample_scales (10 * 5 * 3 * 2 * 2 = 600) must equal the
      # feature hop size (hop_size: 600 in mel_loss below).
      upsample_scales: [10, 5, 3, 2, 2]
      # NOTE(review): key spelled "kernal" — confirm the consuming code expects
      # this exact spelling (ParallelWaveGAN uses "upsample_kernel_sizes").
      upsample_kernal_sizes: [20, 10, 6, 4, 4]
      resblock_kernel_sizes: [3, 7, 11]  # Kernel sizes of the multi-receptive-field resblocks.
      resblock_dilations:                # One dilation list per resblock kernel size.
        - [1, 3, 5, 7]
        - [1, 3, 5, 7]
        - [1, 3, 5, 7]
      bias: true
      causal: true                       # Use causal convolutions.
      nonlinear_activation: "LeakyReLU"
      nonlinear_activation_params:
        negative_slope: 0.1
      use_weight_norm: true              # Apply weight normalization to conv layers.
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
          - 200000
          - 400000
          - 600000
          - 800000

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
  MultiScaleDiscriminator:
    params:
      scales: 3                              
      downsample_pooling: "DWT"  
      downsample_pooling_params:
          kernel_size: 4                    
          stride: 2                         
          padding: 2                        
      discriminator_params:
          in_channels: 1                     
          out_channels: 1                    
          kernel_sizes: [15, 41, 5, 3]       
          channels: 128                      
          max_downsample_channels: 1024     
          max_groups: 16                   
          bias: true
          downsample_scales: [4, 4, 4, 4, 1]
          nonlinear_activation: "LeakyReLU"  
          nonlinear_activation_params:
            negative_slope: 0.1
      follow_official_norm: true    
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000

  MultiPeriodDiscriminator:
    params:
      periods: [2, 3, 5, 7, 11]          # Reshape periods (primes, as in the official HiFi-GAN).
      discriminator_params:
        in_channels: 1                   # Raw waveform input.
        out_channels: 1
        kernel_sizes: [5, 3]
        channels: 32                     # Initial channel count.
        downsample_scales: [3, 3, 3, 3, 1]
        max_downsample_channels: 1024    # Channel cap while downsampling.
        bias: true
        nonlinear_activation: "LeakyReLU"
        nonlinear_activation_params:
          negative_slope: 0.1
        use_spectral_norm: false         # Weight norm is used when false.
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
          - 200000
          - 400000
          - 600000
          - 800000

####################################################
#                   LOSS SETTING                   #
####################################################
Loss:
  generator_adv_loss:
    enable: true
    params:
      average_by_discriminators: false   # Sum (not average) over discriminators.
    weights: 1.0

  discriminator_adv_loss:
    enable: true
    params:
      average_by_discriminators: false
    weights: 1.0

  stft_loss:
    enable: false                        # Whether to use multi-resolution STFT loss.

  mel_loss:
    enable: true
    params:
      fs: 48000                          # Sampling rate.
      fft_size: 4096
      hop_size: 600                      # Must equal the generator's total upsampling factor.
      win_length: 2400
      window: "hann"
      num_mels: 128                      # Must match the generator's in_channels.
      fmin: 0
      fmax: 12000
      log_base: null                     # null presumably selects natural log — confirm in loss code.
    weights: 45.0

  subband_stft_loss:
    enable: false
    params:
      fft_sizes: [384, 683, 171]         # List of FFT sizes for STFT-based loss.
      hop_sizes: [35, 75, 15]            # List of hop sizes for STFT-based loss.
      win_lengths: [150, 300, 60]        # List of window lengths for STFT-based loss.
      window: "hann_window"              # Window function for STFT-based loss.

  feat_match_loss:
    enable: true
    params:
      average_by_discriminators: false
      average_by_layers: false
    weights: 2.0


###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 16
batch_max_steps: 19200       # Samples per audio clip in a batch; 19200 / hop_size (600) = 32 frames.
pin_memory: true
num_workers: 2               # FIXME: setting > 0 may get stuck on macOS.
remove_short_samples: false
allow_cache: true            # Cache loaded features in memory (trades RAM for speed).

generator_grad_norm: -1      # -1 presumably disables generator gradient clipping — confirm in trainer.

discriminator_grad_norm: -1  # -1 presumably disables discriminator gradient clipping — confirm in trainer.

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
generator_train_start_steps: 1     # Number of steps before the generator starts training.
discriminator_train_start_steps: 0 # Number of steps before the discriminator starts training.
train_max_steps: 2500000           # Total number of training steps.
save_interval_steps: 20000         # Interval steps to save checkpoint.
eval_interval_steps: 10000         # Interval steps to evaluate the network.
log_interval_steps: 1000           # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.