Commit 51782715 authored by liugh5

update

parent 8b4e9acd
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 24000
hop_length: 240
win_length: 1024
n_fft: 1024
n_mels: 80
fmin: 50.0
fmax: 8000.0
phone_level_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16
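As a reading aid, here is a minimal sketch of how the feature-extraction block above maps onto a mel-spectrogram computation. It assumes librosa is available; the file name and log floor are illustrative, not taken from this repo (preemphasize is False above, so no pre-emphasis is applied):

```python
# Illustrative mel extraction with the 24 kHz parameters above (not the repo's pipeline).
import librosa
import numpy as np

wav, _ = librosa.load("sample.wav", sr=24000)   # sampling_rate
wav = wav / max(1e-8, np.abs(wav).max())        # wav_normalize
wav, _ = librosa.effects.trim(wav, top_db=60)   # trim_silence_threshold_db

mel = librosa.feature.melspectrogram(
    y=wav, sr=24000, n_fft=1024, hop_length=240,
    win_length=1024, n_mels=80, fmin=50.0, fmax=8000.0,
)
log_mel = np.log(np.maximum(mel, 1e-5))         # log compression with an assumed floor
print(log_mel.shape)                            # (80, n_frames)
```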
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 48000
hop_length: 600
win_length: 2400
n_fft: 4096
n_mels: 128
fmin: 0.0
fmax: 12000.0
phone_level_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 8000
hop_length: 100
win_length: 600
n_fft: 2048
n_mels: 80
fmin: 0.0
fmax: 4000.0
phone_level_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 16000
hop_length: 200
win_length: 1000
n_fft: 2048
n_mels: 80
fmin: 0.0
fmax: 8000.0
phone_level_feature: True
se_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16
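The normalization block admits two modes, "mean_std" and "global". A hedged sketch of what each might compute; the exact formulas are assumptions based on common TTS practice (Tacotron-style dB scaling for the "global" branch), not taken from this repo:

```python
import numpy as np

def normalize(log_mel_db, norm_type="mean_std", mean=None, std=None,
              min_level_db=-100.0, max_norm=1.0, symmetric=False):
    """Illustrative normalization; formulas are assumed, not from the repo."""
    if norm_type == "mean_std":
        # Per-dimension standardization with precomputed dataset statistics.
        return (log_mel_db - mean) / std
    # "global": scale dB values into [0, max_norm] (or [-max_norm, max_norm]).
    x = np.clip((log_mel_db - min_level_db) / -min_level_db, 0.0, 1.0)
    return (2.0 * x - 1.0) * max_norm if symmetric else x * max_norm
```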
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [10, 5, 2, 2]
upsample_kernal_sizes: [20, 11, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: false
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
nsf_params:
nb_harmonics: 7
sampling_rate: 16000
nsf_norm_type: "global" # "mean_std" or "global"
nsf_f0_global_minimum: 30.0
nsf_f0_global_maximum: 730.0
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 16000
fft_size: 2048
hop_size: 200
win_length: 1000
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: False
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
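One invariant worth noting: the generator's total upsampling factor must equal the frame hop of its input features. Here 10 * 5 * 2 * 2 = 200, matching hop_length: 200 of the 16 kHz audio config, and batch_max_steps: 9600 is exactly 48 frames of 200 samples. A small sanity check one could run (a sketch; the variable names mirror the YAML keys above):

```python
import math

upsample_scales = [10, 5, 2, 2]   # Generator.params.upsample_scales
hop_length = 200                  # audio_config.hop_length (16 kHz config)
batch_max_steps = 9600            # data loader setting

assert math.prod(upsample_scales) == hop_length, "generator must upsample by one hop"
assert batch_max_steps % hop_length == 0, "batch_max_steps must be divisible by hop_size"
print(batch_max_steps // hop_length, "mel frames per training clip")  # -> 48
```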
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [10, 5, 2, 2]
upsample_kernal_sizes: [20, 11, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: false
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
nsf_params:
nb_harmonics: 7
sampling_rate: 16000
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 16000
fft_size: 2048
hop_size: 200
win_length: 1000
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: False
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
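With the weights in the Loss section, the generator objective is the adversarial term plus a heavily weighted mel-reconstruction term and a feature-matching term. A sketch of how the weighted sum might be assembled (the loss tensors here are placeholders and the function name is illustrative):

```python
import torch

def generator_total_loss(adv_loss, mel_loss, fm_loss):
    # Weights from the Loss section above: adv 1.0, mel 45.0, feat_match 2.0.
    return 1.0 * adv_loss + 45.0 * mel_loss + 2.0 * fm_loss

total = generator_total_loss(torch.tensor(0.9), torch.tensor(0.3), torch.tensor(0.5))
print(float(total))  # 0.9 + 45*0.3 + 2*0.5 = 15.4
```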
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [10, 5, 2, 2]
upsample_kernal_sizes: [20, 11, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: false
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 16000
fft_size: 2048
hop_size: 200
win_length: 1000
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [10, 5, 2, 2]
upsample_kernal_sizes: [20, 10, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 16000
fft_size: 2048
hop_size: 200
win_length: 1000
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 512
kernel_size: 7
upsample_scales: [8, 5, 3, 2]
upsample_kernal_sizes: [16, 10, 6, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 24000
fft_size: 1024
hop_size: 240
win_length: 1024
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 128
out_channels: 1
channels: 512
kernel_size: 7
upsample_scales: [10, 5, 3, 2, 2]
upsample_kernal_sizes: [20, 10, 6, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 48000
fft_size: 4096
hop_size: 600
win_length: 2400
window: "hann"
num_mels: 128
fmin: 0
fmax: 12000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 19200 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [5, 5, 2, 2]
upsample_kernal_sizes: [10, 10, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 8000
fft_size: 2048
hop_size: 100
win_length: 600
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 6000 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 512
kernel_size: 7
upsample_scales: [8, 5, 3, 2]
upsample_kernal_sizes: [16, 10, 6, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
nsf_params:
nb_harmonics: 7
sampling_rate: 24000
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 24000
fft_size: 1024
hop_size: 240
win_length: 1024
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: False
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
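The nsf_params block enables a neural-source-filter style excitation: a harmonic sine source built from F0 with nb_harmonics: 7 overtones at the 24 kHz target rate. A rough numpy sketch of such a source signal follows; this is a generic NSF-style construction with assumed amplitude and noise constants, not this repo's exact module:

```python
import numpy as np

def harmonic_source(f0, sr=24000, nb_harmonics=7, amp=0.1):
    """f0: per-sample F0 in Hz (0 where unvoiced). Returns a sine-mixture excitation."""
    phase = 2.0 * np.pi * np.cumsum(f0 / sr)            # fundamental phase track
    voiced = (f0 > 0).astype(np.float64)
    # Fundamental plus nb_harmonics overtones, averaged into one excitation.
    harmonics = [np.sin((k + 1) * phase) for k in range(1 + nb_harmonics)]
    src = amp * voiced * np.mean(harmonics, axis=0)
    noise = 0.003 * np.random.randn(len(f0))            # noise branch for unvoiced parts
    return src + noise

f0 = np.full(24000, 220.0)        # one second of a 220 Hz tone
print(harmonic_source(f0).shape)  # (24000,)
```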
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
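The NoamLR scheduler follows the Transformer warmup schedule: the rate rises for warmup_steps: 4000 steps and then decays as the inverse square root of the step. A sketch of the usual formula; the repo's exact scaling may differ:

```python
def noam_lr(step, base_lr=0.001, warmup_steps=4000):
    """Transformer-style warmup: linear ramp, then step**-0.5 decay."""
    step = max(step, 1)
    scale = warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return base_lr * scale

print(noam_lr(4000))   # peak: base_lr
print(noam_lr(16000))  # decayed to base_lr / 2
```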
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
AttentionCTCLoss:
enable: True
AttentionBinarizationLoss:
enable: True
params:
start_epoch: 0
warmup_epoch: 100
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
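AttentionBinarizationLoss is ramped in by epoch, starting at start_epoch: 0 and reaching full strength after warmup_epoch: 100. A plausible weight schedule is sketched below; the linear ramp is an assumption, the repo may ramp differently:

```python
def binarization_weight(epoch, start_epoch=0, warmup_epoch=100):
    """Assumed linear ramp of the binarization loss weight from 0 to 1."""
    if epoch < start_epoch:
        return 0.0
    return min(1.0, (epoch - start_epoch) / float(warmup_epoch))

print(binarization_weight(50))   # 0.5
print(binarization_weight(150))  # 1.0
```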
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: True
using_byte: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: byte_index,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
AttentionCTCLoss:
enable: True
AttentionBinarizationLoss:
enable: True
params:
start_epoch: 0
warmup_epoch: 100
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 8
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
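With using_byte: True the front end replaces the sy/tone/syllable_flag/word_segment features with raw UTF-8 byte indices (lfeat_type_list: byte_index,emo_category,speaker_category). A sketch of byte tokenization; the index offset and padding convention here are assumptions:

```python
def text_to_byte_ids(text, offset=1):
    """Map text to UTF-8 byte indices; index 0 is assumed reserved for padding."""
    return [b + offset for b in text.encode("utf-8")]

print(text_to_byte_ids("Hi!"))  # [73, 106, 34]
```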
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 900
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 128
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
FP: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7,F74,M7,FBYN,FRXL,xiaoyu
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
FpCELoss:
enable: True
params:
loss_type: ce
weight: [1,4,4,8]
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
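FpCELoss uses class-weighted cross-entropy with weight: [1,4,4,8], upweighting the rarer filled-pause classes. A minimal PyTorch sketch; the class semantics are an assumption (index 0 presumably meaning "no pause"):

```python
import torch
import torch.nn as nn

# Class-weighted CE as configured: rare filled-pause classes count 4-8x more.
criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 4.0, 4.0, 8.0]))

logits = torch.randn(16, 4)           # (batch, num_fp_classes)
targets = torch.randint(0, 4, (16,))  # ground-truth class per token
loss = criterion(logits, targets)
print(loss.item())
```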
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 82
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
NSF: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 10000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 2300500 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
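With NSF: True the acoustic target grows from 80 to 82 channels (num_mels: 82). A common construction, assumed here rather than confirmed by the repo, is to append per-frame F0 and a voiced/unvoiced flag to the 80-band mel:

```python
import numpy as np

n_frames = 200
mel = np.random.randn(n_frames, 80)   # 80-band log-mel
f0 = np.random.rand(n_frames, 1)      # normalized per-frame F0 (placeholder values)
uv = (f0 > 0.1).astype(np.float32)    # voiced/unvoiced flag

features = np.concatenate([mel, f0, uv], axis=1)
print(features.shape)  # (200, 82) -> matches num_mels: 82
```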
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 82
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
NSF: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.