Commit ee10550a authored by liugh5

Initial commit
import torch
import torch.nn as nn
import torch.nn.functional as F
from kantts.models.sambert import FFTBlock, PNCABlock, Prenet
from kantts.models.sambert.positions import (
SinusoidalPositionEncoder,
DurSinusoidalPositionEncoder,
)
from kantts.models.sambert.adaptors import (
LengthRegulator,
VarFsmnRnnNARPredictor,
VarRnnARPredictor,
)
from kantts.models.sambert.fsmn import FsmnEncoderV2
from kantts.models.sambert.alignment import b_mas
from kantts.models.sambert.attention import ConvAttention
from kantts.models.utils import get_mask_from_lengths
class SelfAttentionEncoder(nn.Module):
def __init__(
self,
n_layer,
d_in,
d_model,
n_head,
d_head,
d_inner,
dropout,
dropout_att,
dropout_relu,
position_encoder,
):
super(SelfAttentionEncoder, self).__init__()
self.d_in = d_in
self.d_model = d_model
self.dropout = dropout
d_in_lst = [d_in] + [d_model] * (n_layer - 1)
self.fft = nn.ModuleList(
[
FFTBlock(
d,
d_model,
n_head,
d_head,
d_inner,
(3, 1),
dropout,
dropout_att,
dropout_relu,
)
for d in d_in_lst
]
)
self.ln = nn.LayerNorm(d_model, eps=1e-6)
self.position_enc = position_encoder
def forward(self, input, mask=None, return_attns=False):
input *= self.d_model ** 0.5
if isinstance(self.position_enc, SinusoidalPositionEncoder):
input = self.position_enc(input)
else:
raise NotImplementedError
input = F.dropout(input, p=self.dropout, training=self.training)
enc_slf_attn_list = []
max_len = input.size(1)
if mask is not None:
slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
else:
slf_attn_mask = None
enc_output = input
for id, layer in enumerate(self.fft):
enc_output, enc_slf_attn = layer(
enc_output, mask=mask, slf_attn_mask=slf_attn_mask
)
if return_attns:
enc_slf_attn_list += [enc_slf_attn]
enc_output = self.ln(enc_output)
return enc_output, enc_slf_attn_list
class HybridAttentionDecoder(nn.Module):
def __init__(
self,
d_in,
prenet_units,
n_layer,
d_model,
d_mem,
n_head,
d_head,
d_inner,
dropout,
dropout_att,
dropout_relu,
d_out,
):
super(HybridAttentionDecoder, self).__init__()
self.d_model = d_model
self.dropout = dropout
self.prenet = Prenet(d_in, prenet_units, d_model)
self.dec_in_proj = nn.Linear(d_model + d_mem, d_model)
self.pnca = nn.ModuleList(
[
PNCABlock(
d_model,
d_mem,
n_head,
d_head,
d_inner,
(1, 1),
dropout,
dropout_att,
dropout_relu,
)
for _ in range(n_layer)
]
)
self.ln = nn.LayerNorm(d_model, eps=1e-6)
self.dec_out_proj = nn.Linear(d_model, d_out)
def reset_state(self):
for layer in self.pnca:
layer.reset_state()
def get_pnca_attn_mask(
self, device, max_len, x_band_width, h_band_width, masks=None
):
if masks is not None:
pnca_attn_mask = masks.unsqueeze(1).expand(-1, max_len, -1)
else:
pnca_attn_mask = None
range_ = torch.arange(max_len).to(device)
x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :]
x_end = (range_ + 1)[None, None, :]
h_start = range_[None, None, :]
h_end = torch.clamp_max(range_ + h_band_width + 1, max_len + 1)[None, None, :]
pnca_x_attn_mask = ~(
(x_start <= range_[None, :, None]) & (x_end > range_[None, :, None])
).transpose(1, 2)
pnca_h_attn_mask = ~(
(h_start <= range_[None, :, None]) & (h_end > range_[None, :, None])
).transpose(1, 2)
if pnca_attn_mask is not None:
pnca_x_attn_mask = pnca_x_attn_mask | pnca_attn_mask
pnca_h_attn_mask = pnca_h_attn_mask | pnca_attn_mask
pnca_x_attn_mask = pnca_x_attn_mask.masked_fill(
pnca_attn_mask.transpose(1, 2), False
)
pnca_h_attn_mask = pnca_h_attn_mask.masked_fill(
pnca_attn_mask.transpose(1, 2), False
)
return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask
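# Note on the masks built above (descriptive, inferred from the code): True marks a
# position that is masked out. pnca_x_attn_mask lets decoder step t self-attend only
# to the causal window [t - x_band_width, t]; pnca_h_attn_mask lets it attend to
# memory positions [t, t + h_band_width]. Rows for padded query positions are reset
# to unmasked at the end, presumably to avoid fully-masked softmax rows.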
# reset_state() must be called before forward()
def forward(
self, input, memory, x_band_width, h_band_width, masks=None, return_attns=False
):
input = self.prenet(input)
input = torch.cat([memory, input], dim=-1)
input = self.dec_in_proj(input)
if masks is not None:
input = input.masked_fill(masks.unsqueeze(-1), 0)
input *= self.d_model ** 0.5
input = F.dropout(input, p=self.dropout, training=self.training)
max_len = input.size(1)
pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
input.device, max_len, x_band_width, h_band_width, masks
)
dec_pnca_attn_x_list = []
dec_pnca_attn_h_list = []
dec_output = input
for id, layer in enumerate(self.pnca):
dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
dec_output,
memory,
masks=masks,
pnca_x_attn_mask=pnca_x_attn_mask,
pnca_h_attn_mask=pnca_h_attn_mask,
)
if return_attns:
dec_pnca_attn_x_list += [dec_pnca_attn_x]
dec_pnca_attn_h_list += [dec_pnca_attn_h]
dec_output = self.ln(dec_output)
dec_output = self.dec_out_proj(dec_output)
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
# reset_state() must be called before infer() when step == 0
def infer(
self,
step,
input,
memory,
x_band_width,
h_band_width,
masks=None,
return_attns=False,
):
max_len = memory.size(1)
input = self.prenet(input)
input = torch.cat([memory[:, step : step + 1, :], input], dim=-1)
input = self.dec_in_proj(input)
input *= self.d_model ** 0.5
input = F.dropout(input, p=self.dropout, training=self.training)
pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
input.device, max_len, x_band_width, h_band_width, masks
)
dec_pnca_attn_x_list = []
dec_pnca_attn_h_list = []
dec_output = input
for id, layer in enumerate(self.pnca):
if masks is not None:
mask_step = masks[:, step : step + 1]
else:
mask_step = None
dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
dec_output,
memory,
mask=mask_step,
pnca_x_attn_mask=pnca_x_attn_mask[:, step : step + 1, : (step + 1)],
pnca_h_attn_mask=pnca_h_attn_mask[:, step : step + 1, :],
)
if return_attns:
dec_pnca_attn_x_list += [dec_pnca_attn_x]
dec_pnca_attn_h_list += [dec_pnca_attn_h]
dec_output = self.ln(dec_output)
dec_output = self.dec_out_proj(dec_output)
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
class TextFftEncoder(nn.Module):
def __init__(self, config):
super(TextFftEncoder, self).__init__()
d_emb = config["embedding_dim"]
self.using_byte = False
if config.get("using_byte", False):
self.using_byte = True
nb_ling_byte_index = config["byte_index"]
self.byte_index_emb = nn.Embedding(nb_ling_byte_index, d_emb)
else:
# linguistic unit lookup table
nb_ling_sy = config["sy"]
nb_ling_tone = config["tone"]
nb_ling_syllable_flag = config["syllable_flag"]
nb_ling_ws = config["word_segment"]
self.sy_emb = nn.Embedding(nb_ling_sy, d_emb)
self.tone_emb = nn.Embedding(nb_ling_tone, d_emb)
self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb)
self.ws_emb = nn.Embedding(nb_ling_ws, d_emb)
max_len = config["max_len"]
nb_layers = config["encoder_num_layers"]
nb_heads = config["encoder_num_heads"]
d_model = config["encoder_num_units"]
d_head = d_model // nb_heads
d_inner = config["encoder_ffn_inner_dim"]
dropout = config["encoder_dropout"]
dropout_attn = config["encoder_attention_dropout"]
dropout_relu = config["encoder_relu_dropout"]
d_proj = config["encoder_projection_units"]
self.d_model = d_model
position_enc = SinusoidalPositionEncoder(max_len, d_emb)
self.ling_enc = SelfAttentionEncoder(
nb_layers,
d_emb,
d_model,
nb_heads,
d_head,
d_inner,
dropout,
dropout_attn,
dropout_relu,
position_enc,
)
self.ling_proj = nn.Linear(d_model, d_proj, bias=False)
def forward(self, inputs_ling, masks=None, return_attns=False):
# Parse inputs_ling_seq
if self.using_byte:
inputs_byte_index = inputs_ling[:, :, 0]
byte_index_embedding = self.byte_index_emb(inputs_byte_index)
ling_embedding = byte_index_embedding
else:
inputs_sy = inputs_ling[:, :, 0]
inputs_tone = inputs_ling[:, :, 1]
inputs_syllable_flag = inputs_ling[:, :, 2]
inputs_ws = inputs_ling[:, :, 3]
# Lookup table
sy_embedding = self.sy_emb(inputs_sy)
tone_embedding = self.tone_emb(inputs_tone)
syllable_flag_embedding = self.syllable_flag_emb(inputs_syllable_flag)
ws_embedding = self.ws_emb(inputs_ws)
ling_embedding = (
sy_embedding + tone_embedding + syllable_flag_embedding + ws_embedding
)
enc_output, enc_slf_attn_lst = self.ling_enc(
ling_embedding, masks, return_attns
)
if hasattr(self, "ling_proj"):
enc_output = self.ling_proj(enc_output)
return enc_output, enc_slf_attn_lst, ling_embedding
class TextEncoder(nn.Module):
def __init__(self, config):
super(TextEncoder, self).__init__()
self.text_encoder = TextFftEncoder(config)
self.se_enable = config.get("SE", False)
if not self.se_enable:
self.spk_tokenizer = nn.Embedding(config["speaker"], config["speaker_units"])
self.emo_tokenizer = nn.Embedding(config["emotion"], config["emotion_units"])
# self.variance_adaptor = VarianceAdaptor(config)
# self.mel_decoder = MelPNCADecoder(config)
# self.mel_postnet = PostNet(config)
self.MAS = False
if config.get("MAS", False):
self.MAS = True
self.align_attention = ConvAttention(
n_mel_channels=config["num_mels"],
n_text_channels=config["embedding_dim"],
n_att_channels=config["num_mels"],
)
self.fp_enable = config.get("FP", False)
if self.fp_enable:
self.FP_predictor = FP_Predictor(config)
def forward(self, inputs_ling, inputs_emotion, inputs_speaker, inputs_ling_masks=None, return_attns=False):
text_hid, enc_sla_attn_lst, ling_embedding = self.text_encoder(
inputs_ling, inputs_ling_masks, return_attns
)
emo_hid = self.emo_tokenizer(inputs_emotion)
spk_hid = inputs_speaker if self.se_enable else self.spk_tokenizer(inputs_speaker)
if return_attns:
return text_hid, enc_sla_attn_lst, ling_embedding, emo_hid, spk_hid
else:
return text_hid, ling_embedding, emo_hid, spk_hid
class VarianceAdaptor(nn.Module):
def __init__(self, config):
super(VarianceAdaptor, self).__init__()
input_dim = (
config["encoder_projection_units"]
+ config["emotion_units"]
+ config["speaker_units"]
)
filter_size = config["predictor_filter_size"]
fsmn_num_layers = config["predictor_fsmn_num_layers"]
num_memory_units = config["predictor_num_memory_units"]
ffn_inner_dim = config["predictor_ffn_inner_dim"]
dropout = config["predictor_dropout"]
shift = config["predictor_shift"]
lstm_units = config["predictor_lstm_units"]
dur_pred_prenet_units = config["dur_pred_prenet_units"]
dur_pred_lstm_units = config["dur_pred_lstm_units"]
self.pitch_predictor = VarFsmnRnnNARPredictor(
input_dim,
filter_size,
fsmn_num_layers,
num_memory_units,
ffn_inner_dim,
dropout,
shift,
lstm_units,
)
self.energy_predictor = VarFsmnRnnNARPredictor(
input_dim,
filter_size,
fsmn_num_layers,
num_memory_units,
ffn_inner_dim,
dropout,
shift,
lstm_units,
)
self.duration_predictor = VarRnnARPredictor(
input_dim, dur_pred_prenet_units, dur_pred_lstm_units
)
self.length_regulator = LengthRegulator(config["outputs_per_step"])
self.dur_position_encoder = DurSinusoidalPositionEncoder(
config["encoder_projection_units"], config["outputs_per_step"]
)
self.pitch_emb = nn.Conv1d(
1, config["encoder_projection_units"], kernel_size=9, padding=4
)
self.energy_emb = nn.Conv1d(
1, config["encoder_projection_units"], kernel_size=9, padding=4
)
def forward(
self,
inputs_text_embedding,
inputs_emo_embedding,
inputs_spk_embedding, # [1,20,192]
masks=None,
output_masks=None,
duration_targets=None,
pitch_targets=None,
energy_targets=None,
):
batch_size = inputs_text_embedding.size(0)
variance_predictor_inputs = torch.cat(
[inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding], dim=-1
)
pitch_predictions = self.pitch_predictor(variance_predictor_inputs, masks)
energy_predictions = self.energy_predictor(variance_predictor_inputs, masks)
if pitch_targets is not None:
pitch_embeddings = self.pitch_emb(pitch_targets.unsqueeze(1)).transpose(
1, 2
)
else:
pitch_embeddings = self.pitch_emb(pitch_predictions.unsqueeze(1)).transpose(
1, 2
)
if energy_targets is not None:
energy_embeddings = self.energy_emb(energy_targets.unsqueeze(1)).transpose(
1, 2
)
else:
energy_embeddings = self.energy_emb(energy_predictions.unsqueeze(1)).transpose(
1, 2)
inputs_text_embedding_aug = (
inputs_text_embedding + pitch_embeddings + energy_embeddings
)
duration_predictor_cond = torch.cat(
[inputs_text_embedding_aug, inputs_spk_embedding, inputs_emo_embedding],
dim=-1,
)
if duration_targets is not None:
duration_predictor_go_frame = torch.zeros(batch_size, 1).to(
inputs_text_embedding.device
)
duration_predictor_input = torch.cat(
[duration_predictor_go_frame, duration_targets[:, :-1].float()], dim=-1
)
duration_predictor_input = torch.log(duration_predictor_input + 1)
log_duration_predictions, _ = self.duration_predictor(
duration_predictor_input.unsqueeze(-1),
duration_predictor_cond,
masks=masks,
)
duration_predictions = torch.exp(log_duration_predictions) - 1
else:
log_duration_predictions = self.duration_predictor.infer(
duration_predictor_cond, masks=masks
)
duration_predictions = torch.exp(log_duration_predictions) - 1
if duration_targets is not None:
LR_text_outputs, LR_length_rounded = self.length_regulator(
inputs_text_embedding_aug, duration_targets, masks=output_masks
)
LR_position_embeddings = self.dur_position_encoder(
duration_targets, masks=output_masks
)
LR_emo_outputs, _ = self.length_regulator(
inputs_emo_embedding, duration_targets, masks=output_masks
)
LR_spk_outputs, _ = self.length_regulator(
inputs_spk_embedding, duration_targets, masks=output_masks
)
else:
LR_text_outputs, LR_length_rounded = self.length_regulator(
inputs_text_embedding_aug, duration_predictions, masks=output_masks
)
LR_position_embeddings = self.dur_position_encoder(
duration_predictions, masks=output_masks
)
LR_emo_outputs, _ = self.length_regulator(
inputs_emo_embedding, duration_predictions, masks=output_masks
)
LR_spk_outputs, _ = self.length_regulator(
inputs_spk_embedding, duration_predictions, masks=output_masks
)
LR_text_outputs = LR_text_outputs + LR_position_embeddings
return (
LR_text_outputs,
LR_emo_outputs,
LR_spk_outputs, # [1,153,192]
LR_length_rounded,
log_duration_predictions,
pitch_predictions,
energy_predictions,
)
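# Worked example of the duration path above (illustrative, not from the original
# file): with duration_targets [[2, 3, 1]], the go frame shifts the input to
# [[0, 2, 3]], torch.log(x + 1) maps it to [[0.000, 1.099, 1.386]], and predictions
# are mapped back with exp(x) - 1. During training the length regulator expands each
# phone by its target duration; at inference it uses the rounded predicted durations.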
class VarianceAdaptor2(nn.Module):
def __init__(self, config):
super(VarianceAdaptor2, self).__init__()
input_dim = (
config["encoder_projection_units"]
+ config["emotion_units"]
+ config["speaker_units"]
)
filter_size = config["predictor_filter_size"]
fsmn_num_layers = config["predictor_fsmn_num_layers"]
num_memory_units = config["predictor_num_memory_units"]
ffn_inner_dim = config["predictor_ffn_inner_dim"]
dropout = config["predictor_dropout"]
shift = config["predictor_shift"]
lstm_units = config["predictor_lstm_units"]
dur_pred_prenet_units = config["dur_pred_prenet_units"]
dur_pred_lstm_units = config["dur_pred_lstm_units"]
self.pitch_predictor = VarFsmnRnnNARPredictor(
input_dim,
filter_size,
fsmn_num_layers,
num_memory_units,
ffn_inner_dim,
dropout,
shift,
lstm_units,
)
self.energy_predictor = VarFsmnRnnNARPredictor(
input_dim,
filter_size,
fsmn_num_layers,
num_memory_units,
ffn_inner_dim,
dropout,
shift,
lstm_units,
)
self.duration_predictor = VarRnnARPredictor(
input_dim, dur_pred_prenet_units, dur_pred_lstm_units
)
self.length_regulator = LengthRegulator(config["outputs_per_step"])
self.dur_position_encoder = DurSinusoidalPositionEncoder(
config["encoder_projection_units"], config["outputs_per_step"]
)
self.pitch_emb = nn.Conv1d(
1, config["encoder_projection_units"], kernel_size=9, padding=4
)
self.energy_emb = nn.Conv1d(
1, config["encoder_projection_units"], kernel_size=9, padding=4
)
def forward(
self,
inputs_text_embedding,
inputs_emo_embedding,
inputs_spk_embedding, # [1,20,192]
scale=1.0,
masks=None,
output_masks=None,
duration_targets=None,
pitch_targets=None,
energy_targets=None,
):
batch_size = inputs_text_embedding.size(0)
variance_predictor_inputs = torch.cat(
[inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding], dim=-1
)
pitch_predictions = self.pitch_predictor(variance_predictor_inputs, masks)
energy_predictions = self.energy_predictor(variance_predictor_inputs, masks)
if pitch_targets is not None:
pitch_embeddings = self.pitch_emb(pitch_targets.unsqueeze(1)).transpose(
1, 2
)
else:
pitch_embeddings = self.pitch_emb(pitch_predictions.unsqueeze(1)).transpose(
1, 2
)
if energy_targets is not None:
energy_embeddings = self.energy_emb(energy_targets.unsqueeze(1)).transpose(
1, 2
)
else:
energy_embeddings = self.energy_emb(energy_predictions.unsqueeze(1)).transpose(
1, 2)
inputs_text_embedding_aug = (
inputs_text_embedding + pitch_embeddings + energy_embeddings
)
duration_predictor_cond = torch.cat(
[inputs_text_embedding_aug, inputs_spk_embedding, inputs_emo_embedding],
dim=-1,
)
if duration_targets is not None:
duration_predictor_go_frame = torch.zeros(batch_size, 1).to(
inputs_text_embedding.device
)
duration_predictor_input = torch.cat(
[duration_predictor_go_frame, duration_targets[:, :-1].float()], dim=-1
)
duration_predictor_input = torch.log(duration_predictor_input + 1)
log_duration_predictions, _ = self.duration_predictor(
duration_predictor_input.unsqueeze(-1),
duration_predictor_cond,
masks=masks,
)
duration_predictions = torch.exp(log_duration_predictions) - 1
else:
log_duration_predictions = self.duration_predictor.infer(
duration_predictor_cond, masks=masks
)
duration_predictions = torch.exp(log_duration_predictions) - 1
if duration_targets is not None:
LR_text_outputs, LR_length_rounded = self.length_regulator(
inputs_text_embedding_aug, duration_targets*scale, masks=output_masks # *scale
)
LR_position_embeddings = self.dur_position_encoder(
duration_targets, masks=output_masks
)
LR_emo_outputs, _ = self.length_regulator(
inputs_emo_embedding, duration_targets*scale, masks=output_masks # *scale
)
LR_spk_outputs, _ = self.length_regulator(
inputs_spk_embedding, duration_targets*scale, masks=output_masks # *scale
)
else:
LR_text_outputs, LR_length_rounded = self.length_regulator(
inputs_text_embedding_aug, duration_predictions*scale, masks=output_masks # *scale
)
LR_position_embeddings = self.dur_position_encoder(
duration_predictions*scale, masks=output_masks # *target_rate
)
LR_emo_outputs, _ = self.length_regulator(
inputs_emo_embedding, duration_predictions*scale, masks=output_masks # *scale
)
LR_spk_outputs, _ = self.length_regulator(
inputs_spk_embedding, duration_predictions*scale, masks=output_masks # *scale
)
LR_text_outputs = LR_text_outputs + LR_position_embeddings
return (
LR_text_outputs,
LR_emo_outputs,
LR_spk_outputs, # [1,153,192]
LR_length_rounded,
log_duration_predictions,
pitch_predictions,
energy_predictions,
)
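# VarianceAdaptor2 mirrors VarianceAdaptor but multiplies the durations fed to the
# length regulator by `scale`, which works as a simple speaking-rate control:
# scale > 1.0 stretches every phone (slower speech), scale < 1.0 compresses it.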
class MelPNCADecoder(nn.Module):
def __init__(self, config):
super(MelPNCADecoder, self).__init__()
prenet_units = config["decoder_prenet_units"]
nb_layers = config["decoder_num_layers"]
nb_heads = config["decoder_num_heads"]
d_model = config["decoder_num_units"]
d_head = d_model // nb_heads
d_inner = config["decoder_ffn_inner_dim"]
dropout = config["decoder_dropout"]
dropout_attn = config["decoder_attention_dropout"]
dropout_relu = config["decoder_relu_dropout"]
outputs_per_step = config["outputs_per_step"]
d_mem = (
config["encoder_projection_units"] * outputs_per_step
+ config["emotion_units"]
+ config["speaker_units"]
)
d_mel = config["num_mels"]
self.d_mel = d_mel
self.r = outputs_per_step
self.nb_layers = nb_layers
self.mel_dec = HybridAttentionDecoder(
d_mel,
prenet_units,
nb_layers,
d_model,
d_mem,
nb_heads,
d_head,
d_inner,
dropout,
dropout_attn,
dropout_relu,
d_mel * outputs_per_step,
)
def forward(
self,
memory,
x_band_width,
h_band_width,
target=None,
masks=None,
return_attns=False,
):
batch_size = memory.size(0)
go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device)
if target is not None:
self.mel_dec.reset_state()
input = target[:, self.r - 1 :: self.r, :]
input = torch.cat([go_frame, input], dim=1)[:, :-1, :]
dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list = self.mel_dec(
input,
memory,
x_band_width,
h_band_width,
masks=masks,
return_attns=return_attns,
)
else:
dec_output = []
dec_pnca_attn_x_list = [[] for _ in range(self.nb_layers)]
dec_pnca_attn_h_list = [[] for _ in range(self.nb_layers)]
self.mel_dec.reset_state()
input = go_frame
for step in range(memory.size(1)):
(
dec_output_step,
dec_pnca_attn_x_step,
dec_pnca_attn_h_step,
) = self.mel_dec.infer(
step,
input,
memory,
x_band_width,
h_band_width,
masks=masks,
return_attns=return_attns,
)
input = dec_output_step[:, :, -self.d_mel :]
dec_output.append(dec_output_step)
for layer_id, (pnca_x_attn, pnca_h_attn) in enumerate(
zip(dec_pnca_attn_x_step, dec_pnca_attn_h_step)
):
left = memory.size(1) - pnca_x_attn.size(-1)
if left > 0:
padding = torch.zeros((pnca_x_attn.size(0), 1, left)).to(
pnca_x_attn
)
pnca_x_attn = torch.cat([pnca_x_attn, padding], dim=-1)
dec_pnca_attn_x_list[layer_id].append(pnca_x_attn)
dec_pnca_attn_h_list[layer_id].append(pnca_h_attn)
dec_output = torch.cat(dec_output, dim=1)
if return_attns:
for layer_id in range(self.nb_layers):
dec_pnca_attn_x_list[layer_id] = torch.cat(
dec_pnca_attn_x_list[layer_id], dim=1
)
dec_pnca_attn_h_list[layer_id] = torch.cat(
dec_pnca_attn_h_list[layer_id], dim=1
)
if return_attns:
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
else:
return dec_output
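# Note on the decoder above (descriptive): it emits r = outputs_per_step mel frames
# per step (d_out = d_mel * r). In training, target[:, r - 1 :: r, :] conditions each
# step on the last frame of the previous group; at inference, the last d_mel values
# of each step's output are fed back as the next input.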
class PostNet(nn.Module):
def __init__(self, config):
super(PostNet, self).__init__()
self.filter_size = config["postnet_filter_size"]
self.fsmn_num_layers = config["postnet_fsmn_num_layers"]
self.num_memory_units = config["postnet_num_memory_units"]
self.ffn_inner_dim = config["postnet_ffn_inner_dim"]
self.dropout = config["postnet_dropout"]
self.shift = config["postnet_shift"]
self.lstm_units = config["postnet_lstm_units"]
self.num_mels = config["num_mels"]
self.fsmn = FsmnEncoderV2(
self.filter_size,
self.fsmn_num_layers,
self.num_mels,
self.num_memory_units,
self.ffn_inner_dim,
self.dropout,
self.shift,
)
self.lstm = nn.LSTM(
self.num_memory_units, self.lstm_units, num_layers=1, batch_first=True
)
self.fc = nn.Linear(self.lstm_units, self.num_mels)
def forward(self, x, mask=None):
postnet_fsmn_output = self.fsmn(x, mask)
# The input could also be a packed variable-length sequence; we omit that here
# for simplicity, since the mask and the uni-directional LSTM make it unnecessary.
postnet_lstm_output, _ = self.lstm(postnet_fsmn_output)
mel_residual_output = self.fc(postnet_lstm_output)
return mel_residual_output
class FP_Predictor(nn.Module):
def __init__(self, config):
super(FP_Predictor, self).__init__()
self.w_1 = nn.Conv1d(
config["encoder_projection_units"],
config["embedding_dim"] // 2,
kernel_size=3,
padding=1,
)
self.w_2 = nn.Conv1d(
config["embedding_dim"] // 2,
config["encoder_projection_units"],
kernel_size=1,
padding=0,
)
self.layer_norm1 = nn.LayerNorm(config["embedding_dim"] // 2, eps=1e-6)
self.layer_norm2 = nn.LayerNorm(config["encoder_projection_units"], eps=1e-6)
self.dropout_inner = nn.Dropout(0.1)
self.dropout = nn.Dropout(0.1)
self.fc = nn.Linear(config["encoder_projection_units"], 4)
def forward(self, x):
x = x.transpose(1, 2)
x = F.relu(self.w_1(x))
x = x.transpose(1, 2)
x = self.dropout_inner(self.layer_norm1(x))
x = x.transpose(1, 2)
x = F.relu(self.w_2(x))
x = x.transpose(1, 2)
x = self.dropout(self.layer_norm2(x))
output = F.softmax(self.fc(x), dim=2)
return output
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
class SinusoidalPositionEncoder(nn.Module):
def __init__(self, max_len, depth):
super(SinusoidalPositionEncoder, self).__init__()
self.max_len = max_len
self.depth = depth
self.position_enc = nn.Parameter(
self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0),
requires_grad=False,
)
def forward(self, input):
bz_in, len_in, _ = input.size()
if len_in > self.max_len:
self.max_len = len_in
self.position_enc.data = (
self.get_sinusoid_encoding_table(self.max_len, self.depth)
.unsqueeze(0)
.to(input.device)
)
output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1)
return output
@staticmethod
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
""" Sinusoid position encoding table """
def cal_angle(position, hid_idx):
return position / np.power(10000, hid_idx / float(d_hid / 2 - 1))
def get_posi_angle_vec(position):
return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)]
scaled_time_table = np.array(
[get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)]
)
sinusoid_table = np.zeros((n_position, d_hid))
sinusoid_table[:, : d_hid // 2] = np.sin(scaled_time_table)
sinusoid_table[:, d_hid // 2 :] = np.cos(scaled_time_table)
if padding_idx is not None:
# zero vector for padding dimension
sinusoid_table[padding_idx] = 0.0
return torch.FloatTensor(sinusoid_table)
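# Layout of the table above (sketch, inferred from the code): for 1-based position
# pos and hidden index j < d_hid // 2,
#   table[pos - 1, j]              = sin(pos / 10000 ** (j / (d_hid / 2 - 1)))
#   table[pos - 1, j + d_hid // 2] = cos(pos / 10000 ** (j / (d_hid / 2 - 1)))
# i.e. the sin and cos halves are concatenated rather than interleaved as in the
# original Transformer encoding.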
class DurSinusoidalPositionEncoder(nn.Module):
def __init__(self, depth, outputs_per_step):
super(DurSinusoidalPositionEncoder, self).__init__()
self.depth = depth
self.outputs_per_step = outputs_per_step
inv_timescales = [
np.power(10000, 2 * (hid_idx // 2) / depth) for hid_idx in range(depth)
]
self.inv_timescales = nn.Parameter(
torch.FloatTensor(inv_timescales), requires_grad=False
)
def forward(self, durations, masks=None):
reps = (durations + 0.5).long()
output_lens = reps.sum(dim=1)
max_len = output_lens.max()
reps_cumsum = torch.cumsum(F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[
:, None, :
]
range_ = torch.arange(max_len).to(durations.device)[None, :, None]
mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_)
mult = mult.float()
offsets = torch.matmul(mult, reps_cumsum[:, 0, :-1].unsqueeze(-1)).squeeze(-1)
dur_pos = range_[:, :, 0] - offsets + 1
if masks is not None:
assert masks.size(1) == dur_pos.size(1)
dur_pos = dur_pos.masked_fill(masks, 0.0)
seq_len = dur_pos.size(1)
padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step
if padding < self.outputs_per_step:
dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0)
position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, None, :]
position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, 0::2])
position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, 1::2])
return position_embedding
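# Worked example for dur_pos above (illustrative): durations [[2, 3]] give an output
# length of 5 and dur_pos [[1, 2, 1, 2, 3]], a 1-based frame counter that restarts at
# every phone boundary before being turned into sinusoidal position embeddings.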
import torch
from distutils.version import LooseVersion
is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7")
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def get_mask_from_lengths(lengths, max_len=None):
batch_size = lengths.shape[0]
if max_len is None:
max_len = torch.max(lengths).item()
ids = (
torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device)
)
mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
return mask
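# Usage sketch (illustrative, not part of the original module): the mask is True at
# padded positions, e.g.
#   get_mask_from_lengths(torch.tensor([2, 3]), max_len=4)
#   -> [[False, False, True,  True],
#       [False, False, False, True]]
# which is the convention the encoder/decoder masks above rely on.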
import os
import numpy as np
from glob import glob
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import argparse
import yaml
import logging
from .core.utils import (
volume_normalize,
get_pitch,
get_energy,
align_length,
compute_mean,
compute_std,
f0_norm_mean_std,
norm_mean_std,
parse_interval_file,
average_by_duration,
encode_16bits,
)
from .core.dsp import (
melspectrogram,
load_wav,
trim_silence,
trim_silence_with_interval,
save_wav,
)
logging.basicConfig(
format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.DEBUG,
)
default_audio_config = {
# Preprocess
"wav_normalize": True,
"trim_silence": True,
"trim_silence_threshold_db": 60,
"preemphasize": False,
# Feature extraction
"sampling_rate": 24000,
"hop_length": 240,
"win_length": 1024,
"n_mels": 80,
"n_fft": 1024,
"fmin": 50.0,
"fmax": 7600.0,
"min_level_db": -100,
"ref_level_db": 20,
"phone_level_feature": True,
"num_workers": 16,
# Normalization
"norm_type": "mean_std", # 'mean_std', 'global norm'
"max_norm": 1.0,
"symmetric": False,
}
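# With these defaults, hop_length 240 at 24 kHz is a 10 ms frame shift and
# win_length / n_fft of 1024 samples is a ~42.7 ms analysis window.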
class AudioProcessor:
def __init__(self, config=None):
# TODO: Add more audio processing methods.
if not isinstance(config, dict):
logging.warning(
"[AudioProcessor] config is not a dict, fall into default config."
)
self.config = default_audio_config
else:
self.config = config
for key in self.config:
setattr(self, key, self.config[key])
self.min_wav_length = int(self.config["sampling_rate"] * 0.5)
self.badcase_list = []
self.pcm_dict = {}
self.mel_dict = {}
self.f0_dict = {}
self.uv_dict = {}
self.nccf_dict = {}
self.f0uv_dict = {}
self.energy_dict = {}
self.dur_dict = {}
logging.info("[AudioProcessor] Initialize AudioProcessor.")
logging.info("[AudioProcessor] config params:")
for key in self.config:
logging.info("[AudioProcessor] %s: %s", key, self.config[key])
def calibrate_SyllableDuration(
self, raw_dur_dir, raw_metafile, out_cali_duration_dir
):
with open(raw_metafile, "r") as f:
lines = f.readlines()
output_dur_dir = out_cali_duration_dir
os.makedirs(output_dur_dir, exist_ok=True)
for line in lines:
line = line.strip()
index, symbols = line.split("\t")
symbols = [
symbol.strip("{").strip("}").split("$")[0]
for symbol in symbols.strip().split(" ")
]
dur_file = os.path.join(raw_dur_dir, index + ".npy")
phone_file = os.path.join(raw_dur_dir, index + ".phone")
if not os.path.exists(dur_file) or not os.path.exists(phone_file):
logging.warning(
"[AudioProcessor] dur file or phone file not exists: %s", index
)
continue
with open(phone_file, "r") as f:
phones = f.readlines()
dur = np.load(dur_file)
cali_duration = []
dur_idx = 0
syll_idx = 0
while dur_idx < len(dur) and syll_idx < len(symbols):
if phones[dur_idx].strip() == "sil":
dur_idx += 1
continue
if phones[dur_idx].strip() == "sp" and symbols[syll_idx][0] != "#":
dur_idx += 1
continue
if symbols[syll_idx] in ["ga", "go", "ge"]:
cali_duration.append(0)
syll_idx += 1
# print("NONE", symbols[syll_idx], 0)
continue
if symbols[syll_idx][0] == "#":
if phones[dur_idx].strip() != "sp":
cali_duration.append(0)
# print("NONE", symbols[syll_idx], 0)
syll_idx += 1
continue
else:
cali_duration.append(dur[dur_idx])
# print(phones[dur_idx].strip(), symbols[syll_idx], dur[dur_idx])
dur_idx += 1
syll_idx += 1
continue
# A corresponding phone is found
cali_duration.append(dur[dur_idx])
# print(phones[dur_idx].strip(), symbols[syll_idx], dur[dur_idx])
dur_idx += 1
syll_idx += 1
# Add #4 phone duration
cali_duration.append(0)
if len(cali_duration) != len(symbols):
logging.error(
"[Duration Calibrating] Syllable duration {} is not equal to "
"the number of symbols {}, index: {}".format(
len(cali_duration), len(symbols), index
)
)
continue
# Align with mel frames
durs = np.array(cali_duration)
if len(self.mel_dict) > 0:
pair_mel = self.mel_dict.get(index, None)
if pair_mel is None:
logging.warning(
"[AudioProcessor] Interval file %s has no corresponding mel",
index,
)
continue
mel_frames = pair_mel.shape[0]
dur_frames = np.sum(durs)
if np.sum(durs) > mel_frames:
durs[-2] -= dur_frames - mel_frames
elif np.sum(durs) < mel_frames:
durs[-2] += mel_frames - np.sum(durs)
if durs[-2] < 0:
logging.error(
"[AudioProcessor] Duration calibrating failed for %s, mismatch frames %s",
index,
durs[-2],
)
self.badcase_list.append(index)
continue
self.dur_dict[index] = durs
np.save(os.path.join(output_dur_dir, index + ".npy"), self.dur_dict[index])
def amp_normalize(self, src_wav_dir, out_wav_dir):
if self.wav_normalize:
logging.info("[AudioProcessor] Amplitude normalization started")
os.makedirs(out_wav_dir, exist_ok=True)
res = volume_normalize(src_wav_dir, out_wav_dir)
logging.info("[AudioProcessor] Amplitude normalization finished")
return res
else:
logging.info("[AudioProcessor] No amplitude normalization")
os.symlink(src_wav_dir, out_wav_dir, target_is_directory=True)
return True
def get_pcm_dict(self, src_wav_dir):
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
if len(self.pcm_dict) > 0:
return self.pcm_dict
logging.info("[AudioProcessor] Start to load pcm from %s", src_wav_dir)
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_path in wav_list:
future = executor.submit(load_wav, wav_path, self.sampling_rate)
future.add_done_callback(lambda p: progress.update())
wav_name = os.path.splitext(os.path.basename(wav_path))[0]
futures.append((future, wav_name))
for future, wav_name in futures:
pcm = future.result()
if len(pcm) < self.min_wav_length:
logging.warning("[AudioProcessor] %s is too short, skip", wav_name)
self.badcase_list.append(wav_name)
continue
self.pcm_dict[wav_name] = pcm
return self.pcm_dict
def trim_silence_wav(self, src_wav_dir, out_wav_dir=None):
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
logging.info("[AudioProcessor] Trim silence started")
if out_wav_dir is None:
out_wav_dir = src_wav_dir
else:
os.makedirs(out_wav_dir, exist_ok=True)
pcm_dict = self.get_pcm_dict(src_wav_dir)
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
trim_silence,
pcm_data,
self.trim_silence_threshold_db,
self.hop_length,
self.win_length,
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
# TODO: multi-processing
for future, wav_basename in tqdm(futures):
pcm = future.result()
if len(pcm) < self.min_wav_length:
logging.warning("[AudioProcessor] %s is too short, skip", wav_basename)
self.badcase_list.append(wav_basename)
self.pcm_dict.pop(wav_basename)
continue
self.pcm_dict[wav_basename] = pcm
save_wav(
self.pcm_dict[wav_basename],
os.path.join(out_wav_dir, wav_basename + ".wav"),
self.sampling_rate,
)
logging.info("[AudioProcessor] Trim silence finished")
return True
def trim_silence_wav_with_interval(self, src_wav_dir, dur_dir, out_wav_dir=None):
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
logging.info("[AudioProcessor] Trim silence with interval started")
if out_wav_dir is None:
out_wav_dir = src_wav_dir
else:
os.makedirs(out_wav_dir, exist_ok=True)
pcm_dict = self.get_pcm_dict(src_wav_dir)
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
trim_silence_with_interval,
pcm_data,
self.dur_dict.get(wav_basename, None),
self.hop_length,
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
# TODO: multi-processing
for future, wav_basename in tqdm(futures):
trimed_pcm = future.result()
if trimed_pcm is None:
continue
if len(trimed_pcm) < self.min_wav_length:
logging.warning("[AudioProcessor] %s is too short, skip", wav_basename)
self.badcase_list.append(wav_basename)
self.pcm_dict.pop(wav_basename)
continue
self.pcm_dict[wav_basename] = trimed_pcm
save_wav(
self.pcm_dict[wav_basename],
os.path.join(out_wav_dir, wav_basename + ".wav"),
self.sampling_rate,
)
logging.info("[AudioProcessor] Trim silence finished")
return True
def mel_extract(self, src_wav_dir, out_feature_dir):
os.makedirs(out_feature_dir, exist_ok=True)
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
pcm_dict = self.get_pcm_dict(src_wav_dir)
logging.info("[AudioProcessor] Melspec extraction started")
# Get global normed mel spec
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
melspectrogram,
pcm_data,
self.sampling_rate,
self.n_fft,
self.hop_length,
self.win_length,
self.n_mels,
self.max_norm,
self.min_level_db,
self.ref_level_db,
self.fmin,
self.fmax,
self.symmetric,
self.preemphasize,
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Melspec extraction failed for %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
else:
melspec = result
self.mel_dict[wav_basename] = melspec
logging.info("[AudioProcessor] Melspec extraction finished")
# FIXME: is this step necessary?
# Do mean std norm on global-normed melspec
logging.info("Melspec statistic proceeding...")
mel_mean = compute_mean(list(self.mel_dict.values()), dims=self.n_mels)
mel_std = compute_std(list(self.mel_dict.values()), mel_mean, dims=self.n_mels)
logging.info("Melspec statistic done")
np.savetxt(os.path.join(out_feature_dir, "mel_mean.txt"), mel_mean, fmt="%.6f")
np.savetxt(os.path.join(out_feature_dir, "mel_std.txt"), mel_std, fmt="%.6f")
logging.info(
"[AudioProcessor] melspec mean and std saved to:\n{},\n{}".format(
os.path.join(out_feature_dir, "mel_mean.txt"),
os.path.join(out_feature_dir, "mel_std.txt"),
)
)
logging.info("[AudioProcessor] Melspec mean std norm is proceeding...")
for wav_basename in self.mel_dict:
melspec = self.mel_dict[wav_basename]
norm_melspec = norm_mean_std(melspec, mel_mean, mel_std)
np.save(os.path.join(out_feature_dir, wav_basename + ".npy"), norm_melspec)
logging.info("[AudioProcessor] Melspec normalization finished")
logging.info("[AudioProcessor] Normed Melspec saved to %s", out_feature_dir)
return True
# TODO: some datasets may have no interval label
def duration_generate(self, src_interval_dir, out_feature_dir):
os.makedirs(out_feature_dir, exist_ok=True)
interval_list = glob(os.path.join(src_interval_dir, "*.interval"))
logging.info("[AudioProcessor] Duration generation started")
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(interval_list)
) as progress:
futures = []
for interval_file_path in interval_list:
future = executor.submit(
parse_interval_file,
interval_file_path,
self.sampling_rate,
self.hop_length,
)
future.add_done_callback(lambda p: progress.update())
futures.append(
(future, os.path.splitext(os.path.basename(interval_file_path))[0])
)
logging.info("[AudioProcessor] Duration align with mel is proceeding...")
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Duration generate failed for %s", wav_basename
)
self.badcase_list.append(wav_basename)
else:
durs, phone_list = result
# Align length with melspec
if len(self.mel_dict) > 0:
pair_mel = self.mel_dict.get(wav_basename, None)
if pair_mel is None:
logging.warning(
"[AudioProcessor] Interval file %s has no corresponding mel",
wav_basename,
)
continue
mel_frames = pair_mel.shape[0]
dur_frames = np.sum(durs)
if np.sum(durs) > mel_frames:
durs[-1] -= dur_frames - mel_frames
elif np.sum(durs) < mel_frames:
durs[-1] += mel_frames - np.sum(durs)
if durs[-1] < 0:
logging.error(
"[AudioProcessor] Duration align failed for %s, mismatch frames %s",
wav_basename,
durs[-1],
)
self.badcase_list.append(wav_basename)
continue
self.dur_dict[wav_basename] = durs
np.save(os.path.join(out_feature_dir, wav_basename + ".npy"), durs)
with open(
os.path.join(out_feature_dir, wav_basename + ".phone"), "w"
) as f:
f.write("\n".join(phone_list))
logging.info("[AudioProcessor] Duration generate finished")
return True
def pitch_extract(
self, src_wav_dir, out_f0_dir, out_frame_f0_dir, out_frame_uv_dir
):
os.makedirs(out_f0_dir, exist_ok=True)
os.makedirs(out_frame_f0_dir, exist_ok=True)
os.makedirs(out_frame_uv_dir, exist_ok=True)
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
pcm_dict = self.get_pcm_dict(src_wav_dir)
mel_dict = self.mel_dict
logging.info("[AudioProcessor] Pitch extraction started")
# Get raw pitch
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
get_pitch,
encode_16bits(pcm_data),
self.sampling_rate,
self.hop_length,
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
logging.info("[AudioProcessor] Pitch align with mel is proceeding...")
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Pitch extraction failed for %s", wav_basename
)
self.badcase_list.append(wav_basename)
else:
f0, uv, f0uv = result
if len(mel_dict) > 0:
f0 = align_length(f0, mel_dict.get(wav_basename, None))
uv = align_length(uv, mel_dict.get(wav_basename, None))
f0uv = align_length(f0uv, mel_dict.get(wav_basename, None))
if f0 is None or uv is None or f0uv is None:
logging.warning(
"[AudioProcessor] Pitch length mismatch with mel in %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
continue
self.f0_dict[wav_basename] = f0
self.uv_dict[wav_basename] = uv
self.f0uv_dict[wav_basename] = f0uv
# Normalize f0
logging.info("[AudioProcessor] Pitch normalization is proceeding...")
f0_mean = compute_mean(list(self.f0uv_dict.values()), dims=1)
f0_std = compute_std(list(self.f0uv_dict.values()), f0_mean, dims=1)
np.savetxt(os.path.join(out_f0_dir, "f0_mean.txt"), f0_mean, fmt="%.6f")
np.savetxt(os.path.join(out_f0_dir, "f0_std.txt"), f0_std, fmt="%.6f")
logging.info(
"[AudioProcessor] f0 mean and std saved to:\n{},\n{}".format(
os.path.join(out_f0_dir, "f0_mean.txt"),
os.path.join(out_f0_dir, "f0_std.txt"),
)
)
logging.info("[AudioProcessor] Pitch mean std norm is proceeding...")
for wav_basename in self.f0uv_dict:
f0 = self.f0uv_dict[wav_basename]
norm_f0 = f0_norm_mean_std(f0, f0_mean, f0_std)
self.f0uv_dict[wav_basename] = norm_f0
for wav_basename in self.f0_dict:
f0 = self.f0_dict[wav_basename]
norm_f0 = f0_norm_mean_std(f0, f0_mean, f0_std)
self.f0_dict[wav_basename] = norm_f0
# save frame f0 to a specific dir
for wav_basename in self.f0_dict:
np.save(
os.path.join(out_frame_f0_dir, wav_basename + ".npy"),
self.f0_dict[wav_basename].reshape(-1),
)
for wav_basename in self.uv_dict:
np.save(
os.path.join(out_frame_uv_dir, wav_basename + ".npy"),
self.uv_dict[wav_basename].reshape(-1),
)
# phone level average
# if there is no duration then save the frame-level f0
if self.phone_level_feature and len(self.dur_dict) > 0:
logging.info("[AudioProcessor] Pitch turn to phone-level is proceeding...")
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(self.f0uv_dict)
) as progress:
futures = []
for wav_basename in self.f0uv_dict:
future = executor.submit(
average_by_duration,
self.f0uv_dict.get(wav_basename, None),
self.dur_dict.get(wav_basename, None),
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Pitch extraction failed in phone level avg for: %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
else:
avg_f0 = result
self.f0uv_dict[wav_basename] = avg_f0
for wav_basename in self.f0uv_dict:
np.save(
os.path.join(out_f0_dir, wav_basename + ".npy"),
self.f0uv_dict[wav_basename].reshape(-1),
)
logging.info("[AudioProcessor] Pitch normalization finished")
logging.info("[AudioProcessor] Normed f0 saved to %s", out_f0_dir)
logging.info("[AudioProcessor] Pitch extraction finished")
return True
def energy_extract(self, src_wav_dir, out_energy_dir, out_frame_energy_dir):
os.makedirs(out_energy_dir, exist_ok=True)
os.makedirs(out_frame_energy_dir, exist_ok=True)
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
pcm_dict = self.get_pcm_dict(src_wav_dir)
mel_dict = self.mel_dict
logging.info("[AudioProcessor] Energy extraction started")
# Get raw energy
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
get_energy, pcm_data, self.hop_length, self.win_length, self.n_fft
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Energy extraction failed for %s", wav_basename
)
self.badcase_list.append(wav_basename)
else:
energy = result
if len(mel_dict) > 0:
energy = align_length(energy, mel_dict.get(wav_basename, None))
if energy is None:
logging.warning(
"[AudioProcessor] Energy length mismatch with mel in %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
continue
self.energy_dict[wav_basename] = energy
# Normalize energy
energy_mean = compute_mean(list(self.energy_dict.values()), dims=1)
energy_std = compute_std(list(self.energy_dict.values()), energy_mean, dims=1)
np.savetxt(
os.path.join(out_energy_dir, "energy_mean.txt"), energy_mean, fmt="%.6f"
)
np.savetxt(
os.path.join(out_energy_dir, "energy_std.txt"), energy_std, fmt="%.6f"
)
logging.info(
"[AudioProcessor] energy mean and std saved to:\n{},\n{}".format(
os.path.join(out_energy_dir, "energy_mean.txt"),
os.path.join(out_energy_dir, "energy_std.txt"),
)
)
logging.info("[AudioProcessor] Energy mean std norm is proceeding...")
for wav_basename in self.energy_dict:
energy = self.energy_dict[wav_basename]
norm_energy = f0_norm_mean_std(energy, energy_mean, energy_std)
self.energy_dict[wav_basename] = norm_energy
# save frame energy to a specific dir
for wav_basename in self.energy_dict:
np.save(
os.path.join(out_frame_energy_dir, wav_basename + ".npy"),
self.energy_dict[wav_basename].reshape(-1),
)
# phone level average
# if there is no duration then save the frame-level energy
if self.phone_level_feature and len(self.dur_dict) > 0:
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(self.energy_dict)
) as progress:
futures = []
for wav_basename in self.energy_dict:
future = executor.submit(
average_by_duration,
self.energy_dict.get(wav_basename, None),
self.dur_dict.get(wav_basename, None),
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
# TODO: remove failed entries from the dict
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Energy extraction failed in phone level avg for: %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
else:
avg_energy = result
self.energy_dict[wav_basename] = avg_energy
for wav_basename in self.energy_dict:
np.save(
os.path.join(out_energy_dir, wav_basename + ".npy"),
self.energy_dict[wav_basename].reshape(-1),
)
logging.info("[AudioProcessor] Energy normalization finished")
logging.info("[AudioProcessor] Normed Energy saved to %s", out_energy_dir)
logging.info("[AudioProcessor] Energy extraction finished")
return True
def process(self, src_voice_dir, out_data_dir, aux_metafile=None):
succeed = True
raw_wav_dir = os.path.join(src_voice_dir, "wav")
src_interval_dir = os.path.join(src_voice_dir, "interval")
out_mel_dir = os.path.join(out_data_dir, "mel")
out_f0_dir = os.path.join(out_data_dir, "f0")
out_frame_f0_dir = os.path.join(out_data_dir, "frame_f0")
out_frame_uv_dir = os.path.join(out_data_dir, "frame_uv")
out_energy_dir = os.path.join(out_data_dir, "energy")
out_frame_energy_dir = os.path.join(out_data_dir, "frame_energy")
out_duration_dir = os.path.join(out_data_dir, "raw_duration")
out_cali_duration_dir = os.path.join(out_data_dir, "duration")
os.makedirs(out_data_dir, exist_ok=True)
with_duration = os.path.exists(src_interval_dir)
# TODO: to resume from previous process, a log file is needed
train_wav_dir = os.path.join(out_data_dir, "wav")
succeed = self.amp_normalize(raw_wav_dir, train_wav_dir)
if not succeed:
logging.error("[AudioProcessor] amp_normalize failed, exit")
return False
if with_duration:
# Raw duration, non-trimmed
succeed = self.duration_generate(src_interval_dir, out_duration_dir)
if not succeed:
logging.error("[AudioProcessor] duration_generate failed, exit")
return False
if self.trim_silence:
if with_duration:
succeed = self.trim_silence_wav_with_interval(
train_wav_dir, out_duration_dir
)
if not succeed:
logging.error(
"[AudioProcessor] trim_silence_wav_with_interval failed, exit"
)
return False
else:
succeed = self.trim_silence_wav(train_wav_dir)
if not succeed:
logging.error("[AudioProcessor] trim_silence_wav failed, exit")
return False
succeed = self.mel_extract(train_wav_dir, out_mel_dir)
if not succeed:
logging.error("[AudioProcessor] mel_extract failed, exit")
return False
if aux_metafile is not None and with_duration:
self.calibrate_SyllableDuration(
out_duration_dir, aux_metafile, out_cali_duration_dir
)
succeed = self.pitch_extract(
train_wav_dir, out_f0_dir, out_frame_f0_dir, out_frame_uv_dir
)
if not succeed:
logging.error("[AudioProcessor] pitch_extract failed, exit")
return False
succeed = self.energy_extract(
train_wav_dir, out_energy_dir, out_frame_energy_dir
)
if not succeed:
logging.error("[AudioProcessor] energy_extract failed, exit")
return False
# recording badcase list
with open(os.path.join(out_data_dir, "badlist.txt"), "w") as f:
f.write("\n".join(self.badcase_list))
logging.info("[AudioProcessor] All features extracted successfully!")
return succeed
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Audio Processor")
parser.add_argument("--src_voice_dir", type=str, required=True)
parser.add_argument("--out_data_dir", type=str, required=True)
parser.add_argument("--config", type=str, default=None)
args = parser.parse_args()
if args.config is not None:
with open(args.config, "r") as f:
config = yaml.load(f, Loader=yaml.Loader)
ap = AudioProcessor(config["audio_config"])
else:
# Fall back to the default audio config when --config is not given
# (assumption: this matches the config=None branch in AudioProcessor.__init__).
ap = AudioProcessor()
ap.process(args.src_voice_dir, args.out_data_dir)
import numpy as np
import librosa
import librosa.filters
from scipy.io import wavfile
from scipy import signal
def _stft(y, hop_length, win_length, n_fft):
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
def _istft(y, hop_length, win_length):
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
def _db_to_amp(x):
return np.power(10.0, x * 0.05)
def _amp_to_db(x):
return 20 * np.log10(np.maximum(1e-5, x))
def load_wav(path, sr):
return librosa.load(path, sr=sr)[0]
def save_wav(wav, path, sr):
if wav.dtype == np.float32 or wav.dtype == np.float64:
quant_wav = 32767 * wav
else:
quant_wav = wav
# maximize the volume to avoid clipping
# wav *= 32767 / max(0.01, np.max(np.abs(wav)))
wavfile.write(path, sr, quant_wav.astype(np.int16))
def trim_silence(wav, top_db, hop_length, win_length):
trimed_wav, _ = librosa.effects.trim(
wav, top_db=top_db, frame_length=win_length, hop_length=hop_length
)
return trimed_wav
def trim_silence_with_interval(wav, interval, hop_length):
if interval is None:
return None
leading_sil = interval[0]
tailing_sil = interval[-1]
trim_wav = wav[leading_sil * hop_length : -tailing_sil * hop_length]
return trim_wav
def preemphasis(wav, k=0.98, preemphasize=False):
if preemphasize:
return signal.lfilter([1, -k], [1], wav)
return wav
def inv_preemphasis(wav, k=0.98, inv_preemphasize=False):
if inv_preemphasize:
return signal.lfilter([1], [1, -k], wav)
return wav
def _normalize(S, max_norm=1.0, min_level_db=-100, symmetric=False):
if symmetric:
return np.clip(
(2 * max_norm) * ((S - min_level_db) / (-min_level_db)) - max_norm,
-max_norm,
max_norm,
)
else:
return np.clip(max_norm * ((S - min_level_db) / (-min_level_db)), 0, max_norm)
def _denormalize(D, max_norm=1.0, min_level_db=-100, symmetric=False):
if symmetric:
return (
(np.clip(D, -max_norm, max_norm) + max_norm)
* -min_level_db
/ (2 * max_norm)
) + min_level_db
else:
return (np.clip(D, 0, max_norm) * -min_level_db / max_norm) + min_level_db
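# Worked example (illustrative): with max_norm=1.0, min_level_db=-100 and
# symmetric=False, a -50 dB level gives _normalize(-50) = 0.5 and
# _denormalize(0.5) = -50, so the two functions are inverses on [min_level_db, 0].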
def _griffin_lim(S, n_fft, hop_length, win_length, griffin_lim_iters=60):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex128)  # np.complex alias was removed in recent NumPy
y = _istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
for i in range(griffin_lim_iters):
angles = np.exp(
1j
* np.angle(
_stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
)
)
y = _istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
return y
def spectrogram(
y,
n_fft=1024,
hop_length=256,
win_length=1024,
max_norm=1.0,
min_level_db=-100,
ref_level_db=20,
symmetric=False,
):
D = _stft(preemphasis(y), hop_length, win_length, n_fft)
S = _amp_to_db(np.abs(D)) - ref_level_db
return _normalize(S, max_norm, min_level_db, symmetric)
def inv_spectrogram(
spectrogram,
n_fft=1024,
hop_length=256,
win_length=1024,
max_norm=1.0,
min_level_db=-100,
ref_level_db=20,
symmetric=False,
power=1.5,
):
S = _db_to_amp(
_denormalize(spectrogram, max_norm, min_level_db, symmetric) + ref_level_db
)
return _griffin_lim(S ** power, n_fft, hop_length, win_length)
def _build_mel_basis(sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80):
assert fmax <= sample_rate // 2
return librosa.filters.mel(
sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
)
# mel <-> linear conversions
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis(sample_rate, n_fft, fmin, fmax, n_mels)
return np.dot(_mel_basis, spectogram)
def _mel_to_linear(
mel_spectrogram, sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80
):
global _inv_mel_basis
if _inv_mel_basis is None:
_inv_mel_basis = np.linalg.pinv(
_build_mel_basis(sample_rate, n_fft, fmin, fmax, n_mels)
)
return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def melspectrogram(
y,
sample_rate,
n_fft=1024,
hop_length=256,
win_length=1024,
n_mels=80,
max_norm=1.0,
min_level_db=-100,
ref_level_db=20,
fmin=50,
fmax=8000,
symmetric=False,
preemphasize=False,
):
D = _stft(
preemphasis(y, preemphasize=preemphasize),
hop_length=hop_length,
win_length=win_length,
n_fft=n_fft,
)
S = (
_amp_to_db(
_linear_to_mel(
np.abs(D),
sample_rate=sample_rate,
n_fft=n_fft,
fmin=fmin,
fmax=fmax,
n_mels=n_mels,
)
)
- ref_level_db
)
return _normalize(
S, max_norm=max_norm, min_level_db=min_level_db, symmetric=symmetric
).T
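# Sketch of the melspectrogram pipeline above: optional pre-emphasis -> STFT ->
# mel filterbank (_linear_to_mel) -> amplitude-to-dB -> subtract ref_level_db ->
# _normalize into [0, max_norm] (or [-max_norm, max_norm] if symmetric) ->
# transpose to shape (frames, n_mels).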
def inv_mel_spectrogram(
mel_spectrogram,
sample_rate,
n_fft=1024,
hop_length=256,
win_length=1024,
n_mels=80,
max_norm=1.0,
min_level_db=-100,
ref_level_db=20,
fmin=50,
fmax=8000,
power=1.5,
symmetric=False,
preemphasize=False,
):
D = _denormalize(
mel_spectrogram,
max_norm=max_norm,
min_level_db=min_level_db,
symmetric=symmetric,
)
S = _mel_to_linear(
_db_to_amp(D + ref_level_db),
sample_rate=sample_rate,
n_fft=n_fft,
fmin=fmin,
fmax=fmax,
n_mels=n_mels,
)
return inv_preemphasis(
_griffin_lim(S ** power, n_fft, hop_length, win_length),
preemphasize=preemphasize,
)
import os
from glob import glob
import numpy as np
import sox
import librosa
import pysptk
from scipy.io import wavfile
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import logging
from .dsp import _stft
anchor_hist = np.array(
[
0.0,
0.00215827,
0.00354383,
0.00442313,
0.00490274,
0.00532907,
0.00602185,
0.00690115,
0.00810019,
0.00948574,
0.0120437,
0.01489475,
0.01873168,
0.02302158,
0.02872369,
0.03669065,
0.04636291,
0.05843325,
0.07700506,
0.11052491,
0.16802558,
0.25997868,
0.37942979,
0.50730083,
0.62006395,
0.71092459,
0.76877165,
0.80762057,
0.83458566,
0.85672795,
0.87660538,
0.89251266,
0.90578204,
0.91569411,
0.92541966,
0.93383959,
0.94162004,
0.94940048,
0.95539568,
0.96136424,
0.9670397,
0.97290168,
0.97705835,
0.98116174,
0.98465228,
0.98814282,
0.99152678,
0.99421796,
0.9965894,
0.99840128,
1.0,
]
)
anchor_bins = np.array(
[
0.033976,
0.03529014,
0.03660428,
0.03791842,
0.03923256,
0.0405467,
0.04186084,
0.04317498,
0.04448912,
0.04580326,
0.0471174,
0.04843154,
0.04974568,
0.05105982,
0.05237396,
0.0536881,
0.05500224,
0.05631638,
0.05763052,
0.05894466,
0.0602588,
0.06157294,
0.06288708,
0.06420122,
0.06551536,
0.0668295,
0.06814364,
0.06945778,
0.07077192,
0.07208606,
0.0734002,
0.07471434,
0.07602848,
0.07734262,
0.07865676,
0.0799709,
0.08128504,
0.08259918,
0.08391332,
0.08522746,
0.0865416,
0.08785574,
0.08916988,
0.09048402,
0.09179816,
0.0931123,
0.09442644,
0.09574058,
0.09705472,
0.09836886,
0.099683,
]
)
hist_bins = 50
def amp_info(wav_file_path):
"""
Returns the amplitude info of the wav file.
"""
stats = sox.file_info.stat(wav_file_path)
amp_rms = stats["RMS amplitude"]
amp_max = stats["Maximum amplitude"]
amp_mean = stats["Mean amplitude"]
length = stats["Length (seconds)"]
return {
"amp_rms": amp_rms,
"amp_max": amp_max,
"amp_mean": amp_mean,
"length": length,
"basename": os.path.basename(wav_file_path),
}
# TODO: multi-processing
def statistic_amplitude(src_wav_dir):
"""
Returns the amplitude info of the wav file.
"""
wav_lst = glob(os.path.join(src_wav_dir, "*.wav"))
with ProcessPoolExecutor(max_workers=8) as executor, tqdm(
total=len(wav_lst)
) as progress:
futures = []
for wav_file_path in wav_lst:
future = executor.submit(amp_info, wav_file_path)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
amp_info_lst = [future.result() for future in futures]
amp_info_lst = sorted(amp_info_lst, key=lambda x: x["amp_rms"])
logging.info(
"Average amplitude RMS : {}".format(
np.mean([x["amp_rms"] for x in amp_info_lst])
)
)
# cnt = len(amp_info_lst)
#
# pinhead_cnt = math.floor(cnt * 0.01)
#
# return amp_info_lst[pinhead_cnt : cnt - pinhead_cnt]
return amp_info_lst
# TODO: multi process
def volume_normalize(src_wav_dir, out_wav_dir):
logging.info("Volume statistic proceeding...")
amp_info_lst = statistic_amplitude(src_wav_dir)
logging.info("Volume statistic done.")
rms_amp_lst = [x["amp_rms"] for x in amp_info_lst]
src_hist, src_bins = np.histogram(rms_amp_lst, bins=hist_bins, density=True)
src_hist = src_hist / np.sum(src_hist)
src_hist = np.cumsum(src_hist)
src_hist = np.insert(src_hist, 0, 0.0)
logging.info("Volume normalization proceeding...")
for amp_info in tqdm(amp_info_lst):
rms_amp = amp_info["amp_rms"]
rms_amp = np.clip(rms_amp, src_bins[0], src_bins[-1])
src_idx = np.where(rms_amp >= src_bins)[0][-1]
src_pos = src_hist[src_idx]
anchor_idx = np.where(src_pos >= anchor_hist)[0][-1]
if src_idx == hist_bins or anchor_idx == hist_bins:
rms_amp = anchor_bins[-1]
else:
rms_amp = (rms_amp - src_bins[src_idx]) / (
src_bins[src_idx + 1] - src_bins[src_idx]
) * (anchor_bins[anchor_idx + 1] - anchor_bins[anchor_idx]) + anchor_bins[
anchor_idx
]
scale = rms_amp / amp_info["amp_rms"]
        # FIXME: This is a hack to avoid sound clipping.
sr, data = wavfile.read(os.path.join(src_wav_dir, amp_info["basename"]))
wavfile.write(
os.path.join(out_wav_dir, amp_info["basename"]),
sr,
(data * scale).astype(np.int16),
)
logging.info("Volume normalization done.")
return True
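# volume_normalize() is histogram matching on per-utterance RMS amplitude:
# each utterance's RMS is mapped through the corpus CDF (src_hist/src_bins)
# onto the fixed anchor distribution (anchor_hist/anchor_bins), and the
# waveform is rescaled by target_rms / original_rms.  A minimal sketch of
# that mapping step, factored out for clarity (the helper name is
# illustrative and not used by the pipeline):
def _match_rms_to_anchor(rms_amp, src_hist, src_bins):
    rms_amp = np.clip(rms_amp, src_bins[0], src_bins[-1])
    src_idx = np.where(rms_amp >= src_bins)[0][-1]
    src_pos = src_hist[src_idx]
    anchor_idx = np.where(src_pos >= anchor_hist)[0][-1]
    if src_idx == hist_bins or anchor_idx == hist_bins:
        return anchor_bins[-1]
    # linear interpolation inside the matched anchor bin
    frac = (rms_amp - src_bins[src_idx]) / (src_bins[src_idx + 1] - src_bins[src_idx])
    return anchor_bins[anchor_idx] + frac * (
        anchor_bins[anchor_idx + 1] - anchor_bins[anchor_idx]
    )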
def interp_f0(f0_data):
"""
    Linearly interpolate F0 over unvoiced (zero) frames.
"""
f0_data[f0_data < 1] = 0
xp = np.nonzero(f0_data)
yp = f0_data[xp]
x = np.arange(f0_data.size)
contour_f0 = np.interp(x, xp[0], yp).astype(np.float32)
return contour_f0
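# Worked example for interp_f0 (values are hypothetical): zero frames are
# treated as unvoiced and filled by linear interpolation between surrounding
# voiced frames; leading/trailing zeros take the nearest voiced value.
def _interp_f0_example():
    f0 = np.array([0.0, 100.0, 0.0, 0.0, 200.0, 0.0], dtype=np.float32)
    # -> approximately [100.0, 100.0, 133.3, 166.7, 200.0, 200.0]
    return interp_f0(f0)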
def frame_nccf(x, y):
norm_coef = (np.sum(x ** 2.0) * np.sum(y ** 2.0) + 1e-30) ** 0.5
return (np.sum(x * y) / norm_coef + 1.0) / 2.0
def get_nccf(pcm_data, f0, min_f0=40, max_f0=800, fs=160, sr=16000):
if pcm_data.dtype == np.int16:
pcm_data = pcm_data.astype(np.float32) / 32768
frame_len = int(sr / 200)
frame_num = int(len(pcm_data) // fs)
frame_num = min(frame_num, len(f0))
pad_len = int(sr / min_f0) + frame_len
pad_zeros = np.zeros([pad_len], dtype=np.float32)
data = np.hstack((pad_zeros, pcm_data.astype(np.float32), pad_zeros))
nccf = np.zeros((frame_num), dtype=np.float32)
for i in range(frame_num):
curr_f0 = np.clip(f0[i], min_f0, max_f0)
lag = int(sr / curr_f0 + 0.5)
j = i * fs + pad_len - frame_len // 2
l_data = data[j : j + frame_len]
l_data -= l_data.mean()
r_data = data[j + lag : j + lag + frame_len]
r_data -= r_data.mean()
nccf[i] = frame_nccf(l_data, r_data)
return nccf
def smooth(data, win_len):
if win_len % 2 == 0:
win_len += 1
hwin = win_len // 2
win = np.hanning(win_len)
win /= win.sum()
data = data.reshape([-1])
pad_data = np.pad(data, hwin, mode="edge")
for i in range(data.shape[0]):
data[i] = np.dot(win, pad_data[i : i + win_len])
return data.reshape([-1, 1])
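# smooth() applies a normalized Hanning window (length forced odd) with edge
# padding, so the output keeps the input length.  Illustrative call with
# hypothetical values:
def _smooth_example():
    noisy = np.array([1.0, 5.0, 1.0, 5.0, 1.0], dtype=np.float32)
    return smooth(noisy.copy(), win_len=5)  # shape (5, 1), ripples attenuated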
# TODO: pysptk currently supports only two F0 estimation methods.
# supported: rapt, swipe
# unsupported: reaper, world (DIO)
def RAPT_FUNC(v1, v2, v3, v4, v5):
return pysptk.sptk.rapt(v1.astype(np.float32), fs=v2, hopsize=v3, min=v4, max=v5)
def SWIPE_FUNC(v1, v2, v3, v4, v5):
return pysptk.sptk.swipe(v1.astype(np.float64), fs=v2, hopsize=v3, min=v4, max=v5)
def PYIN_FUNC(v1, v2, v3, v4, v5):
f0_mel = librosa.pyin(
v1.astype(np.float32), sr=v2, frame_length=v3 * 4, fmin=v4, fmax=v5
)[0]
f0_mel = np.where(np.isnan(f0_mel), 0.0, f0_mel)
return f0_mel
def get_pitch(pcm_data, sampling_rate=16000, hop_length=160):
log_f0_list = []
uv_list = []
low, high = 40, 800
cali_f0 = pysptk.sptk.rapt(
pcm_data.astype(np.float32),
fs=sampling_rate,
hopsize=hop_length,
min=low,
max=high,
)
f0_range = np.sort(np.unique(cali_f0))
if len(f0_range) > 20:
low = max(f0_range[10] - 50, low)
high = min(f0_range[-10] + 50, high)
func_dict = {"rapt": RAPT_FUNC, "swipe": SWIPE_FUNC}
for func_name in func_dict:
f0 = func_dict[func_name](pcm_data, sampling_rate, hop_length, low, high)
uv = f0 > 0
if len(f0) < 10 or f0.max() < low:
logging.error("{} method: calc F0 is too low.".format(func_name))
continue
else:
f0 = np.clip(f0, 1e-30, high)
log_f0 = np.log(f0)
contour_log_f0 = interp_f0(log_f0)
log_f0_list.append(contour_log_f0)
uv_list.append(uv)
if len(log_f0_list) == 0:
logging.error("F0 estimation failed.")
return None
min_len = float("inf")
for log_f0 in log_f0_list:
min_len = min(min_len, log_f0.shape[0])
multi_log_f0 = np.zeros([len(log_f0_list), min_len], dtype=np.float32)
multi_uv = np.zeros([len(log_f0_list), min_len], dtype=np.float32)
for i in range(len(log_f0_list)):
multi_log_f0[i, :] = log_f0_list[i][:min_len]
multi_uv[i, :] = uv_list[i][:min_len]
log_f0 = smooth(np.median(multi_log_f0, axis=0), 5)
uv = (smooth(np.median(multi_uv, axis=0), 5) > 0.5).astype(np.float32)
f0 = np.exp(log_f0)
# nccf = get_nccf(
# pcm_data, f0, min_f0=low, max_f0=high, fs=hop_length, sr=sampling_rate
# )
min_len = min(f0.shape[0], uv.shape[0])
return f0[:min_len], uv[:min_len], f0[:min_len] * uv[:min_len]
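# get_pitch() runs several F0 estimators (rapt and swipe), interpolates each
# log-F0 contour over unvoiced frames, takes the frame-wise median across
# estimators, smooths it, and returns (continuous f0, voiced/unvoiced mask,
# f0 with unvoiced frames zeroed).  A minimal call, assuming ``pcm`` is a
# mono 16 kHz waveform (hypothetical variable):
def _pitch_example(pcm):
    res = get_pitch(pcm, sampling_rate=16000, hop_length=160)
    if res is None:  # estimation can fail on silent or very short audio
        return None
    f0, uv, voiced_f0 = res
    return voiced_f0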
# TODO: some DSP functions are not implemented.
def get_energy(pcm_data, hop_length, win_length, n_fft):
D = _stft(pcm_data, hop_length, win_length, n_fft)
S, _ = librosa.magphase(D)
energy = np.sqrt(np.sum(S ** 2, axis=0))
return energy.reshape((-1, 1))
def align_length(in_data, tgt_data, basename=None):
if in_data is None or tgt_data is None:
logging.error("{}: Input data is None.".format(basename))
return None
in_len = in_data.shape[0]
tgt_len = tgt_data.shape[0]
if abs(in_len - tgt_len) > 20:
        logging.error(
            "{}: Input length differs from target length by more than 20 frames.".format(
                basename
            )
        )
return None
if in_len < tgt_len:
out_data = np.pad(
in_data, ((0, tgt_len - in_len), (0, 0)), "constant", constant_values=0.0
)
else:
out_data = in_data[:tgt_len]
return out_data
def compute_mean(data_list, dims=80):
mean_vector = np.zeros((1, dims))
all_frame_number = 0
for data in tqdm(data_list):
if data is None:
continue
features = data.reshape((-1, dims))
current_frame_number = np.shape(features)[0]
mean_vector += np.sum(features[:, :], axis=0)
all_frame_number += current_frame_number
mean_vector /= float(all_frame_number)
return mean_vector
def compute_std(data_list, mean_vector, dims=80):
std_vector = np.zeros((1, dims))
all_frame_number = 0
for data in tqdm(data_list):
if data is None:
continue
features = data.reshape((-1, dims))
current_frame_number = np.shape(features)[0]
mean_matrix = np.tile(mean_vector, (current_frame_number, 1))
std_vector += np.sum((features[:, :] - mean_matrix) ** 2, axis=0)
all_frame_number += current_frame_number
std_vector /= float(all_frame_number)
std_vector = std_vector ** 0.5
return std_vector
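# compute_mean()/compute_std() accumulate frame-level statistics across
# variable-length utterances, skipping None entries from failed extractions.
# Typical usage for 80-dim mel features (the list contents are hypothetical):
def _feature_stats_example(mel_list):
    mean = compute_mean(mel_list, dims=80)
    std = compute_std(mel_list, mean, dims=80)
    return mean, std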
F0_MIN = 0.0
F0_MAX = 800.0
ENERGY_MIN = 0.0
ENERGY_MAX = 200.0
CLIP_FLOOR = 1e-3
def f0_norm_min_max(f0):
zero_idxs = np.where(f0 <= CLIP_FLOOR)[0]
res = (2 * f0 - F0_MIN - F0_MAX) / (F0_MAX - F0_MIN)
res[zero_idxs] = 0.0
return res
def f0_denorm_min_max(f0):
zero_idxs = np.where(f0 == 0.0)[0]
res = (f0 * (F0_MAX - F0_MIN) + F0_MIN + F0_MAX) / 2
res[zero_idxs] = 0.0
return res
def energy_norm_min_max(energy):
zero_idxs = np.where(energy == 0.0)[0]
res = (2 * energy - ENERGY_MIN - ENERGY_MAX) / (ENERGY_MAX - ENERGY_MIN)
res[zero_idxs] = 0.0
return res
def energy_denorm_min_max(energy):
zero_idxs = np.where(energy == 0.0)[0]
res = (energy * (ENERGY_MAX - ENERGY_MIN) + ENERGY_MIN + ENERGY_MAX) / 2
res[zero_idxs] = 0.0
return res
def norm_log(x):
zero_idxs = np.where(x <= CLIP_FLOOR)[0]
x[zero_idxs] = 1.0
res = np.log(x)
return res
def denorm_log(x):
zero_idxs = np.where(x == 0.0)[0]
res = np.exp(x)
res[zero_idxs] = 0.0
return res
def f0_norm_mean_std(x, mean, std):
zero_idxs = np.where(x == 0.0)[0]
x = (x - mean) / std
x[zero_idxs] = 0.0
return x
def norm_mean_std(x, mean, std):
x = (x - mean) / std
return x
# TODO: This is a hard-coded implementation for MIT-style interval labels.
# TODO: Implement a more general version.
def parse_interval_file(file_path, sampling_rate, hop_length):
with open(file_path, "r") as f:
lines = f.readlines()
# second
frame_intervals = 1.0 * hop_length / sampling_rate
skip_lines = 12
dur_list = []
phone_list = []
line_index = skip_lines
while line_index < len(lines):
phone_begin = float(lines[line_index])
phone_end = float(lines[line_index + 1])
phone = lines[line_index + 2].strip()[1:-1]
dur_list.append(int(round((phone_end - phone_begin) / frame_intervals)))
phone_list.append(phone)
line_index += 3
if len(dur_list) == 0 or len(phone_list) == 0:
return None
return np.array(dur_list), phone_list
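# parse_interval_file() assumes an MIT-style interval label layout: a fixed
# 12-line header followed by triples of lines per phone,
#     <begin time in seconds>
#     <end time in seconds>
#     "<phone label>"
# and converts each duration to frames with hop_length / sampling_rate.
# Illustrative body after the header (labels are hypothetical):
#     0.00
#     0.25
#     "sil"
#     0.25
#     0.43
#     "n_c"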
def average_by_duration(x, durs):
if x is None or durs is None:
return None
durs_cum = np.cumsum(np.pad(durs, (1, 0), "constant"))
    # average over each symbol's duration
x_symbol = np.zeros((durs.shape[0],), dtype=np.float32)
for idx, start, end in zip(range(durs.shape[0]), durs_cum[:-1], durs_cum[1:]):
values = x[start:end][np.where(x[start:end] != 0.0)[0]]
x_symbol[idx] = np.mean(values) if len(values) > 0 else 0.0
return x_symbol.astype(np.float32)
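# Worked example for average_by_duration (values are hypothetical): frame-level
# values are averaged within each symbol's duration, ignoring zero frames.
def _average_by_duration_example():
    x = np.array([100.0, 110.0, 0.0, 200.0, 210.0, 190.0], dtype=np.float32)
    durs = np.array([3, 3])  # two symbols, three frames each
    # symbol 0 -> mean(100, 110) = 105.0 (the zero frame is ignored)
    # symbol 1 -> mean(200, 210, 190) = 200.0
    return average_by_duration(x, durs)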
def encode_16bits(x):
if x.min() > -1.0 and x.max() < 1.0:
return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16)
else:
return x
if __name__ == "__main__":
import sys
infile = sys.argv[1]
sr, pcm_data = wavfile.read(infile)
res = get_pitch(pcm_data, 24000, 240)
print(res)
import logging
import os
import sys
import argparse
import yaml
import time
import codecs
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402
sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402
try:
from kantts.preprocess.audio_processor.audio_processor import AudioProcessor
from kantts.preprocess.se_processor.se_processor import SpeakerEmbeddingProcessor
from kantts.preprocess.script_convertor.TextScriptConvertor import (
TextScriptConvertor,
)
from kantts.preprocess.fp_processor import FpProcessor, is_fp_line
from kantts.preprocess.languages import languages
from kantts.datasets.dataset import AM_Dataset, Voc_Dataset
from kantts.utils.log import logging_to_file, get_git_revision_hash
except ImportError:
raise ImportError("Please install kantts.")
logging.basicConfig(
format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
LANGUAGES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "languages")
def gen_metafile(
voice_output_dir,
fp_enable=False,
badlist=None,
split_ratio=0.98,
):
voc_train_meta = os.path.join(voice_output_dir, "train.lst")
voc_valid_meta = os.path.join(voice_output_dir, "valid.lst")
if not os.path.exists(voc_train_meta) or not os.path.exists(voc_valid_meta):
Voc_Dataset.gen_metafile(
os.path.join(voice_output_dir, "wav"),
voice_output_dir,
split_ratio,
)
logging.info("Voc metafile generated.")
raw_metafile = os.path.join(voice_output_dir, "raw_metafile.txt")
am_train_meta = os.path.join(voice_output_dir, "am_train.lst")
am_valid_meta = os.path.join(voice_output_dir, "am_valid.lst")
if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
AM_Dataset.gen_metafile(
raw_metafile,
voice_output_dir,
am_train_meta,
am_valid_meta,
badlist,
split_ratio,
)
logging.info("AM metafile generated.")
if fp_enable:
fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt")
am_train_meta = os.path.join(voice_output_dir, "am_fpadd_train.lst")
am_valid_meta = os.path.join(voice_output_dir, "am_fpadd_valid.lst")
if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
AM_Dataset.gen_metafile(
fpadd_metafile,
voice_output_dir,
am_train_meta,
am_valid_meta,
badlist,
split_ratio,
)
logging.info("AM fpaddmetafile generated.")
fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt")
am_train_meta = os.path.join(voice_output_dir, "am_fprm_train.lst")
am_valid_meta = os.path.join(voice_output_dir, "am_fprm_valid.lst")
if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
AM_Dataset.gen_metafile(
fprm_metafile,
voice_output_dir,
am_train_meta,
am_valid_meta,
badlist,
split_ratio,
)
logging.info("AM fprmmetafile generated.")
# TODO: Zh-CN as default
def process_data(
voice_input_dir,
voice_output_dir,
audio_config,
speaker_name=None,
targetLang="PinYin",
skip_script=False,
se_model=None,
):
foreignLang = "EnUS"
    # use the emotion tag file if the voice provides one
if not os.path.exists(os.path.join(voice_input_dir, "emotion_tag.txt")):
emo_tag_path = None
else:
emo_tag_path = os.path.join(voice_input_dir, "emotion_tag.txt")
phoneset_path = os.path.join(
LANGUAGES_DIR, targetLang, languages[targetLang]["phoneset_path"]
)
posset_path = os.path.join(
LANGUAGES_DIR, targetLang, languages[targetLang]["posset_path"]
)
f2t_map_path = os.path.join(
LANGUAGES_DIR, targetLang, languages[targetLang]["f2t_map_path"]
)
s2p_map_path = os.path.join(
LANGUAGES_DIR, targetLang, languages[targetLang]["s2p_map_path"]
)
# dir of plain text/sentences for training byte based model
plain_text_dir = os.path.join(voice_input_dir, "text")
if speaker_name is None:
speaker_name = os.path.basename(voice_input_dir)
if audio_config is not None:
with open(audio_config, "r") as f:
config = yaml.load(f, Loader=yaml.Loader)
config["create_time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
config["git_revision_hash"] = get_git_revision_hash()
se_enable = config["audio_config"].get("se_feature", False)
with open(os.path.join(voice_output_dir, "audio_config.yaml"), "w") as f:
yaml.dump(config, f, Dumper=yaml.Dumper, default_flow_style=None)
    if skip_script:
        logging.info("Skip script conversion")
        raw_metafile = None
        # No prosody file is parsed in this branch, so fp support stays off.
        fp_enable = False
# Script processor
if not skip_script:
if os.path.exists(plain_text_dir):
TextScriptConvertor.turn_text_into_bytes(
os.path.join(plain_text_dir, "text.txt"),
os.path.join(voice_output_dir, "raw_metafile.txt"),
speaker_name,
)
fp_enable = False
else:
tsc = TextScriptConvertor(
phoneset_path,
posset_path,
targetLang,
foreignLang,
f2t_map_path,
s2p_map_path,
emo_tag_path,
speaker_name,
)
tsc.process(
os.path.join(voice_input_dir, "prosody", "prosody.txt"),
os.path.join(voice_output_dir, "Script.xml"),
os.path.join(voice_output_dir, "raw_metafile.txt"),
)
prosody = os.path.join(voice_input_dir, "prosody", "prosody.txt")
# FP processor
with codecs.open(prosody, "r", "utf-8") as f:
lines = f.readlines()
fp_enable = is_fp_line(lines[1])
raw_metafile = os.path.join(voice_output_dir, "raw_metafile.txt")
if fp_enable:
FP = FpProcessor()
FP.process(
voice_output_dir,
prosody,
raw_metafile,
)
logging.info("Processing fp done.")
# Audio processor
ap = AudioProcessor(config["audio_config"])
ap.process(
voice_input_dir,
voice_output_dir,
raw_metafile,
)
logging.info("Processing audio done.")
# SpeakerEmbedding processor
if se_enable:
sep = SpeakerEmbeddingProcessor()
sep.process(
voice_output_dir,
se_model,
)
logging.info("Processing speaker embedding done.")
logging.info("Processing done.")
# Generate Voc&AM metafile
# TODO: train/valid ratio setting
gen_metafile(voice_output_dir, fp_enable, ap.badcase_list)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Dataset preprocessor")
parser.add_argument("--voice_input_dir", type=str, required=True)
parser.add_argument("--voice_output_dir", type=str, required=True)
parser.add_argument("--audio_config", type=str, required=True)
parser.add_argument("--speaker", type=str, default=None, help="speaker")
parser.add_argument("--lang", type=str, default="PinYin", help="target language")
parser.add_argument(
"--se_model",
type=str,
default="../pre_data/speaker_embeddding/se.*",
help="speaker embedding extractor model",
)
parser.add_argument(
"--skip_script", action="store_true", help="skip script converting"
)
args = parser.parse_args()
os.makedirs(args.voice_output_dir, exist_ok=True)
logging_to_file(os.path.join(args.voice_output_dir, "data_process_stdout.log"))
try:
process_data(
args.voice_input_dir,
args.voice_output_dir,
args.audio_config,
args.speaker,
args.lang,
args.skip_script,
args.se_model,
)
except (Exception, KeyboardInterrupt) as e:
logging.error(e, exc_info=True)
import os
import logging
import random
def is_fp_line(line):
fp_category_list = ["FP", "I", "N", "Q"]
elements = line.strip().split(" ")
res = True
for ele in elements:
if ele not in fp_category_list:
res = False
break
return res
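# An fp line is a prosody-file line whose tokens are all fp category labels.
# Illustrative check (example strings are hypothetical):
def _is_fp_line_example():
    return is_fp_line("N N FP N"), is_fp_line("ni2 hao3")  # -> (True, False)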
class FpProcessor:
def __init__(self):
# TODO: Add more audio processing methods.
self.res = []
    @staticmethod
    def is_fp_line(line):
        # Mirrors the module-level helper: a line counts as an fp line only if
        # every whitespace-separated token is a known fp category label.
        fp_category_list = ["FP", "I", "N", "Q"]
        elements = line.strip().split(" ")
        res = True
        for ele in elements:
            if ele not in fp_category_list:
                res = False
                break
        return res
# TODO: adjust idx judgment rule
def addfp(self, voice_output_dir, prosody, raw_metafile_lines):
fp_category_list = ["FP", "I", "N"]
f = open(prosody)
prosody_lines = f.readlines()
f.close()
idx = ""
fp = ""
fp_label_dict = {}
i = 0
while i < len(prosody_lines):
if len(prosody_lines[i].strip().split("\t")) == 2:
idx = prosody_lines[i].strip().split("\t")[0]
i += 1
else:
fp_enable = is_fp_line(prosody_lines[i])
if fp_enable:
fp = prosody_lines[i].strip().split("\t")[0].split(" ")
for label in fp:
if label not in fp_category_list:
logging.warning("fp label not in fp_category_list")
break
i += 4
else:
fp = [
"N"
for _ in range(
len(
prosody_lines[i]
.strip()
.split("\t")[0]
.replace("/ ", "")
.replace(". ", "")
.split(" ")
)
)
]
i += 1
fp_label_dict[idx] = fp
fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt")
f_out = open(fpadd_metafile, "w")
for line in raw_metafile_lines:
tokens = line.strip().split("\t")
if len(tokens) == 2:
uttname = tokens[0]
symbol_sequences = tokens[1].split(" ")
error_flag = False
idx = 0
out_str = uttname + "\t"
for this_symbol_sequence in symbol_sequences:
emotion = this_symbol_sequence.split("$")[4]
this_symbol_sequence = this_symbol_sequence.replace(
emotion, "emotion_neutral"
)
if idx < len(fp_label_dict[uttname]):
if fp_label_dict[uttname][idx] == "FP":
if "none" not in this_symbol_sequence:
this_symbol_sequence = this_symbol_sequence.replace(
"emotion_neutral", "emotion_disgust"
)
syllable_label = this_symbol_sequence.split("$")[2]
if syllable_label == "s_both" or syllable_label == "s_end":
idx += 1
elif idx > len(fp_label_dict[uttname]):
                        logging.warning(uttname + ": fp labels do not match")
error_flag = True
out_str = out_str + this_symbol_sequence + " "
# if idx != len(fp_label_dict[uttname]):
# logging.warning(
# "{} length mismatch, length: {} ".format(
# idx, len(fp_label_dict[uttname])
# )
# )
if not error_flag:
f_out.write(out_str.strip() + "\n")
f_out.close()
return fpadd_metafile
def removefp(self, voice_output_dir, fpadd_metafile, raw_metafile_lines):
f = open(fpadd_metafile)
fpadd_metafile_lines = f.readlines()
f.close()
fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt")
f_out = open(fprm_metafile, "w")
for i in range(len(raw_metafile_lines)):
tokens = raw_metafile_lines[i].strip().split("\t")
symbol_sequences = tokens[1].split(" ")
fpadd_tokens = fpadd_metafile_lines[i].strip().split("\t")
fpadd_symbol_sequences = fpadd_tokens[1].split(" ")
error_flag = False
out_str = tokens[0] + "\t"
idx = 0
length = len(symbol_sequences)
while idx < length:
if "$emotion_disgust" in fpadd_symbol_sequences[idx]:
if idx + 1 < length and "none" in fpadd_symbol_sequences[idx + 1]:
idx = idx + 2
else:
idx = idx + 1
continue
out_str = out_str + symbol_sequences[idx] + " "
idx = idx + 1
if not error_flag:
f_out.write(out_str.strip() + "\n")
f_out.close()
def process(self, voice_output_dir, prosody, raw_metafile):
with open(raw_metafile, "r") as f:
lines = f.readlines()
random.shuffle(lines)
fpadd_metafile = self.addfp(voice_output_dir, prosody, lines)
self.removefp(voice_output_dir, fpadd_metafile, lines)
<?xml version="1.0" encoding="utf-8"?>
<phoneSet xmlns="http://schemas.alibaba-inc.com/tts">
<phone>
<id>0</id>
<name>a_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>1</id>
<name>ai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>2</id>
<name>an_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>3</id>
<name>ang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>4</id>
<name>ao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>5</id>
<name>b_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>6</id>
<name>c_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>7</id>
<name>ch_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>8</id>
<name>d_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>9</id>
<name>e_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>10</id>
<name>ei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>11</id>
<name>en_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>12</id>
<name>eng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>13</id>
<name>er_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>14</id>
<name>f_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>15</id>
<name>g_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>16</id>
<name>h_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>17</id>
<name>i_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>18</id>
<name>ia_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>19</id>
<name>ian_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>20</id>
<name>iang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>21</id>
<name>iao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>22</id>
<name>ie_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>23</id>
<name>ih_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>24</id>
<name>ii_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>25</id>
<name>in_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>26</id>
<name>ing_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>27</id>
<name>io_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>28</id>
<name>iong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>29</id>
<name>iou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>30</id>
<name>j_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>31</id>
<name>k_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>32</id>
<name>l_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>33</id>
<name>m_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>34</id>
<name>n_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>35</id>
<name>o_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>36</id>
<name>ong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>37</id>
<name>ou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>38</id>
<name>p_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>39</id>
<name>q_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>40</id>
<name>r_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>41</id>
<name>s_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>42</id>
<name>sh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>43</id>
<name>t_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>44</id>
<name>u_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>45</id>
<name>ua_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>46</id>
<name>uai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>47</id>
<name>uan_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>48</id>
<name>uang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>49</id>
<name>uei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>50</id>
<name>uen_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>51</id>
<name>ueng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>52</id>
<name>uo_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>53</id>
<name>v_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>54</id>
<name>van_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>55</id>
<name>ve_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>56</id>
<name>vn_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>57</id>
<name>xx_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>58</id>
<name>z_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>59</id>
<name>zh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>60</id>
<name>w_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>61</id>
<name>y_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>62</id>
<name>ga</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>63</id>
<name>ge</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>64</id>
<name>go</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>65</id>
<name>aa</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>66</id>
<name>ae</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>67</id>
<name>ah</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>68</id>
<name>ao</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>69</id>
<name>aw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>70</id>
<name>ay</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>71</id>
<name>b</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>72</id>
<name>ch</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>73</id>
<name>d</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>74</id>
<name>dh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>75</id>
<name>eh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>76</id>
<name>er</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>77</id>
<name>ey</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>78</id>
<name>f</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>79</id>
<name>g</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>80</id>
<name>hh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>81</id>
<name>ih</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>82</id>
<name>iy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>83</id>
<name>jh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>84</id>
<name>k</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>85</id>
<name>l</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>86</id>
<name>m</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>nasal</am>
</phone>
<phone>
<id>87</id>
<name>n</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>nasal</am>
</phone>
<phone>
<id>88</id>
<name>ng</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>89</id>
<name>ow</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>90</id>
<name>oy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>91</id>
<name>p</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>92</id>
<name>r</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>93</id>
<name>s</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>94</id>
<name>sh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>95</id>
<name>t</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>96</id>
<name>th</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>97</id>
<name>uh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>98</id>
<name>uw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>99</id>
<name>v</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>100</id>
<name>w</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>101</id>
<name>y</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>102</id>
<name>z</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>103</id>
<name>zh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>104</id>
<name>air_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>105</id>
<name>angr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>106</id>
<name>anr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>107</id>
<name>aor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>108</id>
<name>ar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>109</id>
<name>eir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>110</id>
<name>engr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>111</id>
<name>enr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>112</id>
<name>iangr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>113</id>
<name>ianr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>114</id>
<name>iaor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>115</id>
<name>iar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>116</id>
<name>ier_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>117</id>
<name>ihr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>118</id>
<name>iir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>119</id>
<name>ingr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>120</id>
<name>inr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>121</id>
<name>iongr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>122</id>
<name>iour_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>123</id>
<name>ir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>124</id>
<name>ongr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>125</id>
<name>or_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>126</id>
<name>our_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>127</id>
<name>uair_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>128</id>
<name>uangr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>129</id>
<name>uanr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>130</id>
<name>uar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>131</id>
<name>ueir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>132</id>
<name>uenr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>133</id>
<name>uor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>134</id>
<name>ur_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>135</id>
<name>vanr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>136</id>
<name>ver_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>137</id>
<name>vnr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>138</id>
<name>vr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>146</id>
<name>pau</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
</phoneSet>