Commit ee10550a authored by liugh5

Initial commit
import torch
import torch.nn as nn
import torch.nn.functional as F
from kantts.models.sambert import FFTBlock, PNCABlock, Prenet
from kantts.models.sambert.positions import (
SinusoidalPositionEncoder,
DurSinusoidalPositionEncoder,
)
from kantts.models.sambert.adaptors import (
LengthRegulator,
VarFsmnRnnNARPredictor,
VarRnnARPredictor,
)
from kantts.models.sambert.fsmn import FsmnEncoderV2
from kantts.models.sambert.alignment import b_mas
from kantts.models.sambert.attention import ConvAttention
from kantts.models.utils import get_mask_from_lengths
class SelfAttentionEncoder(nn.Module):
def __init__(
self,
n_layer,
d_in,
d_model,
n_head,
d_head,
d_inner,
dropout,
dropout_att,
dropout_relu,
position_encoder,
):
super(SelfAttentionEncoder, self).__init__()
self.d_in = d_in
self.d_model = d_model
self.dropout = dropout
d_in_lst = [d_in] + [d_model] * (n_layer - 1)
self.fft = nn.ModuleList(
[
FFTBlock(
d,
d_model,
n_head,
d_head,
d_inner,
(3, 1),
dropout,
dropout_att,
dropout_relu,
)
for d in d_in_lst
]
)
self.ln = nn.LayerNorm(d_model, eps=1e-6)
self.position_enc = position_encoder
def forward(self, input, mask=None, return_attns=False):
input *= self.d_model ** 0.5
if isinstance(self.position_enc, SinusoidalPositionEncoder):
input = self.position_enc(input)
else:
raise NotImplementedError
input = F.dropout(input, p=self.dropout, training=self.training)
enc_slf_attn_list = []
max_len = input.size(1)
if mask is not None:
slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
else:
slf_attn_mask = None
enc_output = input
for id, layer in enumerate(self.fft):
enc_output, enc_slf_attn = layer(
enc_output, mask=mask, slf_attn_mask=slf_attn_mask
)
if return_attns:
enc_slf_attn_list += [enc_slf_attn]
enc_output = self.ln(enc_output)
return enc_output, enc_slf_attn_list
class HybridAttentionDecoder(nn.Module):
def __init__(
self,
d_in,
prenet_units,
n_layer,
d_model,
d_mem,
n_head,
d_head,
d_inner,
dropout,
dropout_att,
dropout_relu,
d_out,
):
super(HybridAttentionDecoder, self).__init__()
self.d_model = d_model
self.dropout = dropout
self.prenet = Prenet(d_in, prenet_units, d_model)
self.dec_in_proj = nn.Linear(d_model + d_mem, d_model)
self.pnca = nn.ModuleList(
[
PNCABlock(
d_model,
d_mem,
n_head,
d_head,
d_inner,
(1, 1),
dropout,
dropout_att,
dropout_relu,
)
for _ in range(n_layer)
]
)
self.ln = nn.LayerNorm(d_model, eps=1e-6)
self.dec_out_proj = nn.Linear(d_model, d_out)
def reset_state(self):
for layer in self.pnca:
layer.reset_state()
def get_pnca_attn_mask(
self, device, max_len, x_band_width, h_band_width, masks=None
):
if masks is not None:
pnca_attn_mask = masks.unsqueeze(1).expand(-1, max_len, -1)
else:
pnca_attn_mask = None
range_ = torch.arange(max_len).to(device)
x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :]
x_end = (range_ + 1)[None, None, :]
h_start = range_[None, None, :]
h_end = torch.clamp_max(range_ + h_band_width + 1, max_len + 1)[None, None, :]
pnca_x_attn_mask = ~(
(x_start <= range_[None, :, None]) & (x_end > range_[None, :, None])
).transpose(1, 2)
pnca_h_attn_mask = ~(
(h_start <= range_[None, :, None]) & (h_end > range_[None, :, None])
).transpose(1, 2)
if pnca_attn_mask is not None:
pnca_x_attn_mask = pnca_x_attn_mask | pnca_attn_mask
pnca_h_attn_mask = pnca_h_attn_mask | pnca_attn_mask
pnca_x_attn_mask = pnca_x_attn_mask.masked_fill(
pnca_attn_mask.transpose(1, 2), False
)
pnca_h_attn_mask = pnca_h_attn_mask.masked_fill(
pnca_attn_mask.transpose(1, 2), False
)
return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask
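# Note on the masks built above (descriptive, inferred from the code): True marks a
# position that is masked out. pnca_x_attn_mask lets decoder step t self-attend only
# to the causal window [t - x_band_width, t]; pnca_h_attn_mask lets it attend to
# memory positions [t, t + h_band_width]. Rows for padded query positions are reset
# to unmasked at the end, presumably to avoid fully-masked softmax rows.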
# reset_state() must be called before forward()
def forward(
self, input, memory, x_band_width, h_band_width, masks=None, return_attns=False
):
input = self.prenet(input)
input = torch.cat([memory, input], dim=-1)
input = self.dec_in_proj(input)
if masks is not None:
input = input.masked_fill(masks.unsqueeze(-1), 0)
input *= self.d_model ** 0.5
input = F.dropout(input, p=self.dropout, training=self.training)
max_len = input.size(1)
pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
input.device, max_len, x_band_width, h_band_width, masks
)
dec_pnca_attn_x_list = []
dec_pnca_attn_h_list = []
dec_output = input
for id, layer in enumerate(self.pnca):
dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
dec_output,
memory,
masks=masks,
pnca_x_attn_mask=pnca_x_attn_mask,
pnca_h_attn_mask=pnca_h_attn_mask,
)
if return_attns:
dec_pnca_attn_x_list += [dec_pnca_attn_x]
dec_pnca_attn_h_list += [dec_pnca_attn_h]
dec_output = self.ln(dec_output)
dec_output = self.dec_out_proj(dec_output)
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
# reset_state() must be called before infer() when step == 0
def infer(
self,
step,
input,
memory,
x_band_width,
h_band_width,
masks=None,
return_attns=False,
):
max_len = memory.size(1)
input = self.prenet(input)
input = torch.cat([memory[:, step : step + 1, :], input], dim=-1)
input = self.dec_in_proj(input)
input *= self.d_model ** 0.5
input = F.dropout(input, p=self.dropout, training=self.training)
pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
input.device, max_len, x_band_width, h_band_width, masks
)
dec_pnca_attn_x_list = []
dec_pnca_attn_h_list = []
dec_output = input
for id, layer in enumerate(self.pnca):
if masks is not None:
mask_step = masks[:, step : step + 1]
else:
mask_step = None
dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
dec_output,
memory,
mask=mask_step,
pnca_x_attn_mask=pnca_x_attn_mask[:, step : step + 1, : (step + 1)],
pnca_h_attn_mask=pnca_h_attn_mask[:, step : step + 1, :],
)
if return_attns:
dec_pnca_attn_x_list += [dec_pnca_attn_x]
dec_pnca_attn_h_list += [dec_pnca_attn_h]
dec_output = self.ln(dec_output)
dec_output = self.dec_out_proj(dec_output)
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
class TextFftEncoder(nn.Module):
def __init__(self, config):
super(TextFftEncoder, self).__init__()
d_emb = config["embedding_dim"]
self.using_byte = False
if config.get("using_byte", False):
self.using_byte = True
nb_ling_byte_index = config["byte_index"]
self.byte_index_emb = nn.Embedding(nb_ling_byte_index, d_emb)
else:
# linguistic unit lookup table
nb_ling_sy = config["sy"]
nb_ling_tone = config["tone"]
nb_ling_syllable_flag = config["syllable_flag"]
nb_ling_ws = config["word_segment"]
self.sy_emb = nn.Embedding(nb_ling_sy, d_emb)
self.tone_emb = nn.Embedding(nb_ling_tone, d_emb)
self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb)
self.ws_emb = nn.Embedding(nb_ling_ws, d_emb)
max_len = config["max_len"]
nb_layers = config["encoder_num_layers"]
nb_heads = config["encoder_num_heads"]
d_model = config["encoder_num_units"]
d_head = d_model // nb_heads
d_inner = config["encoder_ffn_inner_dim"]
dropout = config["encoder_dropout"]
dropout_attn = config["encoder_attention_dropout"]
dropout_relu = config["encoder_relu_dropout"]
d_proj = config["encoder_projection_units"]
self.d_model = d_model
position_enc = SinusoidalPositionEncoder(max_len, d_emb)
self.ling_enc = SelfAttentionEncoder(
nb_layers,
d_emb,
d_model,
nb_heads,
d_head,
d_inner,
dropout,
dropout_attn,
dropout_relu,
position_enc,
)
self.ling_proj = nn.Linear(d_model, d_proj, bias=False)
def forward(self, inputs_ling, masks=None, return_attns=False):
# Parse inputs_ling_seq
if self.using_byte:
inputs_byte_index = inputs_ling[:, :, 0]
byte_index_embedding = self.byte_index_emb(inputs_byte_index)
ling_embedding = byte_index_embedding
else:
inputs_sy = inputs_ling[:, :, 0]
inputs_tone = inputs_ling[:, :, 1]
inputs_syllable_flag = inputs_ling[:, :, 2]
inputs_ws = inputs_ling[:, :, 3]
# Lookup table
sy_embedding = self.sy_emb(inputs_sy)
tone_embedding = self.tone_emb(inputs_tone)
syllable_flag_embedding = self.syllable_flag_emb(inputs_syllable_flag)
ws_embedding = self.ws_emb(inputs_ws)
ling_embedding = (
sy_embedding + tone_embedding + syllable_flag_embedding + ws_embedding
)
enc_output, enc_slf_attn_lst = self.ling_enc(
ling_embedding, masks, return_attns
)
if hasattr(self, "ling_proj"):
enc_output = self.ling_proj(enc_output)
return enc_output, enc_slf_attn_lst, ling_embedding
class TextEncoder(nn.Module):
def __init__(self, config):
super(TextEncoder, self).__init__()
self.text_encoder = TextFftEncoder(config)
self.se_enable = config.get("SE", False)
if not self.se_enable:
self.spk_tokenizer = nn.Embedding(config["speaker"], config["speaker_units"])
self.emo_tokenizer = nn.Embedding(config["emotion"], config["emotion_units"])
# self.variance_adaptor = VarianceAdaptor(config)
# self.mel_decoder = MelPNCADecoder(config)
# self.mel_postnet = PostNet(config)
self.MAS = False
if config.get("MAS", False):
self.MAS = True
self.align_attention = ConvAttention(
n_mel_channels=config["num_mels"],
n_text_channels=config["embedding_dim"],
n_att_channels=config["num_mels"],
)
self.fp_enable = config.get("FP", False)
if self.fp_enable:
self.FP_predictor = FP_Predictor(config)
def forward(self, inputs_ling, inputs_emotion, inputs_speaker, inputs_ling_masks=None, return_attns=False):
text_hid, enc_sla_attn_lst, ling_embedding = self.text_encoder(
inputs_ling, inputs_ling_masks, return_attns
)
emo_hid = self.emo_tokenizer(inputs_emotion)
spk_hid = inputs_speaker if self.se_enable else self.spk_tokenizer(inputs_speaker)
if return_attns:
return text_hid, enc_sla_attn_lst, ling_embedding, emo_hid, spk_hid
else:
return text_hid, ling_embedding, emo_hid, spk_hid
class VarianceAdaptor(nn.Module):
def __init__(self, config):
super(VarianceAdaptor, self).__init__()
input_dim = (
config["encoder_projection_units"]
+ config["emotion_units"]
+ config["speaker_units"]
)
filter_size = config["predictor_filter_size"]
fsmn_num_layers = config["predictor_fsmn_num_layers"]
num_memory_units = config["predictor_num_memory_units"]
ffn_inner_dim = config["predictor_ffn_inner_dim"]
dropout = config["predictor_dropout"]
shift = config["predictor_shift"]
lstm_units = config["predictor_lstm_units"]
dur_pred_prenet_units = config["dur_pred_prenet_units"]
dur_pred_lstm_units = config["dur_pred_lstm_units"]
self.pitch_predictor = VarFsmnRnnNARPredictor(
input_dim,
filter_size,
fsmn_num_layers,
num_memory_units,
ffn_inner_dim,
dropout,
shift,
lstm_units,
)
self.energy_predictor = VarFsmnRnnNARPredictor(
input_dim,
filter_size,
fsmn_num_layers,
num_memory_units,
ffn_inner_dim,
dropout,
shift,
lstm_units,
)
self.duration_predictor = VarRnnARPredictor(
input_dim, dur_pred_prenet_units, dur_pred_lstm_units
)
self.length_regulator = LengthRegulator(config["outputs_per_step"])
self.dur_position_encoder = DurSinusoidalPositionEncoder(
config["encoder_projection_units"], config["outputs_per_step"]
)
self.pitch_emb = nn.Conv1d(
1, config["encoder_projection_units"], kernel_size=9, padding=4
)
self.energy_emb = nn.Conv1d(
1, config["encoder_projection_units"], kernel_size=9, padding=4
)
def forward(
self,
inputs_text_embedding,
inputs_emo_embedding,
inputs_spk_embedding, # [1,20,192]
masks=None,
output_masks=None,
duration_targets=None,
pitch_targets=None,
energy_targets=None,
):
batch_size = inputs_text_embedding.size(0)
variance_predictor_inputs = torch.cat(
[inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding], dim=-1
)
pitch_predictions = self.pitch_predictor(variance_predictor_inputs, masks)
energy_predictions = self.energy_predictor(variance_predictor_inputs, masks)
if pitch_targets is not None:
pitch_embeddings = self.pitch_emb(pitch_targets.unsqueeze(1)).transpose(
1, 2
)
else:
pitch_embeddings = self.pitch_emb(pitch_predictions.unsqueeze(1)).transpose(
1, 2
)
if energy_targets is not None:
energy_embeddings = self.energy_emb(energy_targets.unsqueeze(1)).transpose(
1, 2
)
else:
energy_embeddings = self.energy_emb(energy_predictions.unsqueeze(1)).transpose(
1, 2)
inputs_text_embedding_aug = (
inputs_text_embedding + pitch_embeddings + energy_embeddings
)
duration_predictor_cond = torch.cat(
[inputs_text_embedding_aug, inputs_spk_embedding, inputs_emo_embedding],
dim=-1,
)
if duration_targets is not None:
duration_predictor_go_frame = torch.zeros(batch_size, 1).to(
inputs_text_embedding.device
)
duration_predictor_input = torch.cat(
[duration_predictor_go_frame, duration_targets[:, :-1].float()], dim=-1
)
duration_predictor_input = torch.log(duration_predictor_input + 1)
log_duration_predictions, _ = self.duration_predictor(
duration_predictor_input.unsqueeze(-1),
duration_predictor_cond,
masks=masks,
)
duration_predictions = torch.exp(log_duration_predictions) - 1
else:
log_duration_predictions = self.duration_predictor.infer(
duration_predictor_cond, masks=masks
)
duration_predictions = torch.exp(log_duration_predictions) - 1
if duration_targets is not None:
LR_text_outputs, LR_length_rounded = self.length_regulator(
inputs_text_embedding_aug, duration_targets, masks=output_masks
)
LR_position_embeddings = self.dur_position_encoder(
duration_targets, masks=output_masks
)
LR_emo_outputs, _ = self.length_regulator(
inputs_emo_embedding, duration_targets, masks=output_masks
)
LR_spk_outputs, _ = self.length_regulator(
inputs_spk_embedding, duration_targets, masks=output_masks
)
else:
LR_text_outputs, LR_length_rounded = self.length_regulator(
inputs_text_embedding_aug, duration_predictions, masks=output_masks
)
LR_position_embeddings = self.dur_position_encoder(
duration_predictions, masks=output_masks
)
LR_emo_outputs, _ = self.length_regulator(
inputs_emo_embedding, duration_predictions, masks=output_masks
)
LR_spk_outputs, _ = self.length_regulator(
inputs_spk_embedding, duration_predictions, masks=output_masks
)
LR_text_outputs = LR_text_outputs + LR_position_embeddings
return (
LR_text_outputs,
LR_emo_outputs,
LR_spk_outputs, # [1,153,192]
LR_length_rounded,
log_duration_predictions,
pitch_predictions,
energy_predictions,
)
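# Worked example of the duration path above (illustrative, not from the original
# file): with duration_targets [[2, 3, 1]], the go frame shifts the input to
# [[0, 2, 3]], torch.log(x + 1) maps it to [[0.000, 1.099, 1.386]], and predictions
# are mapped back with exp(x) - 1. During training the length regulator expands each
# phone by its target duration; at inference it uses the rounded predicted durations.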
class VarianceAdaptor2(nn.Module):
def __init__(self, config):
super(VarianceAdaptor2, self).__init__()
input_dim = (
config["encoder_projection_units"]
+ config["emotion_units"]
+ config["speaker_units"]
)
filter_size = config["predictor_filter_size"]
fsmn_num_layers = config["predictor_fsmn_num_layers"]
num_memory_units = config["predictor_num_memory_units"]
ffn_inner_dim = config["predictor_ffn_inner_dim"]
dropout = config["predictor_dropout"]
shift = config["predictor_shift"]
lstm_units = config["predictor_lstm_units"]
dur_pred_prenet_units = config["dur_pred_prenet_units"]
dur_pred_lstm_units = config["dur_pred_lstm_units"]
self.pitch_predictor = VarFsmnRnnNARPredictor(
input_dim,
filter_size,
fsmn_num_layers,
num_memory_units,
ffn_inner_dim,
dropout,
shift,
lstm_units,
)
self.energy_predictor = VarFsmnRnnNARPredictor(
input_dim,
filter_size,
fsmn_num_layers,
num_memory_units,
ffn_inner_dim,
dropout,
shift,
lstm_units,
)
self.duration_predictor = VarRnnARPredictor(
input_dim, dur_pred_prenet_units, dur_pred_lstm_units
)
self.length_regulator = LengthRegulator(config["outputs_per_step"])
self.dur_position_encoder = DurSinusoidalPositionEncoder(
config["encoder_projection_units"], config["outputs_per_step"]
)
self.pitch_emb = nn.Conv1d(
1, config["encoder_projection_units"], kernel_size=9, padding=4
)
self.energy_emb = nn.Conv1d(
1, config["encoder_projection_units"], kernel_size=9, padding=4
)
def forward(
self,
inputs_text_embedding,
inputs_emo_embedding,
inputs_spk_embedding, # [1,20,192]
scale=1.0,
masks=None,
output_masks=None,
duration_targets=None,
pitch_targets=None,
energy_targets=None,
):
batch_size = inputs_text_embedding.size(0)
variance_predictor_inputs = torch.cat(
[inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding], dim=-1
)
pitch_predictions = self.pitch_predictor(variance_predictor_inputs, masks)
energy_predictions = self.energy_predictor(variance_predictor_inputs, masks)
if pitch_targets is not None:
pitch_embeddings = self.pitch_emb(pitch_targets.unsqueeze(1)).transpose(
1, 2
)
else:
pitch_embeddings = self.pitch_emb(pitch_predictions.unsqueeze(1)).transpose(
1, 2
)
if energy_targets is not None:
energy_embeddings = self.energy_emb(energy_targets.unsqueeze(1)).transpose(
1, 2
)
else:
energy_embeddings = self.energy_emb(energy_predictions.unsqueeze(1)).transpose(
1, 2)
inputs_text_embedding_aug = (
inputs_text_embedding + pitch_embeddings + energy_embeddings
)
duration_predictor_cond = torch.cat(
[inputs_text_embedding_aug, inputs_spk_embedding, inputs_emo_embedding],
dim=-1,
)
if duration_targets is not None:
duration_predictor_go_frame = torch.zeros(batch_size, 1).to(
inputs_text_embedding.device
)
duration_predictor_input = torch.cat(
[duration_predictor_go_frame, duration_targets[:, :-1].float()], dim=-1
)
duration_predictor_input = torch.log(duration_predictor_input + 1)
log_duration_predictions, _ = self.duration_predictor(
duration_predictor_input.unsqueeze(-1),
duration_predictor_cond,
masks=masks,
)
duration_predictions = torch.exp(log_duration_predictions) - 1
else:
log_duration_predictions = self.duration_predictor.infer(
duration_predictor_cond, masks=masks
)
duration_predictions = torch.exp(log_duration_predictions) - 1
if duration_targets is not None:
LR_text_outputs, LR_length_rounded = self.length_regulator(
inputs_text_embedding_aug, duration_targets*scale, masks=output_masks # *scale
)
LR_position_embeddings = self.dur_position_encoder(
duration_targets, masks=output_masks
)
LR_emo_outputs, _ = self.length_regulator(
inputs_emo_embedding, duration_targets*scale, masks=output_masks # *scale
)
LR_spk_outputs, _ = self.length_regulator(
inputs_spk_embedding, duration_targets*scale, masks=output_masks # *scale
)
else:
LR_text_outputs, LR_length_rounded = self.length_regulator(
inputs_text_embedding_aug, duration_predictions*scale, masks=output_masks # *scale
)
LR_position_embeddings = self.dur_position_encoder(
duration_predictions*scale, masks=output_masks # *target_rate
)
LR_emo_outputs, _ = self.length_regulator(
inputs_emo_embedding, duration_predictions*scale, masks=output_masks # *scale
)
LR_spk_outputs, _ = self.length_regulator(
inputs_spk_embedding, duration_predictions*scale, masks=output_masks # *scale
)
LR_text_outputs = LR_text_outputs + LR_position_embeddings
return (
LR_text_outputs,
LR_emo_outputs,
LR_spk_outputs, # [1,153,192]
LR_length_rounded,
log_duration_predictions,
pitch_predictions,
energy_predictions,
)
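# VarianceAdaptor2 mirrors VarianceAdaptor but multiplies the durations fed to the
# length regulator by `scale`, which works as a simple speaking-rate control:
# scale > 1.0 stretches every phone (slower speech), scale < 1.0 compresses it.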
class MelPNCADecoder(nn.Module):
def __init__(self, config):
super(MelPNCADecoder, self).__init__()
prenet_units = config["decoder_prenet_units"]
nb_layers = config["decoder_num_layers"]
nb_heads = config["decoder_num_heads"]
d_model = config["decoder_num_units"]
d_head = d_model // nb_heads
d_inner = config["decoder_ffn_inner_dim"]
dropout = config["decoder_dropout"]
dropout_attn = config["decoder_attention_dropout"]
dropout_relu = config["decoder_relu_dropout"]
outputs_per_step = config["outputs_per_step"]
d_mem = (
config["encoder_projection_units"] * outputs_per_step
+ config["emotion_units"]
+ config["speaker_units"]
)
d_mel = config["num_mels"]
self.d_mel = d_mel
self.r = outputs_per_step
self.nb_layers = nb_layers
self.mel_dec = HybridAttentionDecoder(
d_mel,
prenet_units,
nb_layers,
d_model,
d_mem,
nb_heads,
d_head,
d_inner,
dropout,
dropout_attn,
dropout_relu,
d_mel * outputs_per_step,
)
def forward(
self,
memory,
x_band_width,
h_band_width,
target=None,
masks=None,
return_attns=False,
):
batch_size = memory.size(0)
go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device)
if target is not None:
self.mel_dec.reset_state()
input = target[:, self.r - 1 :: self.r, :]
input = torch.cat([go_frame, input], dim=1)[:, :-1, :]
dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list = self.mel_dec(
input,
memory,
x_band_width,
h_band_width,
masks=masks,
return_attns=return_attns,
)
else:
dec_output = []
dec_pnca_attn_x_list = [[] for _ in range(self.nb_layers)]
dec_pnca_attn_h_list = [[] for _ in range(self.nb_layers)]
self.mel_dec.reset_state()
input = go_frame
for step in range(memory.size(1)):
(
dec_output_step,
dec_pnca_attn_x_step,
dec_pnca_attn_h_step,
) = self.mel_dec.infer(
step,
input,
memory,
x_band_width,
h_band_width,
masks=masks,
return_attns=return_attns,
)
input = dec_output_step[:, :, -self.d_mel :]
dec_output.append(dec_output_step)
for layer_id, (pnca_x_attn, pnca_h_attn) in enumerate(
zip(dec_pnca_attn_x_step, dec_pnca_attn_h_step)
):
left = memory.size(1) - pnca_x_attn.size(-1)
if left > 0:
padding = torch.zeros((pnca_x_attn.size(0), 1, left)).to(
pnca_x_attn
)
pnca_x_attn = torch.cat([pnca_x_attn, padding], dim=-1)
dec_pnca_attn_x_list[layer_id].append(pnca_x_attn)
dec_pnca_attn_h_list[layer_id].append(pnca_h_attn)
dec_output = torch.cat(dec_output, dim=1)
if return_attns:
for layer_id in range(self.nb_layers):
dec_pnca_attn_x_list[layer_id] = torch.cat(
dec_pnca_attn_x_list[layer_id], dim=1
)
dec_pnca_attn_h_list[layer_id] = torch.cat(
dec_pnca_attn_h_list[layer_id], dim=1
)
if return_attns:
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
else:
return dec_output
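# Note on the decoder above (descriptive): it emits r = outputs_per_step mel frames
# per step (d_out = d_mel * r). In training, target[:, r - 1 :: r, :] conditions each
# step on the last frame of the previous group; at inference, the last d_mel values
# of each step's output are fed back as the next input.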
class PostNet(nn.Module):
def __init__(self, config):
super(PostNet, self).__init__()
self.filter_size = config["postnet_filter_size"]
self.fsmn_num_layers = config["postnet_fsmn_num_layers"]
self.num_memory_units = config["postnet_num_memory_units"]
self.ffn_inner_dim = config["postnet_ffn_inner_dim"]
self.dropout = config["postnet_dropout"]
self.shift = config["postnet_shift"]
self.lstm_units = config["postnet_lstm_units"]
self.num_mels = config["num_mels"]
self.fsmn = FsmnEncoderV2(
self.filter_size,
self.fsmn_num_layers,
self.num_mels,
self.num_memory_units,
self.ffn_inner_dim,
self.dropout,
self.shift,
)
self.lstm = nn.LSTM(
self.num_memory_units, self.lstm_units, num_layers=1, batch_first=True
)
self.fc = nn.Linear(self.lstm_units, self.num_mels)
def forward(self, x, mask=None):
postnet_fsmn_output = self.fsmn(x, mask)
# The input could also be a packed variable-length sequence; we omit that here
# for simplicity, since the mask and the uni-directional LSTM make it unnecessary.
postnet_lstm_output, _ = self.lstm(postnet_fsmn_output)
mel_residual_output = self.fc(postnet_lstm_output)
return mel_residual_output
class FP_Predictor(nn.Module):
def __init__(self, config):
super(FP_Predictor, self).__init__()
self.w_1 = nn.Conv1d(
config["encoder_projection_units"],
config["embedding_dim"] // 2,
kernel_size=3,
padding=1,
)
self.w_2 = nn.Conv1d(
config["embedding_dim"] // 2,
config["encoder_projection_units"],
kernel_size=1,
padding=0,
)
self.layer_norm1 = nn.LayerNorm(config["embedding_dim"] // 2, eps=1e-6)
self.layer_norm2 = nn.LayerNorm(config["encoder_projection_units"], eps=1e-6)
self.dropout_inner = nn.Dropout(0.1)
self.dropout = nn.Dropout(0.1)
self.fc = nn.Linear(config["encoder_projection_units"], 4)
def forward(self, x):
x = x.transpose(1, 2)
x = F.relu(self.w_1(x))
x = x.transpose(1, 2)
x = self.dropout_inner(self.layer_norm1(x))
x = x.transpose(1, 2)
x = F.relu(self.w_2(x))
x = x.transpose(1, 2)
x = self.dropout(self.layer_norm2(x))
output = F.softmax(self.fc(x), dim=2)
return output
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
class SinusoidalPositionEncoder(nn.Module):
def __init__(self, max_len, depth):
super(SinusoidalPositionEncoder, self).__init__()
self.max_len = max_len
self.depth = depth
self.position_enc = nn.Parameter(
self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0),
requires_grad=False,
)
def forward(self, input):
bz_in, len_in, _ = input.size()
if len_in > self.max_len:
self.max_len = len_in
self.position_enc.data = (
self.get_sinusoid_encoding_table(self.max_len, self.depth)
.unsqueeze(0)
.to(input.device)
)
output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1)
return output
@staticmethod
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
""" Sinusoid position encoding table """
def cal_angle(position, hid_idx):
return position / np.power(10000, hid_idx / float(d_hid / 2 - 1))
def get_posi_angle_vec(position):
return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)]
scaled_time_table = np.array(
[get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)]
)
sinusoid_table = np.zeros((n_position, d_hid))
sinusoid_table[:, : d_hid // 2] = np.sin(scaled_time_table)
sinusoid_table[:, d_hid // 2 :] = np.cos(scaled_time_table)
if padding_idx is not None:
# zero vector for padding dimension
sinusoid_table[padding_idx] = 0.0
return torch.FloatTensor(sinusoid_table)
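# Layout of the table above (sketch, inferred from the code): for 1-based position
# pos and hidden index j < d_hid // 2,
#   table[pos - 1, j]              = sin(pos / 10000 ** (j / (d_hid / 2 - 1)))
#   table[pos - 1, j + d_hid // 2] = cos(pos / 10000 ** (j / (d_hid / 2 - 1)))
# i.e. the sin and cos halves are concatenated rather than interleaved as in the
# original Transformer encoding.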
class DurSinusoidalPositionEncoder(nn.Module):
def __init__(self, depth, outputs_per_step):
super(DurSinusoidalPositionEncoder, self).__init__()
self.depth = depth
self.outputs_per_step = outputs_per_step
inv_timescales = [
np.power(10000, 2 * (hid_idx // 2) / depth) for hid_idx in range(depth)
]
self.inv_timescales = nn.Parameter(
torch.FloatTensor(inv_timescales), requires_grad=False
)
def forward(self, durations, masks=None):
reps = (durations + 0.5).long()
output_lens = reps.sum(dim=1)
max_len = output_lens.max()
reps_cumsum = torch.cumsum(F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[
:, None, :
]
range_ = torch.arange(max_len).to(durations.device)[None, :, None]
mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_)
mult = mult.float()
offsets = torch.matmul(mult, reps_cumsum[:, 0, :-1].unsqueeze(-1)).squeeze(-1)
dur_pos = range_[:, :, 0] - offsets + 1
if masks is not None:
assert masks.size(1) == dur_pos.size(1)
dur_pos = dur_pos.masked_fill(masks, 0.0)
seq_len = dur_pos.size(1)
padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step
if padding < self.outputs_per_step:
dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0)
position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, None, :]
position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, 0::2])
position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, 1::2])
return position_embedding
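# Worked example for dur_pos above (illustrative): durations [[2, 3]] give an output
# length of 5 and dur_pos [[1, 2, 1, 2, 3]], a 1-based frame counter that restarts at
# every phone boundary before being turned into sinusoidal position embeddings.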
import torch
from distutils.version import LooseVersion
is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7")
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def get_mask_from_lengths(lengths, max_len=None):
batch_size = lengths.shape[0]
if max_len is None:
max_len = torch.max(lengths).item()
ids = (
torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device)
)
mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
return mask
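# Usage sketch (illustrative, not part of the original module): the mask is True at
# padded positions, e.g.
#   get_mask_from_lengths(torch.tensor([2, 3]), max_len=4)
#   -> [[False, False, True,  True],
#       [False, False, False, True]]
# which is the convention the encoder/decoder masks above rely on.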
import os
import numpy as np
from glob import glob
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import argparse
import yaml
import logging
from .core.utils import (
volume_normalize,
get_pitch,
get_energy,
align_length,
compute_mean,
compute_std,
f0_norm_mean_std,
norm_mean_std,
parse_interval_file,
average_by_duration,
encode_16bits,
)
from .core.dsp import (
melspectrogram,
load_wav,
trim_silence,
trim_silence_with_interval,
save_wav,
)
logging.basicConfig(
format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.DEBUG,
)
default_audio_config = {
# Preprocess
"wav_normalize": True,
"trim_silence": True,
"trim_silence_threshold_db": 60,
"preemphasize": False,
# Feature extraction
"sampling_rate": 24000,
"hop_length": 240,
"win_length": 1024,
"n_mels": 80,
"n_fft": 1024,
"fmin": 50.0,
"fmax": 7600.0,
"min_level_db": -100,
"ref_level_db": 20,
"phone_level_feature": True,
"num_workers": 16,
# Normalization
"norm_type": "mean_std", # 'mean_std', 'global norm'
"max_norm": 1.0,
"symmetric": False,
}
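# With these defaults, hop_length 240 at 24 kHz is a 10 ms frame shift and
# win_length / n_fft of 1024 samples is a ~42.7 ms analysis window.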
class AudioProcessor:
def __init__(self, config=None):
# TODO: Add more audio processing methods.
if not isinstance(config, dict):
logging.warning(
"[AudioProcessor] config is not a dict, fall into default config."
)
self.config = default_audio_config
else:
self.config = config
for key in self.config:
setattr(self, key, self.config[key])
self.min_wav_length = int(self.config["sampling_rate"] * 0.5)
self.badcase_list = []
self.pcm_dict = {}
self.mel_dict = {}
self.f0_dict = {}
self.uv_dict = {}
self.nccf_dict = {}
self.f0uv_dict = {}
self.energy_dict = {}
self.dur_dict = {}
logging.info("[AudioProcessor] Initialize AudioProcessor.")
logging.info("[AudioProcessor] config params:")
for key in self.config:
logging.info("[AudioProcessor] %s: %s", key, self.config[key])
def calibrate_SyllableDuration(
self, raw_dur_dir, raw_metafile, out_cali_duration_dir
):
with open(raw_metafile, "r") as f:
lines = f.readlines()
output_dur_dir = out_cali_duration_dir
os.makedirs(output_dur_dir, exist_ok=True)
for line in lines:
line = line.strip()
index, symbols = line.split("\t")
symbols = [
symbol.strip("{").strip("}").split("$")[0]
for symbol in symbols.strip().split(" ")
]
dur_file = os.path.join(raw_dur_dir, index + ".npy")
phone_file = os.path.join(raw_dur_dir, index + ".phone")
if not os.path.exists(dur_file) or not os.path.exists(phone_file):
logging.warning(
"[AudioProcessor] dur file or phone file not exists: %s", index
)
continue
with open(phone_file, "r") as f:
phones = f.readlines()
dur = np.load(dur_file)
cali_duration = []
dur_idx = 0
syll_idx = 0
while dur_idx < len(dur) and syll_idx < len(symbols):
if phones[dur_idx].strip() == "sil":
dur_idx += 1
continue
if phones[dur_idx].strip() == "sp" and symbols[syll_idx][0] != "#":
dur_idx += 1
continue
if symbols[syll_idx] in ["ga", "go", "ge"]:
cali_duration.append(0)
syll_idx += 1
# print("NONE", symbols[syll_idx], 0)
continue
if symbols[syll_idx][0] == "#":
if phones[dur_idx].strip() != "sp":
cali_duration.append(0)
# print("NONE", symbols[syll_idx], 0)
syll_idx += 1
continue
else:
cali_duration.append(dur[dur_idx])
# print(phones[dur_idx].strip(), symbols[syll_idx], dur[dur_idx])
dur_idx += 1
syll_idx += 1
continue
# A corresponding phone is found
cali_duration.append(dur[dur_idx])
# print(phones[dur_idx].strip(), symbols[syll_idx], dur[dur_idx])
dur_idx += 1
syll_idx += 1
# Add #4 phone duration
cali_duration.append(0)
if len(cali_duration) != len(symbols):
logging.error(
"[Duration Calibrating] Syllable duration {} is not equal to "
"the number of symbols {}, index: {}".format(
len(cali_duration), len(symbols), index
)
)
continue
# Align with mel frames
durs = np.array(cali_duration)
if len(self.mel_dict) > 0:
pair_mel = self.mel_dict.get(index, None)
if pair_mel is None:
logging.warning(
"[AudioProcessor] Interval file %s has no corresponding mel",
index,
)
continue
mel_frames = pair_mel.shape[0]
dur_frames = np.sum(durs)
if np.sum(durs) > mel_frames:
durs[-2] -= dur_frames - mel_frames
elif np.sum(durs) < mel_frames:
durs[-2] += mel_frames - np.sum(durs)
if durs[-2] < 0:
logging.error(
"[AudioProcessor] Duration calibrating failed for %s, mismatch frames %s",
index,
durs[-2],
)
self.badcase_list.append(index)
continue
self.dur_dict[index] = durs
np.save(os.path.join(output_dur_dir, index + ".npy"), self.dur_dict[index])
def amp_normalize(self, src_wav_dir, out_wav_dir):
if self.wav_normalize:
logging.info("[AudioProcessor] Amplitude normalization started")
os.makedirs(out_wav_dir, exist_ok=True)
res = volume_normalize(src_wav_dir, out_wav_dir)
logging.info("[AudioProcessor] Amplitude normalization finished")
return res
else:
logging.info("[AudioProcessor] No amplitude normalization")
os.symlink(src_wav_dir, out_wav_dir, target_is_directory=True)
return True
def get_pcm_dict(self, src_wav_dir):
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
if len(self.pcm_dict) > 0:
return self.pcm_dict
logging.info("[AudioProcessor] Start to load pcm from %s", src_wav_dir)
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_path in wav_list:
future = executor.submit(load_wav, wav_path, self.sampling_rate)
future.add_done_callback(lambda p: progress.update())
wav_name = os.path.splitext(os.path.basename(wav_path))[0]
futures.append((future, wav_name))
for future, wav_name in futures:
pcm = future.result()
if len(pcm) < self.min_wav_length:
logging.warning("[AudioProcessor] %s is too short, skip", wav_name)
self.badcase_list.append(wav_name)
continue
self.pcm_dict[wav_name] = pcm
return self.pcm_dict
def trim_silence_wav(self, src_wav_dir, out_wav_dir=None):
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
logging.info("[AudioProcessor] Trim silence started")
if out_wav_dir is None:
out_wav_dir = src_wav_dir
else:
os.makedirs(out_wav_dir, exist_ok=True)
pcm_dict = self.get_pcm_dict(src_wav_dir)
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
trim_silence,
pcm_data,
self.trim_silence_threshold_db,
self.hop_length,
self.win_length,
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
# TODO: multi-processing
for future, wav_basename in tqdm(futures):
pcm = future.result()
if len(pcm) < self.min_wav_length:
logging.warning("[AudioProcessor] %s is too short, skip", wav_basename)
self.badcase_list.append(wav_basename)
self.pcm_dict.pop(wav_basename)
continue
self.pcm_dict[wav_basename] = pcm
save_wav(
self.pcm_dict[wav_basename],
os.path.join(out_wav_dir, wav_basename + ".wav"),
self.sampling_rate,
)
logging.info("[AudioProcessor] Trim silence finished")
return True
def trim_silence_wav_with_interval(self, src_wav_dir, dur_dir, out_wav_dir=None):
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
logging.info("[AudioProcessor] Trim silence with interval started")
if out_wav_dir is None:
out_wav_dir = src_wav_dir
else:
os.makedirs(out_wav_dir, exist_ok=True)
pcm_dict = self.get_pcm_dict(src_wav_dir)
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
trim_silence_with_interval,
pcm_data,
self.dur_dict.get(wav_basename, None),
self.hop_length,
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
# TODO: multi-processing
for future, wav_basename in tqdm(futures):
trimed_pcm = future.result()
if trimed_pcm is None:
continue
if len(trimed_pcm) < self.min_wav_length:
logging.warning("[AudioProcessor] %s is too short, skip", wav_basename)
self.badcase_list.append(wav_basename)
self.pcm_dict.pop(wav_basename)
continue
self.pcm_dict[wav_basename] = trimed_pcm
save_wav(
self.pcm_dict[wav_basename],
os.path.join(out_wav_dir, wav_basename + ".wav"),
self.sampling_rate,
)
logging.info("[AudioProcessor] Trim silence finished")
return True
def mel_extract(self, src_wav_dir, out_feature_dir):
os.makedirs(out_feature_dir, exist_ok=True)
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
pcm_dict = self.get_pcm_dict(src_wav_dir)
logging.info("[AudioProcessor] Melspec extraction started")
# Get global normed mel spec
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
melspectrogram,
pcm_data,
self.sampling_rate,
self.n_fft,
self.hop_length,
self.win_length,
self.n_mels,
self.max_norm,
self.min_level_db,
self.ref_level_db,
self.fmin,
self.fmax,
self.symmetric,
self.preemphasize,
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Melspec extraction failed for %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
else:
melspec = result
self.mel_dict[wav_basename] = melspec
logging.info("[AudioProcessor] Melspec extraction finished")
# FIXME: is this step necessary?
# Do mean std norm on global-normed melspec
logging.info("Melspec statistic proceeding...")
mel_mean = compute_mean(list(self.mel_dict.values()), dims=self.n_mels)
mel_std = compute_std(list(self.mel_dict.values()), mel_mean, dims=self.n_mels)
logging.info("Melspec statistic done")
np.savetxt(os.path.join(out_feature_dir, "mel_mean.txt"), mel_mean, fmt="%.6f")
np.savetxt(os.path.join(out_feature_dir, "mel_std.txt"), mel_std, fmt="%.6f")
logging.info(
"[AudioProcessor] melspec mean and std saved to:\n{},\n{}".format(
os.path.join(out_feature_dir, "mel_mean.txt"),
os.path.join(out_feature_dir, "mel_std.txt"),
)
)
logging.info("[AudioProcessor] Melspec mean std norm is proceeding...")
for wav_basename in self.mel_dict:
melspec = self.mel_dict[wav_basename]
norm_melspec = norm_mean_std(melspec, mel_mean, mel_std)
np.save(os.path.join(out_feature_dir, wav_basename + ".npy"), norm_melspec)
logging.info("[AudioProcessor] Melspec normalization finished")
logging.info("[AudioProcessor] Normed Melspec saved to %s", out_feature_dir)
return True
# TODO: some datasets may have no interval label
def duration_generate(self, src_interval_dir, out_feature_dir):
os.makedirs(out_feature_dir, exist_ok=True)
interval_list = glob(os.path.join(src_interval_dir, "*.interval"))
logging.info("[AudioProcessor] Duration generation started")
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(interval_list)
) as progress:
futures = []
for interval_file_path in interval_list:
future = executor.submit(
parse_interval_file,
interval_file_path,
self.sampling_rate,
self.hop_length,
)
future.add_done_callback(lambda p: progress.update())
futures.append(
(future, os.path.splitext(os.path.basename(interval_file_path))[0])
)
logging.info("[AudioProcessor] Duration align with mel is proceeding...")
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Duration generate failed for %s", wav_basename
)
self.badcase_list.append(wav_basename)
else:
durs, phone_list = result
# Align length with melspec
if len(self.mel_dict) > 0:
pair_mel = self.mel_dict.get(wav_basename, None)
if pair_mel is None:
logging.warning(
"[AudioProcessor] Interval file %s has no corresponding mel",
wav_basename,
)
continue
mel_frames = pair_mel.shape[0]
dur_frames = np.sum(durs)
if np.sum(durs) > mel_frames:
durs[-1] -= dur_frames - mel_frames
elif np.sum(durs) < mel_frames:
durs[-1] += mel_frames - np.sum(durs)
if durs[-1] < 0:
logging.error(
"[AudioProcessor] Duration align failed for %s, mismatch frames %s",
wav_basename,
durs[-1],
)
self.badcase_list.append(wav_basename)
continue
self.dur_dict[wav_basename] = durs
np.save(os.path.join(out_feature_dir, wav_basename + ".npy"), durs)
with open(
os.path.join(out_feature_dir, wav_basename + ".phone"), "w"
) as f:
f.write("\n".join(phone_list))
logging.info("[AudioProcessor] Duration generate finished")
return True
def pitch_extract(
self, src_wav_dir, out_f0_dir, out_frame_f0_dir, out_frame_uv_dir
):
os.makedirs(out_f0_dir, exist_ok=True)
os.makedirs(out_frame_f0_dir, exist_ok=True)
os.makedirs(out_frame_uv_dir, exist_ok=True)
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
pcm_dict = self.get_pcm_dict(src_wav_dir)
mel_dict = self.mel_dict
logging.info("[AudioProcessor] Pitch extraction started")
# Get raw pitch
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
get_pitch,
encode_16bits(pcm_data),
self.sampling_rate,
self.hop_length,
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
logging.info("[AudioProcessor] Pitch align with mel is proceeding...")
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Pitch extraction failed for %s", wav_basename
)
self.badcase_list.append(wav_basename)
else:
f0, uv, f0uv = result
if len(mel_dict) > 0:
f0 = align_length(f0, mel_dict.get(wav_basename, None))
uv = align_length(uv, mel_dict.get(wav_basename, None))
f0uv = align_length(f0uv, mel_dict.get(wav_basename, None))
if f0 is None or uv is None or f0uv is None:
logging.warning(
"[AudioProcessor] Pitch length mismatch with mel in %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
continue
self.f0_dict[wav_basename] = f0
self.uv_dict[wav_basename] = uv
self.f0uv_dict[wav_basename] = f0uv
# Normalize f0
logging.info("[AudioProcessor] Pitch normalization is proceeding...")
f0_mean = compute_mean(list(self.f0uv_dict.values()), dims=1)
f0_std = compute_std(list(self.f0uv_dict.values()), f0_mean, dims=1)
np.savetxt(os.path.join(out_f0_dir, "f0_mean.txt"), f0_mean, fmt="%.6f")
np.savetxt(os.path.join(out_f0_dir, "f0_std.txt"), f0_std, fmt="%.6f")
logging.info(
"[AudioProcessor] f0 mean and std saved to:\n{},\n{}".format(
os.path.join(out_f0_dir, "f0_mean.txt"),
os.path.join(out_f0_dir, "f0_std.txt"),
)
)
logging.info("[AudioProcessor] Pitch mean std norm is proceeding...")
for wav_basename in self.f0uv_dict:
f0 = self.f0uv_dict[wav_basename]
norm_f0 = f0_norm_mean_std(f0, f0_mean, f0_std)
self.f0uv_dict[wav_basename] = norm_f0
for wav_basename in self.f0_dict:
f0 = self.f0_dict[wav_basename]
norm_f0 = f0_norm_mean_std(f0, f0_mean, f0_std)
self.f0_dict[wav_basename] = norm_f0
# save frame f0 to a specific dir
for wav_basename in self.f0_dict:
np.save(
os.path.join(out_frame_f0_dir, wav_basename + ".npy"),
self.f0_dict[wav_basename].reshape(-1),
)
for wav_basename in self.uv_dict:
np.save(
os.path.join(out_frame_uv_dir, wav_basename + ".npy"),
self.uv_dict[wav_basename].reshape(-1),
)
# phone level average
# if there is no duration then save the frame-level f0
if self.phone_level_feature and len(self.dur_dict) > 0:
logging.info("[AudioProcessor] Pitch turn to phone-level is proceeding...")
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(self.f0uv_dict)
) as progress:
futures = []
for wav_basename in self.f0uv_dict:
future = executor.submit(
average_by_duration,
self.f0uv_dict.get(wav_basename, None),
self.dur_dict.get(wav_basename, None),
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Pitch extraction failed in phone level avg for: %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
else:
avg_f0 = result
self.f0uv_dict[wav_basename] = avg_f0
for wav_basename in self.f0uv_dict:
np.save(
os.path.join(out_f0_dir, wav_basename + ".npy"),
self.f0uv_dict[wav_basename].reshape(-1),
)
logging.info("[AudioProcessor] Pitch normalization finished")
logging.info("[AudioProcessor] Normed f0 saved to %s", out_f0_dir)
logging.info("[AudioProcessor] Pitch extraction finished")
return True
def energy_extract(self, src_wav_dir, out_energy_dir, out_frame_energy_dir):
os.makedirs(out_energy_dir, exist_ok=True)
os.makedirs(out_frame_energy_dir, exist_ok=True)
wav_list = glob(os.path.join(src_wav_dir, "*.wav"))
pcm_dict = self.get_pcm_dict(src_wav_dir)
mel_dict = self.mel_dict
logging.info("[AudioProcessor] Energy extraction started")
# Get raw energy
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(wav_list)
) as progress:
futures = []
for wav_basename, pcm_data in pcm_dict.items():
future = executor.submit(
get_energy, pcm_data, self.hop_length, self.win_length, self.n_fft
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Energy extraction failed for %s", wav_basename
)
self.badcase_list.append(wav_basename)
else:
energy = result
if len(mel_dict) > 0:
energy = align_length(energy, mel_dict.get(wav_basename, None))
if energy is None:
logging.warning(
"[AudioProcessor] Energy length mismatch with mel in %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
continue
self.energy_dict[wav_basename] = energy
# Normalize energy
energy_mean = compute_mean(list(self.energy_dict.values()), dims=1)
energy_std = compute_std(list(self.energy_dict.values()), energy_mean, dims=1)
np.savetxt(
os.path.join(out_energy_dir, "energy_mean.txt"), energy_mean, fmt="%.6f"
)
np.savetxt(
os.path.join(out_energy_dir, "energy_std.txt"), energy_std, fmt="%.6f"
)
logging.info(
"[AudioProcessor] energy mean and std saved to:\n{},\n{}".format(
os.path.join(out_energy_dir, "energy_mean.txt"),
os.path.join(out_energy_dir, "energy_std.txt"),
)
)
logging.info("[AudioProcessor] Energy mean std norm is proceeding...")
for wav_basename in self.energy_dict:
energy = self.energy_dict[wav_basename]
norm_energy = f0_norm_mean_std(energy, energy_mean, energy_std)
self.energy_dict[wav_basename] = norm_energy
# save frame energy to a specific dir
for wav_basename in self.energy_dict:
np.save(
os.path.join(out_frame_energy_dir, wav_basename + ".npy"),
self.energy_dict[wav_basename].reshape(-1),
)
# phone level average
# if there is no duration then save the frame-level energy
if self.phone_level_feature and len(self.dur_dict) > 0:
with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
total=len(self.energy_dict)
) as progress:
futures = []
for wav_basename in self.energy_dict:
future = executor.submit(
average_by_duration,
self.energy_dict.get(wav_basename, None),
self.dur_dict.get(wav_basename, None),
)
future.add_done_callback(lambda p: progress.update())
futures.append((future, wav_basename))
# TODO: remove failed entries from the dict
for future, wav_basename in futures:
result = future.result()
if result is None:
logging.warning(
"[AudioProcessor] Energy extraction failed in phone level avg for: %s",
wav_basename,
)
self.badcase_list.append(wav_basename)
else:
avg_energy = result
self.energy_dict[wav_basename] = avg_energy
for wav_basename in self.energy_dict:
np.save(
os.path.join(out_energy_dir, wav_basename + ".npy"),
self.energy_dict[wav_basename].reshape(-1),
)
logging.info("[AudioProcessor] Energy normalization finished")
logging.info("[AudioProcessor] Normed Energy saved to %s", out_energy_dir)
logging.info("[AudioProcessor] Energy extraction finished")
return True
def process(self, src_voice_dir, out_data_dir, aux_metafile=None):
succeed = True
raw_wav_dir = os.path.join(src_voice_dir, "wav")
src_interval_dir = os.path.join(src_voice_dir, "interval")
out_mel_dir = os.path.join(out_data_dir, "mel")
out_f0_dir = os.path.join(out_data_dir, "f0")
out_frame_f0_dir = os.path.join(out_data_dir, "frame_f0")
out_frame_uv_dir = os.path.join(out_data_dir, "frame_uv")
out_energy_dir = os.path.join(out_data_dir, "energy")
out_frame_energy_dir = os.path.join(out_data_dir, "frame_energy")
out_duration_dir = os.path.join(out_data_dir, "raw_duration")
out_cali_duration_dir = os.path.join(out_data_dir, "duration")
os.makedirs(out_data_dir, exist_ok=True)
with_duration = os.path.exists(src_interval_dir)
# TODO: to resume from previous process, a log file is needed
train_wav_dir = os.path.join(out_data_dir, "wav")
succeed = self.amp_normalize(raw_wav_dir, train_wav_dir)
if not succeed:
logging.error("[AudioProcessor] amp_normalize failed, exit")
return False
if with_duration:
# Raw duration, non-trimmed
succeed = self.duration_generate(src_interval_dir, out_duration_dir)
if not succeed:
logging.error("[AudioProcessor] duration_generate failed, exit")
return False
if self.trim_silence:
if with_duration:
succeed = self.trim_silence_wav_with_interval(
train_wav_dir, out_duration_dir
)
if not succeed:
logging.error(
"[AudioProcessor] trim_silence_wav_with_interval failed, exit"
)
return False
else:
succeed = self.trim_silence_wav(train_wav_dir)
if not succeed:
logging.error("[AudioProcessor] trim_silence_wav failed, exit")
return False
succeed = self.mel_extract(train_wav_dir, out_mel_dir)
if not succeed:
logging.error("[AudioProcessor] mel_extract failed, exit")
return False
if aux_metafile is not None and with_duration:
self.calibrate_SyllableDuration(
out_duration_dir, aux_metafile, out_cali_duration_dir
)
succeed = self.pitch_extract(
train_wav_dir, out_f0_dir, out_frame_f0_dir, out_frame_uv_dir
)
if not succeed:
logging.error("[AudioProcessor] pitch_extract failed, exit")
return False
succeed = self.energy_extract(
train_wav_dir, out_energy_dir, out_frame_energy_dir
)
if not succeed:
logging.error("[AudioProcessor] energy_extract failed, exit")
return False
# recording badcase list
with open(os.path.join(out_data_dir, "badlist.txt"), "w") as f:
f.write("\n".join(self.badcase_list))
logging.info("[AudioProcessor] All features extracted successfully!")
return succeed
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Audio Processor")
parser.add_argument("--src_voice_dir", type=str, required=True)
parser.add_argument("--out_data_dir", type=str, required=True)
parser.add_argument("--config", type=str, default=None)
args = parser.parse_args()
if args.config is not None:
with open(args.config, "r") as f:
config = yaml.load(f, Loader=yaml.Loader)
ap = AudioProcessor(config["audio_config"])
else:
# Fall back to the default audio config when --config is not given
# (assumption: this matches the config=None branch in AudioProcessor.__init__).
ap = AudioProcessor()
ap.process(args.src_voice_dir, args.out_data_dir)
import numpy as np
import librosa
import librosa.filters
from scipy.io import wavfile
from scipy import signal
def _stft(y, hop_length, win_length, n_fft):
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
def _istft(y, hop_length, win_length):
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
def _db_to_amp(x):
return np.power(10.0, x * 0.05)
def _amp_to_db(x):
return 20 * np.log10(np.maximum(1e-5, x))
def load_wav(path, sr):
return librosa.load(path, sr=sr)[0]
def save_wav(wav, path, sr):
if wav.dtype == np.float32 or wav.dtype == np.float64:
quant_wav = 32767 * wav
else:
quant_wav = wav
# maximize the volume to avoid clipping
# wav *= 32767 / max(0.01, np.max(np.abs(wav)))
wavfile.write(path, sr, quant_wav.astype(np.int16))
def trim_silence(wav, top_db, hop_length, win_length):
trimed_wav, _ = librosa.effects.trim(
wav, top_db=top_db, frame_length=win_length, hop_length=hop_length
)
return trimed_wav
def trim_silence_with_interval(wav, interval, hop_length):
if interval is None:
return None
leading_sil = interval[0]
tailing_sil = interval[-1]
trim_wav = wav[leading_sil * hop_length : -tailing_sil * hop_length]
return trim_wav
def preemphasis(wav, k=0.98, preemphasize=False):
if preemphasize:
return signal.lfilter([1, -k], [1], wav)
return wav
def inv_preemphasis(wav, k=0.98, inv_preemphasize=False):
if inv_preemphasize:
return signal.lfilter([1], [1, -k], wav)
return wav
def _normalize(S, max_norm=1.0, min_level_db=-100, symmetric=False):
if symmetric:
return np.clip(
(2 * max_norm) * ((S - min_level_db) / (-min_level_db)) - max_norm,
-max_norm,
max_norm,
)
else:
return np.clip(max_norm * ((S - min_level_db) / (-min_level_db)), 0, max_norm)
def _denormalize(D, max_norm=1.0, min_level_db=-100, symmetric=False):
if symmetric:
return (
(np.clip(D, -max_norm, max_norm) + max_norm)
* -min_level_db
/ (2 * max_norm)
) + min_level_db
else:
return (np.clip(D, 0, max_norm) * -min_level_db / max_norm) + min_level_db
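# Worked example (illustrative): with max_norm=1.0, min_level_db=-100 and
# symmetric=False, a -50 dB level gives _normalize(-50) = 0.5 and
# _denormalize(0.5) = -50, so the two functions are inverses on [min_level_db, 0].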
def _griffin_lim(S, n_fft, hop_length, win_length, griffin_lim_iters=60):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex128)  # np.complex alias was removed in recent NumPy
y = _istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
for i in range(griffin_lim_iters):
angles = np.exp(
1j
* np.angle(
_stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
)
)
y = _istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
return y
def spectrogram(
y,
n_fft=1024,
hop_length=256,
win_length=1024,
max_norm=1.0,
min_level_db=-100,
ref_level_db=20,
symmetric=False,
):
D = _stft(preemphasis(y), hop_length, win_length, n_fft)
S = _amp_to_db(np.abs(D)) - ref_level_db
return _normalize(S, max_norm, min_level_db, symmetric)
def inv_spectrogram(
spectrogram,
n_fft=1024,
hop_length=256,
win_length=1024,
max_norm=1.0,
min_level_db=-100,
ref_level_db=20,
symmetric=False,
power=1.5,
):
S = _db_to_amp(
_denormalize(spectrogram, max_norm, min_level_db, symmetric) + ref_level_db
)
return _griffin_lim(S ** power, n_fft, hop_length, win_length)
def _build_mel_basis(sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80):
assert fmax <= sample_rate // 2
return librosa.filters.mel(
sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
)
# mel <-> linear conversions
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis(sample_rate, n_fft, fmin, fmax, n_mels)
return np.dot(_mel_basis, spectogram)
def _mel_to_linear(
mel_spectrogram, sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80
):
global _inv_mel_basis
if _inv_mel_basis is None:
_inv_mel_basis = np.linalg.pinv(
_build_mel_basis(sample_rate, n_fft, fmin, fmax, n_mels)
)
return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def melspectrogram(
y,
sample_rate,
n_fft=1024,
hop_length=256,
win_length=1024,
n_mels=80,
max_norm=1.0,
min_level_db=-100,
ref_level_db=20,
fmin=50,
fmax=8000,
symmetric=False,
preemphasize=False,
):
D = _stft(
preemphasis(y, preemphasize=preemphasize),
hop_length=hop_length,
win_length=win_length,
n_fft=n_fft,
)
S = (
_amp_to_db(
_linear_to_mel(
np.abs(D),
sample_rate=sample_rate,
n_fft=n_fft,
fmin=fmin,
fmax=fmax,
n_mels=n_mels,
)
)
- ref_level_db
)
return _normalize(
S, max_norm=max_norm, min_level_db=min_level_db, symmetric=symmetric
).T
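# Sketch of the melspectrogram pipeline above: optional pre-emphasis -> STFT ->
# mel filterbank (_linear_to_mel) -> amplitude-to-dB -> subtract ref_level_db ->
# _normalize into [0, max_norm] (or [-max_norm, max_norm] if symmetric) ->
# transpose to shape (frames, n_mels).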
def inv_mel_spectrogram(
mel_spectrogram,
sample_rate,
n_fft=1024,
hop_length=256,
win_length=1024,
n_mels=80,
max_norm=1.0,
min_level_db=-100,
ref_level_db=20,
fmin=50,
fmax=8000,
power=1.5,
symmetric=False,
preemphasize=False,
):
D = _denormalize(
mel_spectrogram,
max_norm=max_norm,
min_level_db=min_level_db,
symmetric=symmetric,
)
S = _mel_to_linear(
_db_to_amp(D + ref_level_db),
sample_rate=sample_rate,
n_fft=n_fft,
fmin=fmin,
fmax=fmax,
n_mels=n_mels,
)
return inv_preemphasis(
_griffin_lim(S ** power, n_fft, hop_length, win_length),
preemphasize=preemphasize,
)
import os
from glob import glob
import numpy as np
import sox
import librosa
import pysptk
from scipy.io import wavfile
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import logging
from .dsp import _stft
anchor_hist = np.array(
[
0.0,
0.00215827,
0.00354383,
0.00442313,
0.00490274,
0.00532907,
0.00602185,
0.00690115,
0.00810019,
0.00948574,
0.0120437,
0.01489475,
0.01873168,
0.02302158,
0.02872369,
0.03669065,
0.04636291,
0.05843325,
0.07700506,
0.11052491,
0.16802558,
0.25997868,
0.37942979,
0.50730083,
0.62006395,
0.71092459,
0.76877165,
0.80762057,
0.83458566,
0.85672795,
0.87660538,
0.89251266,
0.90578204,
0.91569411,
0.92541966,
0.93383959,
0.94162004,
0.94940048,
0.95539568,
0.96136424,
0.9670397,
0.97290168,
0.97705835,
0.98116174,
0.98465228,
0.98814282,
0.99152678,
0.99421796,
0.9965894,
0.99840128,
1.0,
]
)
anchor_bins = np.array(
[
0.033976,
0.03529014,
0.03660428,
0.03791842,
0.03923256,
0.0405467,
0.04186084,
0.04317498,
0.04448912,
0.04580326,
0.0471174,
0.04843154,
0.04974568,
0.05105982,
0.05237396,
0.0536881,
0.05500224,
0.05631638,
0.05763052,
0.05894466,
0.0602588,
0.06157294,
0.06288708,
0.06420122,
0.06551536,
0.0668295,
0.06814364,
0.06945778,
0.07077192,
0.07208606,
0.0734002,
0.07471434,
0.07602848,
0.07734262,
0.07865676,
0.0799709,
0.08128504,
0.08259918,
0.08391332,
0.08522746,
0.0865416,
0.08785574,
0.08916988,
0.09048402,
0.09179816,
0.0931123,
0.09442644,
0.09574058,
0.09705472,
0.09836886,
0.099683,
]
)
hist_bins = 50
def amp_info(wav_file_path):
"""
Returns the amplitude info of the wav file.
"""
stats = sox.file_info.stat(wav_file_path)
amp_rms = stats["RMS amplitude"]
amp_max = stats["Maximum amplitude"]
amp_mean = stats["Mean amplitude"]
length = stats["Length (seconds)"]
return {
"amp_rms": amp_rms,
"amp_max": amp_max,
"amp_mean": amp_mean,
"length": length,
"basename": os.path.basename(wav_file_path),
}
# TODO: multi-processing
def statistic_amplitude(src_wav_dir):
"""
Returns the amplitude info of the wav file.
"""
wav_lst = glob(os.path.join(src_wav_dir, "*.wav"))
with ProcessPoolExecutor(max_workers=8) as executor, tqdm(
total=len(wav_lst)
) as progress:
futures = []
for wav_file_path in wav_lst:
future = executor.submit(amp_info, wav_file_path)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
amp_info_lst = [future.result() for future in futures]
amp_info_lst = sorted(amp_info_lst, key=lambda x: x["amp_rms"])
logging.info(
"Average amplitude RMS : {}".format(
np.mean([x["amp_rms"] for x in amp_info_lst])
)
)
# cnt = len(amp_info_lst)
#
# pinhead_cnt = math.floor(cnt * 0.01)
#
# return amp_info_lst[pinhead_cnt : cnt - pinhead_cnt]
return amp_info_lst
# TODO: multi process
def volume_normalize(src_wav_dir, out_wav_dir):
logging.info("Volume statistic proceeding...")
amp_info_lst = statistic_amplitude(src_wav_dir)
logging.info("Volume statistic done.")
rms_amp_lst = [x["amp_rms"] for x in amp_info_lst]
src_hist, src_bins = np.histogram(rms_amp_lst, bins=hist_bins, density=True)
src_hist = src_hist / np.sum(src_hist)
src_hist = np.cumsum(src_hist)
src_hist = np.insert(src_hist, 0, 0.0)
logging.info("Volume normalization proceeding...")
for amp_info in tqdm(amp_info_lst):
rms_amp = amp_info["amp_rms"]
rms_amp = np.clip(rms_amp, src_bins[0], src_bins[-1])
src_idx = np.where(rms_amp >= src_bins)[0][-1]
src_pos = src_hist[src_idx]
anchor_idx = np.where(src_pos >= anchor_hist)[0][-1]
if src_idx == hist_bins or anchor_idx == hist_bins:
rms_amp = anchor_bins[-1]
else:
rms_amp = (rms_amp - src_bins[src_idx]) / (
src_bins[src_idx + 1] - src_bins[src_idx]
) * (anchor_bins[anchor_idx + 1] - anchor_bins[anchor_idx]) + anchor_bins[
anchor_idx
]
scale = rms_amp / amp_info["amp_rms"]
        # FIXME: This is a hack to avoid sound clipping.
sr, data = wavfile.read(os.path.join(src_wav_dir, amp_info["basename"]))
wavfile.write(
os.path.join(out_wav_dir, amp_info["basename"]),
sr,
(data * scale).astype(np.int16),
)
logging.info("Volume normalization done.")
return True
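# volume_normalize() is histogram matching on per-utterance RMS amplitude:
# each utterance's RMS is mapped through the corpus CDF (src_hist/src_bins)
# onto the fixed anchor distribution (anchor_hist/anchor_bins), and the
# waveform is rescaled by target_rms / original_rms.  A minimal sketch of
# that mapping step, factored out for clarity (the helper name is
# illustrative and not used by the pipeline):
def _match_rms_to_anchor(rms_amp, src_hist, src_bins):
    rms_amp = np.clip(rms_amp, src_bins[0], src_bins[-1])
    src_idx = np.where(rms_amp >= src_bins)[0][-1]
    src_pos = src_hist[src_idx]
    anchor_idx = np.where(src_pos >= anchor_hist)[0][-1]
    if src_idx == hist_bins or anchor_idx == hist_bins:
        return anchor_bins[-1]
    # linear interpolation inside the matched anchor bin
    frac = (rms_amp - src_bins[src_idx]) / (src_bins[src_idx + 1] - src_bins[src_idx])
    return anchor_bins[anchor_idx] + frac * (
        anchor_bins[anchor_idx + 1] - anchor_bins[anchor_idx]
    )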
def interp_f0(f0_data):
"""
    Linearly interpolate F0 over unvoiced (zero) frames.
"""
f0_data[f0_data < 1] = 0
xp = np.nonzero(f0_data)
yp = f0_data[xp]
x = np.arange(f0_data.size)
contour_f0 = np.interp(x, xp[0], yp).astype(np.float32)
return contour_f0
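# Worked example for interp_f0 (values are hypothetical): zero frames are
# treated as unvoiced and filled by linear interpolation between surrounding
# voiced frames; leading/trailing zeros take the nearest voiced value.
def _interp_f0_example():
    f0 = np.array([0.0, 100.0, 0.0, 0.0, 200.0, 0.0], dtype=np.float32)
    # -> approximately [100.0, 100.0, 133.3, 166.7, 200.0, 200.0]
    return interp_f0(f0)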
def frame_nccf(x, y):
norm_coef = (np.sum(x ** 2.0) * np.sum(y ** 2.0) + 1e-30) ** 0.5
return (np.sum(x * y) / norm_coef + 1.0) / 2.0
def get_nccf(pcm_data, f0, min_f0=40, max_f0=800, fs=160, sr=16000):
if pcm_data.dtype == np.int16:
pcm_data = pcm_data.astype(np.float32) / 32768
frame_len = int(sr / 200)
frame_num = int(len(pcm_data) // fs)
frame_num = min(frame_num, len(f0))
pad_len = int(sr / min_f0) + frame_len
pad_zeros = np.zeros([pad_len], dtype=np.float32)
data = np.hstack((pad_zeros, pcm_data.astype(np.float32), pad_zeros))
nccf = np.zeros((frame_num), dtype=np.float32)
for i in range(frame_num):
curr_f0 = np.clip(f0[i], min_f0, max_f0)
lag = int(sr / curr_f0 + 0.5)
j = i * fs + pad_len - frame_len // 2
l_data = data[j : j + frame_len]
l_data -= l_data.mean()
r_data = data[j + lag : j + lag + frame_len]
r_data -= r_data.mean()
nccf[i] = frame_nccf(l_data, r_data)
return nccf
def smooth(data, win_len):
if win_len % 2 == 0:
win_len += 1
hwin = win_len // 2
win = np.hanning(win_len)
win /= win.sum()
data = data.reshape([-1])
pad_data = np.pad(data, hwin, mode="edge")
for i in range(data.shape[0]):
data[i] = np.dot(win, pad_data[i : i + win_len])
return data.reshape([-1, 1])
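# smooth() applies a normalized Hanning window (length forced odd) with edge
# padding, so the output keeps the input length.  Illustrative call with
# hypothetical values:
def _smooth_example():
    noisy = np.array([1.0, 5.0, 1.0, 5.0, 1.0], dtype=np.float32)
    return smooth(noisy.copy(), win_len=5)  # shape (5, 1), ripples attenuated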
# TODO: pysptk currently supports only two F0 estimation methods.
# supported: rapt, swipe
# unsupported: reaper, world (DIO)
def RAPT_FUNC(v1, v2, v3, v4, v5):
return pysptk.sptk.rapt(v1.astype(np.float32), fs=v2, hopsize=v3, min=v4, max=v5)
def SWIPE_FUNC(v1, v2, v3, v4, v5):
return pysptk.sptk.swipe(v1.astype(np.float64), fs=v2, hopsize=v3, min=v4, max=v5)
def PYIN_FUNC(v1, v2, v3, v4, v5):
f0_mel = librosa.pyin(
v1.astype(np.float32), sr=v2, frame_length=v3 * 4, fmin=v4, fmax=v5
)[0]
f0_mel = np.where(np.isnan(f0_mel), 0.0, f0_mel)
return f0_mel
def get_pitch(pcm_data, sampling_rate=16000, hop_length=160):
log_f0_list = []
uv_list = []
low, high = 40, 800
cali_f0 = pysptk.sptk.rapt(
pcm_data.astype(np.float32),
fs=sampling_rate,
hopsize=hop_length,
min=low,
max=high,
)
f0_range = np.sort(np.unique(cali_f0))
if len(f0_range) > 20:
low = max(f0_range[10] - 50, low)
high = min(f0_range[-10] + 50, high)
func_dict = {"rapt": RAPT_FUNC, "swipe": SWIPE_FUNC}
for func_name in func_dict:
f0 = func_dict[func_name](pcm_data, sampling_rate, hop_length, low, high)
uv = f0 > 0
if len(f0) < 10 or f0.max() < low:
logging.error("{} method: calc F0 is too low.".format(func_name))
continue
else:
f0 = np.clip(f0, 1e-30, high)
log_f0 = np.log(f0)
contour_log_f0 = interp_f0(log_f0)
log_f0_list.append(contour_log_f0)
uv_list.append(uv)
if len(log_f0_list) == 0:
logging.error("F0 estimation failed.")
return None
min_len = float("inf")
for log_f0 in log_f0_list:
min_len = min(min_len, log_f0.shape[0])
multi_log_f0 = np.zeros([len(log_f0_list), min_len], dtype=np.float32)
multi_uv = np.zeros([len(log_f0_list), min_len], dtype=np.float32)
for i in range(len(log_f0_list)):
multi_log_f0[i, :] = log_f0_list[i][:min_len]
multi_uv[i, :] = uv_list[i][:min_len]
log_f0 = smooth(np.median(multi_log_f0, axis=0), 5)
uv = (smooth(np.median(multi_uv, axis=0), 5) > 0.5).astype(np.float32)
f0 = np.exp(log_f0)
# nccf = get_nccf(
# pcm_data, f0, min_f0=low, max_f0=high, fs=hop_length, sr=sampling_rate
# )
min_len = min(f0.shape[0], uv.shape[0])
return f0[:min_len], uv[:min_len], f0[:min_len] * uv[:min_len]
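# get_pitch() runs several F0 estimators (rapt and swipe), interpolates each
# log-F0 contour over unvoiced frames, takes the frame-wise median across
# estimators, smooths it, and returns (continuous f0, voiced/unvoiced mask,
# f0 with unvoiced frames zeroed).  A minimal call, assuming ``pcm`` is a
# mono 16 kHz waveform (hypothetical variable):
def _pitch_example(pcm):
    res = get_pitch(pcm, sampling_rate=16000, hop_length=160)
    if res is None:  # estimation can fail on silent or very short audio
        return None
    f0, uv, voiced_f0 = res
    return voiced_f0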
# TODO: some DSP functions are not implemented.
def get_energy(pcm_data, hop_length, win_length, n_fft):
D = _stft(pcm_data, hop_length, win_length, n_fft)
S, _ = librosa.magphase(D)
energy = np.sqrt(np.sum(S ** 2, axis=0))
return energy.reshape((-1, 1))
def align_length(in_data, tgt_data, basename=None):
if in_data is None or tgt_data is None:
logging.error("{}: Input data is None.".format(basename))
return None
in_len = in_data.shape[0]
tgt_len = tgt_data.shape[0]
if abs(in_len - tgt_len) > 20:
        logging.error(
            "{}: Input length differs from target length by more than 20 frames.".format(
                basename
            )
        )
return None
if in_len < tgt_len:
out_data = np.pad(
in_data, ((0, tgt_len - in_len), (0, 0)), "constant", constant_values=0.0
)
else:
out_data = in_data[:tgt_len]
return out_data
def compute_mean(data_list, dims=80):
mean_vector = np.zeros((1, dims))
all_frame_number = 0
for data in tqdm(data_list):
if data is None:
continue
features = data.reshape((-1, dims))
current_frame_number = np.shape(features)[0]
mean_vector += np.sum(features[:, :], axis=0)
all_frame_number += current_frame_number
mean_vector /= float(all_frame_number)
return mean_vector
def compute_std(data_list, mean_vector, dims=80):
std_vector = np.zeros((1, dims))
all_frame_number = 0
for data in tqdm(data_list):
if data is None:
continue
features = data.reshape((-1, dims))
current_frame_number = np.shape(features)[0]
mean_matrix = np.tile(mean_vector, (current_frame_number, 1))
std_vector += np.sum((features[:, :] - mean_matrix) ** 2, axis=0)
all_frame_number += current_frame_number
std_vector /= float(all_frame_number)
std_vector = std_vector ** 0.5
return std_vector
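# compute_mean()/compute_std() accumulate frame-level statistics across
# variable-length utterances, skipping None entries from failed extractions.
# Typical usage for 80-dim mel features (the list contents are hypothetical):
def _feature_stats_example(mel_list):
    mean = compute_mean(mel_list, dims=80)
    std = compute_std(mel_list, mean, dims=80)
    return mean, std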
F0_MIN = 0.0
F0_MAX = 800.0
ENERGY_MIN = 0.0
ENERGY_MAX = 200.0
CLIP_FLOOR = 1e-3
def f0_norm_min_max(f0):
zero_idxs = np.where(f0 <= CLIP_FLOOR)[0]
res = (2 * f0 - F0_MIN - F0_MAX) / (F0_MAX - F0_MIN)
res[zero_idxs] = 0.0
return res
def f0_denorm_min_max(f0):
zero_idxs = np.where(f0 == 0.0)[0]
res = (f0 * (F0_MAX - F0_MIN) + F0_MIN + F0_MAX) / 2
res[zero_idxs] = 0.0
return res
def energy_norm_min_max(energy):
zero_idxs = np.where(energy == 0.0)[0]
res = (2 * energy - ENERGY_MIN - ENERGY_MAX) / (ENERGY_MAX - ENERGY_MIN)
res[zero_idxs] = 0.0
return res
def energy_denorm_min_max(energy):
zero_idxs = np.where(energy == 0.0)[0]
res = (energy * (ENERGY_MAX - ENERGY_MIN) + ENERGY_MIN + ENERGY_MAX) / 2
res[zero_idxs] = 0.0
return res
def norm_log(x):
zero_idxs = np.where(x <= CLIP_FLOOR)[0]
x[zero_idxs] = 1.0
res = np.log(x)
return res
def denorm_log(x):
zero_idxs = np.where(x == 0.0)[0]
res = np.exp(x)
res[zero_idxs] = 0.0
return res
def f0_norm_mean_std(x, mean, std):
zero_idxs = np.where(x == 0.0)[0]
x = (x - mean) / std
x[zero_idxs] = 0.0
return x
def norm_mean_std(x, mean, std):
x = (x - mean) / std
return x
# TODO: This is a hard-coded implementation for MIT-style interval labels.
# TODO: Implement a more general version.
def parse_interval_file(file_path, sampling_rate, hop_length):
with open(file_path, "r") as f:
lines = f.readlines()
# second
frame_intervals = 1.0 * hop_length / sampling_rate
skip_lines = 12
dur_list = []
phone_list = []
line_index = skip_lines
while line_index < len(lines):
phone_begin = float(lines[line_index])
phone_end = float(lines[line_index + 1])
phone = lines[line_index + 2].strip()[1:-1]
dur_list.append(int(round((phone_end - phone_begin) / frame_intervals)))
phone_list.append(phone)
line_index += 3
if len(dur_list) == 0 or len(phone_list) == 0:
return None
return np.array(dur_list), phone_list
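# parse_interval_file() assumes an MIT-style interval label layout: a fixed
# 12-line header followed by triples of lines per phone,
#     <begin time in seconds>
#     <end time in seconds>
#     "<phone label>"
# and converts each duration to frames with hop_length / sampling_rate.
# Illustrative body after the header (labels are hypothetical):
#     0.00
#     0.25
#     "sil"
#     0.25
#     0.43
#     "n_c"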
def average_by_duration(x, durs):
if x is None or durs is None:
return None
durs_cum = np.cumsum(np.pad(durs, (1, 0), "constant"))
    # average over each symbol's duration
x_symbol = np.zeros((durs.shape[0],), dtype=np.float32)
for idx, start, end in zip(range(durs.shape[0]), durs_cum[:-1], durs_cum[1:]):
values = x[start:end][np.where(x[start:end] != 0.0)[0]]
x_symbol[idx] = np.mean(values) if len(values) > 0 else 0.0
return x_symbol.astype(np.float32)
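# Worked example for average_by_duration (values are hypothetical): frame-level
# values are averaged within each symbol's duration, ignoring zero frames.
def _average_by_duration_example():
    x = np.array([100.0, 110.0, 0.0, 200.0, 210.0, 190.0], dtype=np.float32)
    durs = np.array([3, 3])  # two symbols, three frames each
    # symbol 0 -> mean(100, 110) = 105.0 (the zero frame is ignored)
    # symbol 1 -> mean(200, 210, 190) = 200.0
    return average_by_duration(x, durs)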
def encode_16bits(x):
if x.min() > -1.0 and x.max() < 1.0:
return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16)
else:
return x
if __name__ == "__main__":
import sys
infile = sys.argv[1]
sr, pcm_data = wavfile.read(infile)
res = get_pitch(pcm_data, 24000, 240)
print(res)
import logging
import os
import sys
import argparse
import yaml
import time
import codecs
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402
sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402
try:
from kantts.preprocess.audio_processor.audio_processor import AudioProcessor
from kantts.preprocess.se_processor.se_processor import SpeakerEmbeddingProcessor
from kantts.preprocess.script_convertor.TextScriptConvertor import (
TextScriptConvertor,
)
from kantts.preprocess.fp_processor import FpProcessor, is_fp_line
from kantts.preprocess.languages import languages
from kantts.datasets.dataset import AM_Dataset, Voc_Dataset
from kantts.utils.log import logging_to_file, get_git_revision_hash
except ImportError:
raise ImportError("Please install kantts.")
logging.basicConfig(
format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
LANGUAGES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "languages")
def gen_metafile(
voice_output_dir,
fp_enable=False,
badlist=None,
split_ratio=0.98,
):
voc_train_meta = os.path.join(voice_output_dir, "train.lst")
voc_valid_meta = os.path.join(voice_output_dir, "valid.lst")
if not os.path.exists(voc_train_meta) or not os.path.exists(voc_valid_meta):
Voc_Dataset.gen_metafile(
os.path.join(voice_output_dir, "wav"),
voice_output_dir,
split_ratio,
)
logging.info("Voc metafile generated.")
raw_metafile = os.path.join(voice_output_dir, "raw_metafile.txt")
am_train_meta = os.path.join(voice_output_dir, "am_train.lst")
am_valid_meta = os.path.join(voice_output_dir, "am_valid.lst")
if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
AM_Dataset.gen_metafile(
raw_metafile,
voice_output_dir,
am_train_meta,
am_valid_meta,
badlist,
split_ratio,
)
logging.info("AM metafile generated.")
if fp_enable:
fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt")
am_train_meta = os.path.join(voice_output_dir, "am_fpadd_train.lst")
am_valid_meta = os.path.join(voice_output_dir, "am_fpadd_valid.lst")
if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
AM_Dataset.gen_metafile(
fpadd_metafile,
voice_output_dir,
am_train_meta,
am_valid_meta,
badlist,
split_ratio,
)
logging.info("AM fpaddmetafile generated.")
fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt")
am_train_meta = os.path.join(voice_output_dir, "am_fprm_train.lst")
am_valid_meta = os.path.join(voice_output_dir, "am_fprm_valid.lst")
if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
AM_Dataset.gen_metafile(
fprm_metafile,
voice_output_dir,
am_train_meta,
am_valid_meta,
badlist,
split_ratio,
)
logging.info("AM fprmmetafile generated.")
# TODO: Zh-CN as default
def process_data(
voice_input_dir,
voice_output_dir,
audio_config,
speaker_name=None,
targetLang="PinYin",
skip_script=False,
se_model=None,
):
foreignLang = "EnUS"
    # use the emotion tag file if the voice provides one
if not os.path.exists(os.path.join(voice_input_dir, "emotion_tag.txt")):
emo_tag_path = None
else:
emo_tag_path = os.path.join(voice_input_dir, "emotion_tag.txt")
phoneset_path = os.path.join(
LANGUAGES_DIR, targetLang, languages[targetLang]["phoneset_path"]
)
posset_path = os.path.join(
LANGUAGES_DIR, targetLang, languages[targetLang]["posset_path"]
)
f2t_map_path = os.path.join(
LANGUAGES_DIR, targetLang, languages[targetLang]["f2t_map_path"]
)
s2p_map_path = os.path.join(
LANGUAGES_DIR, targetLang, languages[targetLang]["s2p_map_path"]
)
# dir of plain text/sentences for training byte based model
plain_text_dir = os.path.join(voice_input_dir, "text")
if speaker_name is None:
speaker_name = os.path.basename(voice_input_dir)
if audio_config is not None:
with open(audio_config, "r") as f:
config = yaml.load(f, Loader=yaml.Loader)
config["create_time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
config["git_revision_hash"] = get_git_revision_hash()
se_enable = config["audio_config"].get("se_feature", False)
with open(os.path.join(voice_output_dir, "audio_config.yaml"), "w") as f:
yaml.dump(config, f, Dumper=yaml.Dumper, default_flow_style=None)
    if skip_script:
        logging.info("Skip script conversion")
        raw_metafile = None
        # No prosody file is parsed in this branch, so fp support stays off.
        fp_enable = False
# Script processor
if not skip_script:
if os.path.exists(plain_text_dir):
TextScriptConvertor.turn_text_into_bytes(
os.path.join(plain_text_dir, "text.txt"),
os.path.join(voice_output_dir, "raw_metafile.txt"),
speaker_name,
)
fp_enable = False
else:
tsc = TextScriptConvertor(
phoneset_path,
posset_path,
targetLang,
foreignLang,
f2t_map_path,
s2p_map_path,
emo_tag_path,
speaker_name,
)
tsc.process(
os.path.join(voice_input_dir, "prosody", "prosody.txt"),
os.path.join(voice_output_dir, "Script.xml"),
os.path.join(voice_output_dir, "raw_metafile.txt"),
)
prosody = os.path.join(voice_input_dir, "prosody", "prosody.txt")
# FP processor
with codecs.open(prosody, "r", "utf-8") as f:
lines = f.readlines()
fp_enable = is_fp_line(lines[1])
raw_metafile = os.path.join(voice_output_dir, "raw_metafile.txt")
if fp_enable:
FP = FpProcessor()
FP.process(
voice_output_dir,
prosody,
raw_metafile,
)
logging.info("Processing fp done.")
# Audio processor
ap = AudioProcessor(config["audio_config"])
ap.process(
voice_input_dir,
voice_output_dir,
raw_metafile,
)
logging.info("Processing audio done.")
# SpeakerEmbedding processor
if se_enable:
sep = SpeakerEmbeddingProcessor()
sep.process(
voice_output_dir,
se_model,
)
logging.info("Processing speaker embedding done.")
logging.info("Processing done.")
# Generate Voc&AM metafile
# TODO: train/valid ratio setting
gen_metafile(voice_output_dir, fp_enable, ap.badcase_list)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Dataset preprocessor")
parser.add_argument("--voice_input_dir", type=str, required=True)
parser.add_argument("--voice_output_dir", type=str, required=True)
parser.add_argument("--audio_config", type=str, required=True)
parser.add_argument("--speaker", type=str, default=None, help="speaker")
parser.add_argument("--lang", type=str, default="PinYin", help="target language")
parser.add_argument(
"--se_model",
type=str,
default="../pre_data/speaker_embeddding/se.*",
help="speaker embedding extractor model",
)
parser.add_argument(
"--skip_script", action="store_true", help="skip script converting"
)
args = parser.parse_args()
os.makedirs(args.voice_output_dir, exist_ok=True)
logging_to_file(os.path.join(args.voice_output_dir, "data_process_stdout.log"))
try:
process_data(
args.voice_input_dir,
args.voice_output_dir,
args.audio_config,
args.speaker,
args.lang,
args.skip_script,
args.se_model,
)
except (Exception, KeyboardInterrupt) as e:
logging.error(e, exc_info=True)
import os
import logging
import random
def is_fp_line(line):
fp_category_list = ["FP", "I", "N", "Q"]
elements = line.strip().split(" ")
res = True
for ele in elements:
if ele not in fp_category_list:
res = False
break
return res
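# An fp line is a prosody-file line whose tokens are all fp category labels.
# Illustrative check (example strings are hypothetical):
def _is_fp_line_example():
    return is_fp_line("N N FP N"), is_fp_line("ni2 hao3")  # -> (True, False)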
class FpProcessor:
def __init__(self):
# TODO: Add more audio processing methods.
self.res = []
    @staticmethod
    def is_fp_line(line):
        # Mirrors the module-level helper: a line counts as an fp line only if
        # every whitespace-separated token is a known fp category label.
        fp_category_list = ["FP", "I", "N", "Q"]
        elements = line.strip().split(" ")
        res = True
        for ele in elements:
            if ele not in fp_category_list:
                res = False
                break
        return res
# TODO: adjust idx judgment rule
def addfp(self, voice_output_dir, prosody, raw_metafile_lines):
fp_category_list = ["FP", "I", "N"]
f = open(prosody)
prosody_lines = f.readlines()
f.close()
idx = ""
fp = ""
fp_label_dict = {}
i = 0
while i < len(prosody_lines):
if len(prosody_lines[i].strip().split("\t")) == 2:
idx = prosody_lines[i].strip().split("\t")[0]
i += 1
else:
fp_enable = is_fp_line(prosody_lines[i])
if fp_enable:
fp = prosody_lines[i].strip().split("\t")[0].split(" ")
for label in fp:
if label not in fp_category_list:
logging.warning("fp label not in fp_category_list")
break
i += 4
else:
fp = [
"N"
for _ in range(
len(
prosody_lines[i]
.strip()
.split("\t")[0]
.replace("/ ", "")
.replace(". ", "")
.split(" ")
)
)
]
i += 1
fp_label_dict[idx] = fp
fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt")
f_out = open(fpadd_metafile, "w")
for line in raw_metafile_lines:
tokens = line.strip().split("\t")
if len(tokens) == 2:
uttname = tokens[0]
symbol_sequences = tokens[1].split(" ")
error_flag = False
idx = 0
out_str = uttname + "\t"
for this_symbol_sequence in symbol_sequences:
emotion = this_symbol_sequence.split("$")[4]
this_symbol_sequence = this_symbol_sequence.replace(
emotion, "emotion_neutral"
)
if idx < len(fp_label_dict[uttname]):
if fp_label_dict[uttname][idx] == "FP":
if "none" not in this_symbol_sequence:
this_symbol_sequence = this_symbol_sequence.replace(
"emotion_neutral", "emotion_disgust"
)
syllable_label = this_symbol_sequence.split("$")[2]
if syllable_label == "s_both" or syllable_label == "s_end":
idx += 1
elif idx > len(fp_label_dict[uttname]):
                        logging.warning(uttname + ": fp labels do not match")
error_flag = True
out_str = out_str + this_symbol_sequence + " "
# if idx != len(fp_label_dict[uttname]):
# logging.warning(
# "{} length mismatch, length: {} ".format(
# idx, len(fp_label_dict[uttname])
# )
# )
if not error_flag:
f_out.write(out_str.strip() + "\n")
f_out.close()
return fpadd_metafile
def removefp(self, voice_output_dir, fpadd_metafile, raw_metafile_lines):
f = open(fpadd_metafile)
fpadd_metafile_lines = f.readlines()
f.close()
fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt")
f_out = open(fprm_metafile, "w")
for i in range(len(raw_metafile_lines)):
tokens = raw_metafile_lines[i].strip().split("\t")
symbol_sequences = tokens[1].split(" ")
fpadd_tokens = fpadd_metafile_lines[i].strip().split("\t")
fpadd_symbol_sequences = fpadd_tokens[1].split(" ")
error_flag = False
out_str = tokens[0] + "\t"
idx = 0
length = len(symbol_sequences)
while idx < length:
if "$emotion_disgust" in fpadd_symbol_sequences[idx]:
if idx + 1 < length and "none" in fpadd_symbol_sequences[idx + 1]:
idx = idx + 2
else:
idx = idx + 1
continue
out_str = out_str + symbol_sequences[idx] + " "
idx = idx + 1
if not error_flag:
f_out.write(out_str.strip() + "\n")
f_out.close()
def process(self, voice_output_dir, prosody, raw_metafile):
with open(raw_metafile, "r") as f:
lines = f.readlines()
random.shuffle(lines)
fpadd_metafile = self.addfp(voice_output_dir, prosody, lines)
self.removefp(voice_output_dir, fpadd_metafile, lines)
<?xml version="1.0" encoding="utf-8"?>
<phoneSet xmlns="http://schemas.alibaba-inc.com/tts">
<phone>
<id>0</id>
<name>a_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>1</id>
<name>ai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>2</id>
<name>an_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>3</id>
<name>ang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>4</id>
<name>ao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>5</id>
<name>b_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>6</id>
<name>c_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>7</id>
<name>ch_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>8</id>
<name>d_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>9</id>
<name>e_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>10</id>
<name>ei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>11</id>
<name>en_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>12</id>
<name>eng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>13</id>
<name>er_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>14</id>
<name>f_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>15</id>
<name>g_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>16</id>
<name>h_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>17</id>
<name>i_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>18</id>
<name>ia_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>19</id>
<name>ian_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>20</id>
<name>iang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>21</id>
<name>iao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>22</id>
<name>ie_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>23</id>
<name>ih_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>24</id>
<name>ii_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>25</id>
<name>in_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>26</id>
<name>ing_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>27</id>
<name>io_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>28</id>
<name>iong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>29</id>
<name>iou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>30</id>
<name>j_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>31</id>
<name>k_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>32</id>
<name>l_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>33</id>
<name>m_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>34</id>
<name>n_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>35</id>
<name>o_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>36</id>
<name>ong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>37</id>
<name>ou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>38</id>
<name>p_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>39</id>
<name>q_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>40</id>
<name>r_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>41</id>
<name>s_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>42</id>
<name>sh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>43</id>
<name>t_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>44</id>
<name>u_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>45</id>
<name>ua_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>46</id>
<name>uai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>47</id>
<name>uan_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>48</id>
<name>uang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>49</id>
<name>uei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>50</id>
<name>uen_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>51</id>
<name>ueng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>52</id>
<name>uo_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>53</id>
<name>v_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>54</id>
<name>van_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>55</id>
<name>ve_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>56</id>
<name>vn_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>57</id>
<name>xx_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>58</id>
<name>z_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>59</id>
<name>zh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>60</id>
<name>w_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>61</id>
<name>y_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>62</id>
<name>ga</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>63</id>
<name>ge</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>64</id>
<name>go</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>65</id>
<name>aa</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>66</id>
<name>ae</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>67</id>
<name>ah</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>68</id>
<name>ao</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>69</id>
<name>aw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>70</id>
<name>ay</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>71</id>
<name>b</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>72</id>
<name>ch</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>73</id>
<name>d</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>74</id>
<name>dh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>75</id>
<name>eh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>76</id>
<name>er</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>77</id>
<name>ey</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>78</id>
<name>f</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>79</id>
<name>g</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>80</id>
<name>hh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>81</id>
<name>ih</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>82</id>
<name>iy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>83</id>
<name>jh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>84</id>
<name>k</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>85</id>
<name>l</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>86</id>
<name>m</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>nasal</am>
</phone>
<phone>
<id>87</id>
<name>n</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>nasal</am>
</phone>
<phone>
<id>88</id>
<name>ng</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>89</id>
<name>ow</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>90</id>
<name>oy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>91</id>
<name>p</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>92</id>
<name>r</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>93</id>
<name>s</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>94</id>
<name>sh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>95</id>
<name>t</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>96</id>
<name>th</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>97</id>
<name>uh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>98</id>
<name>uw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>99</id>
<name>v</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>100</id>
<name>w</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>101</id>
<name>y</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>102</id>
<name>z</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>103</id>
<name>zh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>104</id>
<name>air_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>105</id>
<name>angr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>106</id>
<name>anr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>107</id>
<name>aor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>108</id>
<name>ar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>109</id>
<name>eir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>110</id>
<name>engr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>111</id>
<name>enr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>112</id>
<name>iangr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>113</id>
<name>ianr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>114</id>
<name>iaor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>115</id>
<name>iar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>116</id>
<name>ier_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>117</id>
<name>ihr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>118</id>
<name>iir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>119</id>
<name>ingr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>120</id>
<name>inr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>121</id>
<name>iongr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>122</id>
<name>iour_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>123</id>
<name>ir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>124</id>
<name>ongr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>125</id>
<name>or_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>126</id>
<name>our_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>127</id>
<name>uair_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>128</id>
<name>uangr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>129</id>
<name>uanr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>130</id>
<name>uar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>131</id>
<name>ueir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>132</id>
<name>uenr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>133</id>
<name>uor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>134</id>
<name>ur_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>135</id>
<name>vanr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>136</id>
<name>ver_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>137</id>
<name>vnr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>138</id>
<name>vr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>146</id>
<name>pau</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
</phoneSet>