Commit 51782715 authored by liugh5
Browse files

update

parent 8b4e9acd
import os
import numpy as np
from glob import glob
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import argparse
import yaml
import logging
from .core.utils import (
volume_normalize,
get_pitch,
get_energy,
align_length,
compute_mean,
compute_std,
f0_norm_mean_std,
norm_mean_std,
parse_interval_file,
average_by_duration,
encode_16bits,
)
from .core.dsp import (
melspectrogram,
load_wav,
trim_silence,
trim_silence_with_interval,
save_wav,
)
# Configure the root logger as a side effect of importing this module:
# every record carries a timestamp, level, source file and line number,
# and everything from DEBUG upwards is emitted.
logging.basicConfig(
format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.DEBUG,
)
# Fallback configuration used by AudioProcessor when the caller does not
# supply a dict. Every key becomes an attribute of the processor instance.
default_audio_config = {
# Preprocess
"wav_normalize": True,
"trim_silence": True,
"trim_silence_threshold_db": 60,
"preemphasize": False,
# Feature extraction
"sampling_rate": 24000,
# 240 samples at 24000 Hz is a 10 ms frame shift.
"hop_length": 240,
"win_length": 1024,
"n_mels": 80,
"n_fft": 1024,
"fmin": 50.0,
"fmax": 7600.0,
"min_level_db": -100,
"ref_level_db": 20,
# When True (and durations are available) f0/energy are averaged per phone.
"phone_level_feature": True,
# Size of the process pools used by the extraction steps.
"num_workers": 16,
# Normalization
"norm_type": "mean_std", # 'mean_std', 'global norm'
"max_norm": 1.0,
"symmetric": False,
}
class AudioProcessor:
def __init__(self, config=None):
    """Create a processor from an audio configuration dict.

    Args:
        config: Mapping of audio settings. Keys missing from it are filled
            in from ``default_audio_config``; anything that is not a dict
            is ignored (with a warning) and the defaults are used as-is.

    Every configuration key is also exposed as an instance attribute
    (``self.sampling_rate``, ``self.hop_length``, ...).
    """
    # TODO: Add more audio processing methods.
    if not isinstance(config, dict):
        logging.warning(
            "[AudioProcessor] config is not a dict, fall into default config."
        )
        config = {}

    # Work on a private copy so that the module-level defaults are never
    # aliased by an instance, and overlay the user's values so that a
    # partial config does not leave required attributes undefined.
    merged_config = dict(default_audio_config)
    merged_config.update(config)
    self.config = merged_config

    for key, value in self.config.items():
        setattr(self, key, value)

    # Utterances shorter than half a second are treated as bad cases.
    self.min_wav_length = int(self.config["sampling_rate"] * 0.5)

    # Names of utterances that failed any processing step.
    self.badcase_list = []

    # Per-utterance caches, keyed by the wav basename without extension.
    self.pcm_dict = {}
    self.mel_dict = {}
    self.f0_dict = {}
    self.uv_dict = {}
    self.nccf_dict = {}
    self.f0uv_dict = {}
    self.energy_dict = {}
    self.dur_dict = {}

    logging.info("[AudioProcessor] Initialize AudioProcessor.")
    logging.info("[AudioProcessor] config params:")
    for key, value in self.config.items():
        logging.info("[AudioProcessor] %s: %s", key, value)
def calibrate_SyllableDuration(
    self, raw_dur_dir, raw_metafile, out_cali_duration_dir
):
    """Map raw phone durations onto the symbols listed in a metafile.

    Args:
        raw_dur_dir: Directory holding ``<index>.npy`` duration arrays and
            ``<index>.phone`` files (one phone label per line).
        raw_metafile: Text file with one ``<index>\\t<symbols>`` entry per
            line; symbols are space separated, optionally wrapped in
            braces, and only the part before the first ``$`` is used.
        out_cali_duration_dir: Directory receiving one calibrated
            ``<index>.npy`` per utterance (created if missing).

    Results are also stored in ``self.dur_dict``. Utterances whose
    durations cannot be reconciled are logged and skipped; those that end
    up with a negative duration are added to ``self.badcase_list``.
    """
    with open(raw_metafile, "r") as f:
        lines = f.readlines()

    output_dur_dir = out_cali_duration_dir
    os.makedirs(output_dur_dir, exist_ok=True)

    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no entry.
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            logging.warning(
                "[AudioProcessor] malformed metafile line, skip: %s", line
            )
            continue
        index, symbols = fields
        symbols = [
            symbol.strip("{").strip("}").split("$")[0]
            for symbol in symbols.strip().split(" ")
        ]

        dur_file = os.path.join(raw_dur_dir, index + ".npy")
        phone_file = os.path.join(raw_dur_dir, index + ".phone")
        if not os.path.exists(dur_file) or not os.path.exists(phone_file):
            logging.warning(
                "[AudioProcessor] dur file or phone file not exists: %s", index
            )
            continue

        with open(phone_file, "r") as f:
            phones = f.readlines()
        dur = np.load(dur_file)

        cali_duration = []
        dur_idx = 0
        syll_idx = 0
        # Walk phones and symbols in lockstep. The phone bound guards
        # against a .phone file that is shorter than the duration array.
        while (
            dur_idx < len(dur)
            and dur_idx < len(phones)
            and syll_idx < len(symbols)
        ):
            phone = phones[dur_idx].strip()
            symbol = symbols[syll_idx]
            is_boundary = symbol.startswith("#")

            if phone == "sil":
                # Silence never maps to a symbol.
                dur_idx += 1
                continue
            if phone == "sp" and not is_boundary:
                # A short pause only counts when a boundary symbol expects it.
                dur_idx += 1
                continue
            if symbol in ["ga", "go", "ge"]:
                # These symbols have no phone of their own.
                cali_duration.append(0)
                syll_idx += 1
                continue
            if is_boundary and phone != "sp":
                # Boundary without an aligned pause: zero duration.
                cali_duration.append(0)
                syll_idx += 1
                continue

            # Either a boundary matched with "sp" or a regular phone:
            # a corresponding phone is found.
            cali_duration.append(dur[dur_idx])
            dur_idx += 1
            syll_idx += 1

        # Add #4 phone duration
        cali_duration.append(0)
        if len(cali_duration) != len(symbols):
            logging.error(
                "[Duration Calibrating] Syllable duration %s is not equal "
                "to the number of symbols %s, index: %s",
                len(cali_duration),
                len(symbols),
                index,
            )
            continue

        # Align with mel frames
        durs = np.array(cali_duration)
        if len(self.mel_dict) > 0:
            pair_mel = self.mel_dict.get(index, None)
            if pair_mel is None:
                logging.warning(
                    "[AudioProcessor] Interval file %s has no corresponding mel",
                    index,
                )
                continue
            mel_frames = pair_mel.shape[0]
            dur_frames = np.sum(durs)
            # Absorb any frame mismatch in the last real symbol (the final
            # entry is the zero-length #4 added above).
            if dur_frames > mel_frames:
                durs[-2] -= dur_frames - mel_frames
            elif dur_frames < mel_frames:
                durs[-2] += mel_frames - dur_frames

            if durs[-2] < 0:
                logging.error(
                    "[AudioProcessor] Duration calibrating failed for %s, mismatch frames %s",
                    index,
                    durs[-2],
                )
                self.badcase_list.append(index)
                continue

        self.dur_dict[index] = durs
        np.save(os.path.join(output_dur_dir, index + ".npy"), self.dur_dict[index])
def amp_normalize(self, src_wav_dir, out_wav_dir):
    """Make ``out_wav_dir`` hold the (optionally volume-normalized) wavs.

    Args:
        src_wav_dir: Directory with the source ``.wav`` files.
        out_wav_dir: Directory that later steps read from.

    Returns:
        The result of ``volume_normalize`` when ``self.wav_normalize`` is
        set, otherwise ``True`` after linking ``out_wav_dir`` to the source.
    """
    if self.wav_normalize:
        logging.info("[AudioProcessor] Amplitude normalization started")
        os.makedirs(out_wav_dir, exist_ok=True)
        res = volume_normalize(src_wav_dir, out_wav_dir)
        logging.info("[AudioProcessor] Amplitude normalization finished")
        return res

    logging.info("[AudioProcessor] No amplitude normalization")
    if os.path.lexists(out_wav_dir):
        # A previous run already created the link/directory; creating it
        # again would raise FileExistsError.
        logging.info("[AudioProcessor] %s already exists, reuse it", out_wav_dir)
    else:
        # A relative symlink target is resolved relative to the directory
        # containing the link, not the current working directory, so the
        # target is made absolute to keep the link valid.
        os.symlink(
            os.path.abspath(src_wav_dir), out_wav_dir, target_is_directory=True
        )
    return True
def get_pcm_dict(self, src_wav_dir):
    """Return the cache of decoded waveforms, loading it on first use.

    Once ``self.pcm_dict`` holds anything it is returned unchanged,
    whatever ``src_wav_dir`` is. Otherwise every ``*.wav`` in the directory
    is loaded with ``load_wav`` in a process pool; clips shorter than
    ``self.min_wav_length`` samples are recorded in ``self.badcase_list``
    and left out.
    """
    wav_paths = glob(os.path.join(src_wav_dir, "*.wav"))
    if self.pcm_dict:
        return self.pcm_dict

    logging.info("[AudioProcessor] Start to load pcm from %s", src_wav_dir)
    with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
        total=len(wav_paths)
    ) as progress:
        jobs = []
        for path in wav_paths:
            job = executor.submit(load_wav, path, self.sampling_rate)
            job.add_done_callback(lambda _done: progress.update())
            stem, _ext = os.path.splitext(os.path.basename(path))
            jobs.append((job, stem))

        for job, stem in jobs:
            samples = job.result()
            if len(samples) >= self.min_wav_length:
                self.pcm_dict[stem] = samples
                continue
            logging.warning("[AudioProcessor] %s is too short, skip", stem)
            self.badcase_list.append(stem)

    return self.pcm_dict
def trim_silence_wav(self, src_wav_dir, out_wav_dir=None):
    """Trim silence from every cached waveform and write the result.

    Args:
        src_wav_dir: Directory the waveforms are loaded from.
        out_wav_dir: Where trimmed wavs are written; defaults to
            ``src_wav_dir`` (files are overwritten in place).

    Clips that fall below ``self.min_wav_length`` after trimming are
    dropped from ``self.pcm_dict`` and added to ``self.badcase_list``.
    Always returns ``True``.
    """
    wav_paths = glob(os.path.join(src_wav_dir, "*.wav"))
    logging.info("[AudioProcessor] Trim silence started")
    if out_wav_dir is not None:
        os.makedirs(out_wav_dir, exist_ok=True)
    else:
        out_wav_dir = src_wav_dir

    loaded = self.get_pcm_dict(src_wav_dir)
    jobs = []
    with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
        total=len(wav_paths)
    ) as progress:
        for name, samples in loaded.items():
            job = executor.submit(
                trim_silence,
                samples,
                self.trim_silence_threshold_db,
                self.hop_length,
                self.win_length,
            )
            job.add_done_callback(lambda _done: progress.update())
            jobs.append((job, name))

    # TODO: multi-processing
    for job, name in tqdm(jobs):
        trimmed = job.result()
        if len(trimmed) < self.min_wav_length:
            logging.warning("[AudioProcessor] %s is too short, skip", name)
            self.badcase_list.append(name)
            self.pcm_dict.pop(name)
            continue
        self.pcm_dict[name] = trimmed
        target_path = os.path.join(out_wav_dir, name + ".wav")
        save_wav(trimmed, target_path, self.sampling_rate)

    logging.info("[AudioProcessor] Trim silence finished")
    return True
def trim_silence_wav_with_interval(self, src_wav_dir, dur_dir, out_wav_dir=None):
    """Trim leading/trailing silence using the durations in ``self.dur_dict``.

    Args:
        src_wav_dir: Directory the waveforms are loaded from.
        dur_dir: Not used by this method; kept for interface compatibility.
        out_wav_dir: Where trimmed wavs are written; defaults to
            ``src_wav_dir``.

    Utterances for which ``trim_silence_with_interval`` returns ``None``
    are left untouched. Clips that become shorter than
    ``self.min_wav_length`` are dropped from ``self.pcm_dict`` and added to
    ``self.badcase_list``. Always returns ``True``.
    """
    wav_paths = glob(os.path.join(src_wav_dir, "*.wav"))
    logging.info("[AudioProcessor] Trim silence with interval started")
    if out_wav_dir is not None:
        os.makedirs(out_wav_dir, exist_ok=True)
    else:
        out_wav_dir = src_wav_dir

    loaded = self.get_pcm_dict(src_wav_dir)
    jobs = []
    with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
        total=len(wav_paths)
    ) as progress:
        for name, samples in loaded.items():
            job = executor.submit(
                trim_silence_with_interval,
                samples,
                self.dur_dict.get(name, None),
                self.hop_length,
            )
            job.add_done_callback(lambda _done: progress.update())
            jobs.append((job, name))

    # TODO: multi-processing
    for job, name in tqdm(jobs):
        trimmed = job.result()
        if trimmed is None:
            continue
        if len(trimmed) < self.min_wav_length:
            logging.warning("[AudioProcessor] %s is too short, skip", name)
            self.badcase_list.append(name)
            self.pcm_dict.pop(name)
            continue
        self.pcm_dict[name] = trimmed
        target_path = os.path.join(out_wav_dir, name + ".wav")
        save_wav(trimmed, target_path, self.sampling_rate)

    logging.info("[AudioProcessor] Trim silence finished")
    return True
def mel_extract(self, src_wav_dir, out_feature_dir):
    """Extract mel spectrograms, then mean/std-normalize and save them.

    Args:
        src_wav_dir: Directory the waveforms are loaded from.
        out_feature_dir: Receives ``mel_mean.txt``, ``mel_std.txt`` and one
            normalized ``<name>.npy`` per utterance.

    The un-normalized spectrograms are kept in ``self.mel_dict``; failures
    are appended to ``self.badcase_list``. Always returns ``True``.
    """
    os.makedirs(out_feature_dir, exist_ok=True)
    wav_paths = glob(os.path.join(src_wav_dir, "*.wav"))
    loaded = self.get_pcm_dict(src_wav_dir)

    logging.info("[AudioProcessor] Melspec extraction started")

    # Get global normed mel spec
    with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
        total=len(wav_paths)
    ) as progress:
        jobs = []
        for name, samples in loaded.items():
            job = executor.submit(
                melspectrogram,
                samples,
                self.sampling_rate,
                self.n_fft,
                self.hop_length,
                self.win_length,
                self.n_mels,
                self.max_norm,
                self.min_level_db,
                self.ref_level_db,
                self.fmin,
                self.fmax,
                self.symmetric,
                self.preemphasize,
            )
            job.add_done_callback(lambda _done: progress.update())
            jobs.append((job, name))

        for job, name in jobs:
            melspec = job.result()
            if melspec is not None:
                self.mel_dict[name] = melspec
                continue
            logging.warning(
                "[AudioProcessor] Melspec extraction failed for %s",
                name,
            )
            self.badcase_list.append(name)

    logging.info("[AudioProcessor] Melspec extraction finished")

    # FIXME: is this step necessary?
    # Do mean std norm on global-normed melspec
    logging.info("Melspec statistic proceeding...")
    mel_mean = compute_mean(list(self.mel_dict.values()), dims=self.n_mels)
    mel_std = compute_std(list(self.mel_dict.values()), mel_mean, dims=self.n_mels)
    logging.info("Melspec statistic done")

    mean_path = os.path.join(out_feature_dir, "mel_mean.txt")
    std_path = os.path.join(out_feature_dir, "mel_std.txt")
    np.savetxt(mean_path, mel_mean, fmt="%.6f")
    np.savetxt(std_path, mel_std, fmt="%.6f")
    logging.info(
        "[AudioProcessor] melspec mean and std saved to:\n{},\n{}".format(
            mean_path,
            std_path,
        )
    )

    logging.info("[AudioProcessor] Melspec mean std norm is proceeding...")
    for name, melspec in self.mel_dict.items():
        normed = norm_mean_std(melspec, mel_mean, mel_std)
        np.save(os.path.join(out_feature_dir, name + ".npy"), normed)

    logging.info("[AudioProcessor] Melspec normalization finished")
    logging.info("[AudioProcessor] Normed Melspec saved to %s", out_feature_dir)
    return True
# TODO: some dataset may have no interval label
def duration_generate(self, src_interval_dir, out_feature_dir):
    """Turn ``*.interval`` label files into per-phone frame durations.

    Args:
        src_interval_dir: Directory containing the interval files.
        out_feature_dir: Receives ``<name>.npy`` (durations) and
            ``<name>.phone`` (one phone per line) for each utterance.

    When mel spectrograms are already cached, the last duration is adjusted
    so the total matches the mel frame count. Results are stored in
    ``self.dur_dict``; failures go to ``self.badcase_list``. Always returns
    ``True``.
    """
    os.makedirs(out_feature_dir, exist_ok=True)
    interval_paths = glob(os.path.join(src_interval_dir, "*.interval"))

    logging.info("[AudioProcessor] Duration generation started")
    jobs = []
    with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
        total=len(interval_paths)
    ) as progress:
        for interval_path in interval_paths:
            job = executor.submit(
                parse_interval_file,
                interval_path,
                self.sampling_rate,
                self.hop_length,
            )
            job.add_done_callback(lambda _done: progress.update())
            name = os.path.splitext(os.path.basename(interval_path))[0]
            jobs.append((job, name))

    logging.info("[AudioProcessor] Duration align with mel is proceeding...")
    for job, name in jobs:
        parsed = job.result()
        if parsed is None:
            logging.warning(
                "[AudioProcessor] Duration generate failed for %s", name
            )
            self.badcase_list.append(name)
            continue

        durs, phone_list = parsed

        # Align length with melspec
        if len(self.mel_dict) > 0:
            pair_mel = self.mel_dict.get(name, None)
            if pair_mel is None:
                logging.warning(
                    "[AudioProcessor] Interval file %s has no corresponding mel",
                    name,
                )
                continue

            mel_frames = pair_mel.shape[0]
            dur_frames = np.sum(durs)
            if dur_frames > mel_frames:
                durs[-1] -= dur_frames - mel_frames
            elif dur_frames < mel_frames:
                durs[-1] += mel_frames - dur_frames

            if durs[-1] < 0:
                logging.error(
                    "[AudioProcessor] Duration align failed for %s, mismatch frames %s",
                    name,
                    durs[-1],
                )
                self.badcase_list.append(name)
                continue

        self.dur_dict[name] = durs
        np.save(os.path.join(out_feature_dir, name + ".npy"), durs)
        phone_path = os.path.join(out_feature_dir, name + ".phone")
        with open(phone_path, "w") as f:
            f.write("\n".join(phone_list))

    logging.info("[AudioProcessor] Duration generate finished")
    return True
def pitch_extract(
    self, src_wav_dir, out_f0_dir, out_frame_f0_dir, out_frame_uv_dir
):
    """Extract, normalize and save pitch features for every cached waveform.

    Args:
        src_wav_dir: Directory the waveforms are loaded from.
        out_f0_dir: Receives ``f0_mean.txt``, ``f0_std.txt`` and the final
            ``<name>.npy`` f0 (phone-level when durations are available and
            ``self.phone_level_feature`` is set, frame-level otherwise).
        out_frame_f0_dir: Receives the frame-level normalized f0.
        out_frame_uv_dir: Receives the frame-level voiced/unvoiced values.

    Failures are appended to ``self.badcase_list``. Always returns ``True``.
    """
    for target_dir in (out_f0_dir, out_frame_f0_dir, out_frame_uv_dir):
        os.makedirs(target_dir, exist_ok=True)
    wav_paths = glob(os.path.join(src_wav_dir, "*.wav"))
    loaded = self.get_pcm_dict(src_wav_dir)
    mel_dict = self.mel_dict

    logging.info("[AudioProcessor] Pitch extraction started")
    # Get raw pitch
    jobs = []
    with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
        total=len(wav_paths)
    ) as progress:
        for name, samples in loaded.items():
            job = executor.submit(
                get_pitch,
                encode_16bits(samples),
                self.sampling_rate,
                self.hop_length,
            )
            job.add_done_callback(lambda _done: progress.update())
            jobs.append((job, name))

    logging.info("[AudioProcessor] Pitch align with mel is proceeding...")
    for job, name in jobs:
        extracted = job.result()
        if extracted is None:
            logging.warning(
                "[AudioProcessor] Pitch extraction failed for %s", name
            )
            self.badcase_list.append(name)
            continue

        f0, uv, f0uv = extracted
        if len(mel_dict) > 0:
            ref_mel = mel_dict.get(name, None)
            f0 = align_length(f0, ref_mel)
            uv = align_length(uv, ref_mel)
            f0uv = align_length(f0uv, ref_mel)

        if f0 is None or uv is None or f0uv is None:
            logging.warning(
                "[AudioProcessor] Pitch length mismatch with mel in %s",
                name,
            )
            self.badcase_list.append(name)
            continue

        self.f0_dict[name] = f0
        self.uv_dict[name] = uv
        self.f0uv_dict[name] = f0uv

    # Normalize f0
    logging.info("[AudioProcessor] Pitch normalization is proceeding...")
    f0_mean = compute_mean(list(self.f0uv_dict.values()), dims=1)
    f0_std = compute_std(list(self.f0uv_dict.values()), f0_mean, dims=1)
    mean_path = os.path.join(out_f0_dir, "f0_mean.txt")
    std_path = os.path.join(out_f0_dir, "f0_std.txt")
    np.savetxt(mean_path, f0_mean, fmt="%.6f")
    np.savetxt(std_path, f0_std, fmt="%.6f")
    logging.info(
        "[AudioProcessor] f0 mean and std saved to:\n{},\n{}".format(
            mean_path,
            std_path,
        )
    )

    logging.info("[AudioProcessor] Pitch mean std norm is proceeding...")
    # Both contours are normalized with the statistics of the f0uv contour.
    for name in list(self.f0uv_dict):
        self.f0uv_dict[name] = f0_norm_mean_std(
            self.f0uv_dict[name], f0_mean, f0_std
        )
    for name in list(self.f0_dict):
        self.f0_dict[name] = f0_norm_mean_std(self.f0_dict[name], f0_mean, f0_std)

    # save frame f0 to a specific dir
    for name, frame_f0 in self.f0_dict.items():
        np.save(os.path.join(out_frame_f0_dir, name + ".npy"), frame_f0.reshape(-1))
    for name, frame_uv in self.uv_dict.items():
        np.save(os.path.join(out_frame_uv_dir, name + ".npy"), frame_uv.reshape(-1))

    # phone level average
    # if there is no duration then save the frame-level f0
    if self.phone_level_feature and len(self.dur_dict) > 0:
        logging.info("[AudioProcessor] Pitch turn to phone-level is proceeding...")
        avg_jobs = []
        with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
            total=len(self.f0uv_dict)
        ) as progress:
            for name in self.f0uv_dict:
                job = executor.submit(
                    average_by_duration,
                    self.f0uv_dict.get(name, None),
                    self.dur_dict.get(name, None),
                )
                job.add_done_callback(lambda _done: progress.update())
                avg_jobs.append((job, name))

        for job, name in avg_jobs:
            averaged = job.result()
            if averaged is None:
                logging.warning(
                    "[AudioProcessor] Pitch extraction failed in phone level avg for: %s",
                    name,
                )
                self.badcase_list.append(name)
            else:
                self.f0uv_dict[name] = averaged

    for name, final_f0 in self.f0uv_dict.items():
        np.save(os.path.join(out_f0_dir, name + ".npy"), final_f0.reshape(-1))

    logging.info("[AudioProcessor] Pitch normalization finished")
    logging.info("[AudioProcessor] Normed f0 saved to %s", out_f0_dir)
    logging.info("[AudioProcessor] Pitch extraction finished")
    return True
def energy_extract(self, src_wav_dir, out_energy_dir, out_frame_energy_dir):
    """Extract, normalize and save energy features for every cached waveform.

    Args:
        src_wav_dir: Directory the waveforms are loaded from.
        out_energy_dir: Receives ``energy_mean.txt``, ``energy_std.txt`` and
            the final ``<name>.npy`` energy (phone-level when durations are
            available and ``self.phone_level_feature`` is set, frame-level
            otherwise).
        out_frame_energy_dir: Receives the frame-level normalized energy.

    Failures are appended to ``self.badcase_list``. Always returns ``True``.
    """
    os.makedirs(out_energy_dir, exist_ok=True)
    os.makedirs(out_frame_energy_dir, exist_ok=True)
    wav_paths = glob(os.path.join(src_wav_dir, "*.wav"))
    loaded = self.get_pcm_dict(src_wav_dir)
    mel_dict = self.mel_dict

    logging.info("[AudioProcessor] Energy extraction started")
    # Get raw energy
    jobs = []
    with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
        total=len(wav_paths)
    ) as progress:
        for name, samples in loaded.items():
            job = executor.submit(
                get_energy, samples, self.hop_length, self.win_length, self.n_fft
            )
            job.add_done_callback(lambda _done: progress.update())
            jobs.append((job, name))

    for job, name in jobs:
        energy = job.result()
        if energy is None:
            logging.warning(
                "[AudioProcessor] Energy extraction failed for %s", name
            )
            self.badcase_list.append(name)
            continue

        if len(mel_dict) > 0:
            energy = align_length(energy, mel_dict.get(name, None))
        if energy is None:
            logging.warning(
                "[AudioProcessor] Energy length mismatch with mel in %s",
                name,
            )
            self.badcase_list.append(name)
            continue

        self.energy_dict[name] = energy

    # Normalize energy
    energy_mean = compute_mean(list(self.energy_dict.values()), dims=1)
    energy_std = compute_std(list(self.energy_dict.values()), energy_mean, dims=1)
    mean_path = os.path.join(out_energy_dir, "energy_mean.txt")
    std_path = os.path.join(out_energy_dir, "energy_std.txt")
    np.savetxt(mean_path, energy_mean, fmt="%.6f")
    np.savetxt(std_path, energy_std, fmt="%.6f")
    logging.info(
        "[AudioProcessor] energy mean and std saved to:\n{},\n{}".format(
            mean_path,
            std_path,
        )
    )

    logging.info("[AudioProcessor] Energy mean std norm is proceeding...")
    # Energy goes through the same normalizer that is used for f0.
    for name in list(self.energy_dict):
        self.energy_dict[name] = f0_norm_mean_std(
            self.energy_dict[name], energy_mean, energy_std
        )

    # save frame energy to a specific dir
    for name, frame_energy in self.energy_dict.items():
        np.save(
            os.path.join(out_frame_energy_dir, name + ".npy"),
            frame_energy.reshape(-1),
        )

    # phone level average
    # if there is no duration then save the frame-level energy
    if self.phone_level_feature and len(self.dur_dict) > 0:
        avg_jobs = []
        with ProcessPoolExecutor(max_workers=self.num_workers) as executor, tqdm(
            total=len(self.energy_dict)
        ) as progress:
            for name in self.energy_dict:
                job = executor.submit(
                    average_by_duration,
                    self.energy_dict.get(name, None),
                    self.dur_dict.get(name, None),
                )
                job.add_done_callback(lambda _done: progress.update())
                avg_jobs.append((job, name))

        # TODO: failed clear dict element
        for job, name in avg_jobs:
            averaged = job.result()
            if averaged is None:
                logging.warning(
                    "[AudioProcessor] Energy extraction failed in phone level avg for: %s",
                    name,
                )
                self.badcase_list.append(name)
            else:
                self.energy_dict[name] = averaged

    for name, final_energy in self.energy_dict.items():
        np.save(os.path.join(out_energy_dir, name + ".npy"), final_energy.reshape(-1))

    logging.info("[AudioProcessor] Energy normalization finished")
    logging.info("[AudioProcessor] Normed Energy saved to %s", out_energy_dir)
    logging.info("[AudioProcessor] Energy extraction finished")
    return True
def process(self, src_voice_dir, out_data_dir, aux_metafile=None):
    """Run the whole feature-extraction pipeline for one voice.

    Args:
        src_voice_dir: Voice directory; ``wav/`` is read from it and
            ``interval/`` is used when present.
        out_data_dir: Root directory for all generated features.
        aux_metafile: Optional metafile; when given (and interval labels
            exist) syllable durations are calibrated against it.

    Returns:
        ``False`` as soon as a step reports failure, otherwise the result
        of the last step. The collected bad cases are written to
        ``badlist.txt`` in ``out_data_dir`` on success.
    """

    def under_out(subdir):
        return os.path.join(out_data_dir, subdir)

    raw_wav_dir = os.path.join(src_voice_dir, "wav")
    src_interval_dir = os.path.join(src_voice_dir, "interval")

    out_mel_dir = under_out("mel")
    out_f0_dir = under_out("f0")
    out_frame_f0_dir = under_out("frame_f0")
    out_frame_uv_dir = under_out("frame_uv")
    out_energy_dir = under_out("energy")
    out_frame_energy_dir = under_out("frame_energy")
    out_duration_dir = under_out("raw_duration")
    out_cali_duration_dir = under_out("duration")

    os.makedirs(out_data_dir, exist_ok=True)

    with_duration = os.path.exists(src_interval_dir)

    # TODO: to resume from previous process, a log file is needed
    train_wav_dir = under_out("wav")

    if not self.amp_normalize(raw_wav_dir, train_wav_dir):
        logging.error("[AudioProcessor] amp_normalize failed, exit")
        return False

    if with_duration:
        # Raw duration, non-trimmed
        if not self.duration_generate(src_interval_dir, out_duration_dir):
            logging.error("[AudioProcessor] duration_generate failed, exit")
            return False

    if self.trim_silence:
        if with_duration:
            trimmed_ok = self.trim_silence_wav_with_interval(
                train_wav_dir, out_duration_dir
            )
            if not trimmed_ok:
                logging.error(
                    "[AudioProcessor] trim_silence_wav_with_interval failed, exit"
                )
                return False
        else:
            trimmed_ok = self.trim_silence_wav(train_wav_dir)
            if not trimmed_ok:
                logging.error("[AudioProcessor] trim_silence_wav failed, exit")
                return False

    if not self.mel_extract(train_wav_dir, out_mel_dir):
        logging.error("[AudioProcessor] mel_extract failed, exit")
        return False

    if aux_metafile is not None and with_duration:
        self.calibrate_SyllableDuration(
            out_duration_dir, aux_metafile, out_cali_duration_dir
        )

    pitch_ok = self.pitch_extract(
        train_wav_dir, out_f0_dir, out_frame_f0_dir, out_frame_uv_dir
    )
    if not pitch_ok:
        logging.error("[AudioProcessor] pitch_extract failed, exit")
        return False

    succeed = self.energy_extract(
        train_wav_dir, out_energy_dir, out_frame_energy_dir
    )
    if not succeed:
        logging.error("[AudioProcessor] energy_extract failed, exit")
        return False

    # recording badcase list
    with open(under_out("badlist.txt"), "w") as f:
        f.write("\n".join(self.badcase_list))

    logging.info("[AudioProcessor] All features extracted successfully!")
    return succeed
if __name__ == "__main__":
    # Command-line entry point: extract all features for one voice.
    parser = argparse.ArgumentParser(description="Audio Processor")
    parser.add_argument("--src_voice_dir", type=str, required=True)
    parser.add_argument("--out_data_dir", type=str, required=True)
    parser.add_argument("--config", type=str, default=None)
    args = parser.parse_args()

    # --config is optional: without it AudioProcessor receives None and
    # falls back to its built-in default configuration instead of this
    # script failing with a NameError.
    audio_config = None
    if args.config is not None:
        with open(args.config, "r") as f:
            # NOTE(review): yaml.Loader can construct arbitrary Python
            # objects; only pass trusted config files, or switch to
            # yaml.safe_load if the configs are plain data.
            config = yaml.load(f, Loader=yaml.Loader)
        audio_config = config["audio_config"]

    ap = AudioProcessor(audio_config)
    ap.process(args.src_voice_dir, args.out_data_dir)
import numpy as np
import librosa
import librosa.filters
from scipy.io import wavfile
from scipy import signal
def _stft(y, hop_length, win_length, n_fft):
    """Return ``librosa.stft`` of ``y`` for the given frame parameters."""
    frame_params = {
        "n_fft": n_fft,
        "hop_length": hop_length,
        "win_length": win_length,
    }
    return librosa.stft(y=y, **frame_params)
def _istft(y, hop_length, win_length):
    """Return ``librosa.istft`` of the STFT matrix ``y``."""
    frame_params = {"hop_length": hop_length, "win_length": win_length}
    return librosa.istft(y, **frame_params)
def _db_to_amp(x):
    """Convert decibels to linear amplitude: ``10 ** (x / 20)``."""
    exponent = x * 0.05
    return np.power(10.0, exponent)
def _amp_to_db(x):
    """Convert linear amplitude to decibels, flooring the input at 1e-5."""
    floored = np.maximum(1e-5, x)
    return 20 * np.log10(floored)
def load_wav(path, sr):
    """Load the audio at ``path`` with ``librosa.load`` and return the samples.

    ``sr`` is forwarded to librosa as the target sampling rate; the rate
    that librosa returns alongside the samples is discarded.
    """
    samples, _rate = librosa.load(path, sr=sr)
    return samples
def save_wav(wav, path, sr):
    """Write ``wav`` to ``path`` as a 16-bit PCM file at sampling rate ``sr``.

    Floating-point input is scaled by 32767 (i.e. it is expected to lie in
    [-1, 1]); integer input is written as-is. In both cases values are
    clipped to the int16 range before the cast, so out-of-range samples
    saturate instead of wrapping around.
    """
    if np.issubdtype(wav.dtype, np.floating):
        quant_wav = 32767 * wav
    else:
        quant_wav = wav

    int16_limits = np.iinfo(np.int16)
    quant_wav = np.clip(quant_wav, int16_limits.min, int16_limits.max)
    wavfile.write(path, sr, quant_wav.astype(np.int16))
def trim_silence(wav, top_db, hop_length, win_length):
    """Trim leading and trailing silence with ``librosa.effects.trim``.

    ``top_db`` is the silence threshold passed to librosa; ``win_length``
    is used as the analysis frame length. Only the trimmed signal is
    returned, not the index range.
    """
    trimmed_and_index = librosa.effects.trim(
        wav, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    return trimmed_and_index[0]
def trim_silence_with_interval(wav, interval, hop_length):
    """Cut the first and last interval (in frames) off ``wav``.

    Args:
        wav: 1-D sample sequence.
        interval: Per-segment durations in frames; the first and last
            entries are treated as leading/trailing silence. ``None``
            means no durations are available.
        hop_length: Samples per frame.

    Returns:
        The trimmed samples, or ``None`` when ``interval`` is ``None``.
    """
    if interval is None:
        return None

    start = int(interval[0]) * hop_length
    # The end is computed from the length rather than as a negative index:
    # with no trailing silence a slice end of ``-0`` would select nothing.
    end = max(len(wav) - int(interval[-1]) * hop_length, 0)
    return wav[start:end]
def preemphasis(wav, k=0.98, preemphasize=False):
    """Apply the pre-emphasis filter ``y[n] = x[n] - k * x[n-1]`` if enabled.

    With ``preemphasize`` false the input is returned untouched.
    """
    if not preemphasize:
        return wav
    numerator = [1, -k]
    denominator = [1]
    return signal.lfilter(numerator, denominator, wav)
def inv_preemphasis(wav, k=0.98, inv_preemphasize=False):
    """Undo pre-emphasis with the filter ``y[n] = x[n] + k * y[n-1]`` if enabled.

    With ``inv_preemphasize`` false the input is returned untouched.
    """
    if not inv_preemphasize:
        return wav
    numerator = [1]
    denominator = [1, -k]
    return signal.lfilter(numerator, denominator, wav)
def _normalize(S, max_norm=1.0, min_level_db=-100, symmetric=False):
    """Map dB values from ``[min_level_db, 0]`` onto a clipped target range.

    The target range is ``[-max_norm, max_norm]`` when ``symmetric`` is
    set and ``[0, max_norm]`` otherwise.
    """
    unit_scaled = (S - min_level_db) / (-min_level_db)
    if not symmetric:
        return np.clip(max_norm * unit_scaled, 0, max_norm)
    return np.clip(
        (2 * max_norm) * unit_scaled - max_norm,
        -max_norm,
        max_norm,
    )
def _denormalize(D, max_norm=1.0, min_level_db=-100, symmetric=False):
    """Map normalized values back to dB in ``[min_level_db, 0]``.

    Input is clipped to ``[-max_norm, max_norm]`` (symmetric) or
    ``[0, max_norm]`` before the mapping is inverted.
    """
    if not symmetric:
        clipped = np.clip(D, 0, max_norm)
        return (clipped * -min_level_db / max_norm) + min_level_db

    clipped = np.clip(D, -max_norm, max_norm)
    return ((clipped + max_norm) * -min_level_db / (2 * max_norm)) + min_level_db
def _griffin_lim(S, n_fft, hop_length, win_length, griffin_lim_iters=60):
    """Estimate a waveform from the magnitude spectrogram ``S``.

    Starts from random phases and, for ``griffin_lim_iters`` rounds,
    re-synthesizes the signal and replaces the phases with those of its
    STFT while keeping the magnitudes fixed.
    """
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    # ``np.complex`` was only an alias of the builtin and has been removed
    # from NumPy; ``np.complex128`` is the dtype it used to resolve to.
    S_complex = np.abs(S).astype(np.complex128)
    y = _istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
    for _ in range(griffin_lim_iters):
        angles = np.exp(
            1j
            * np.angle(
                _stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
            )
        )
        y = _istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
    return y
def spectrogram(
    y,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    max_norm=1.0,
    min_level_db=-100,
    ref_level_db=20,
    symmetric=False,
):
    """Return the normalized dB-scale linear spectrogram of ``y``.

    The signal goes through ``preemphasis`` with its default arguments,
    then STFT magnitude -> dB -> minus ``ref_level_db`` -> ``_normalize``.
    """
    stft_matrix = _stft(preemphasis(y), hop_length, win_length, n_fft)
    magnitude_db = _amp_to_db(np.abs(stft_matrix))
    level_db = magnitude_db - ref_level_db
    return _normalize(level_db, max_norm, min_level_db, symmetric)
def inv_spectrogram(
    spectrogram,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    max_norm=1.0,
    min_level_db=-100,
    ref_level_db=20,
    symmetric=False,
    power=1.5,
):
    """Reconstruct a waveform from a normalized linear spectrogram.

    The spectrogram is denormalized to dB, shifted by ``ref_level_db``,
    converted to amplitude, raised to ``power`` and inverted with
    Griffin-Lim.
    """
    level_db = _denormalize(spectrogram, max_norm, min_level_db, symmetric)
    amplitude = _db_to_amp(level_db + ref_level_db)
    return _griffin_lim(amplitude ** power, n_fft, hop_length, win_length)
def _build_mel_basis(sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80):
    """Build the mel filter bank with ``librosa.filters.mel``.

    ``fmax`` must not exceed ``sample_rate // 2``.
    """
    half_rate = sample_rate // 2
    assert fmax <= half_rate
    filter_params = {
        "sr": sample_rate,
        "n_fft": n_fft,
        "n_mels": n_mels,
        "fmin": fmin,
        "fmax": fmax,
    }
    return librosa.filters.mel(**filter_params)
# mel linear Conversions
# Module-level caches, filled lazily on first use: _mel_basis by
# _linear_to_mel (the mel filter bank) and _inv_mel_basis by
# _mel_to_linear (its pseudo-inverse).
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80):
    """Project a linear-frequency spectrogram onto the mel filter bank.

    The filter bank is cached in the module-level ``_mel_basis`` and is
    rebuilt whenever the filter parameters differ from the ones the cached
    basis was built with, so a call with new settings never silently
    reuses a stale basis.
    """
    global _mel_basis
    basis_key = (sample_rate, n_fft, fmin, fmax, n_mels)
    cached_key = getattr(_linear_to_mel, "_basis_key", None)
    if _mel_basis is None or cached_key != basis_key:
        _mel_basis = _build_mel_basis(sample_rate, n_fft, fmin, fmax, n_mels)
        # Remember which parameters the cached basis belongs to.
        _linear_to_mel._basis_key = basis_key
    return np.dot(_mel_basis, spectogram)
def _mel_to_linear(
    mel_spectrogram, sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80
):
    """Approximate the linear spectrogram from a mel spectrogram.

    Uses the pseudo-inverse of the mel filter bank and floors the result
    at 1e-10. The pseudo-inverse is cached in the module-level
    ``_inv_mel_basis`` and rebuilt whenever the filter parameters differ
    from the ones the cached matrix was built with.
    """
    global _inv_mel_basis
    basis_key = (sample_rate, n_fft, fmin, fmax, n_mels)
    cached_key = getattr(_mel_to_linear, "_basis_key", None)
    if _inv_mel_basis is None or cached_key != basis_key:
        _inv_mel_basis = np.linalg.pinv(
            _build_mel_basis(sample_rate, n_fft, fmin, fmax, n_mels)
        )
        # Remember which parameters the cached matrix belongs to.
        _mel_to_linear._basis_key = basis_key
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def melspectrogram(
    y,
    sample_rate,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    n_mels=80,
    max_norm=1.0,
    min_level_db=-100,
    ref_level_db=20,
    fmin=50,
    fmax=8000,
    symmetric=False,
    preemphasize=False,
):
    """Return the normalized dB-scale mel spectrogram of ``y``, transposed.

    Pipeline: optional pre-emphasis -> STFT magnitude -> mel projection ->
    dB -> minus ``ref_level_db`` -> ``_normalize``. The normalized matrix
    is transposed before it is returned.
    """
    emphasized = preemphasis(y, preemphasize=preemphasize)
    stft_matrix = _stft(
        emphasized,
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
    )
    mel_amplitude = _linear_to_mel(
        np.abs(stft_matrix),
        sample_rate=sample_rate,
        n_fft=n_fft,
        fmin=fmin,
        fmax=fmax,
        n_mels=n_mels,
    )
    level_db = _amp_to_db(mel_amplitude) - ref_level_db
    normalized = _normalize(
        level_db, max_norm=max_norm, min_level_db=min_level_db, symmetric=symmetric
    )
    return normalized.T
def inv_mel_spectrogram(
    mel_spectrogram,
    sample_rate,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    n_mels=80,
    max_norm=1.0,
    min_level_db=-100,
    ref_level_db=20,
    fmin=50,
    fmax=8000,
    power=1.5,
    symmetric=False,
    preemphasize=False,
):
    """Reconstruct a waveform from a normalized mel spectrogram.

    Pipeline: ``_denormalize`` -> plus ``ref_level_db`` -> amplitude ->
    mel-to-linear -> raise to ``power`` -> Griffin-Lim -> optional inverse
    pre-emphasis.
    """
    level_db = _denormalize(
        mel_spectrogram,
        max_norm=max_norm,
        min_level_db=min_level_db,
        symmetric=symmetric,
    )
    mel_amplitude = _db_to_amp(level_db + ref_level_db)
    linear_amplitude = _mel_to_linear(
        mel_amplitude,
        sample_rate=sample_rate,
        n_fft=n_fft,
        fmin=fmin,
        fmax=fmax,
        n_mels=n_mels,
    )
    waveform = _griffin_lim(linear_amplitude ** power, n_fft, hop_length, win_length)
    return inv_preemphasis(waveform, preemphasize=preemphasize)
import os
from glob import glob
import numpy as np
import sox
import librosa
import pysptk
from scipy.io import wavfile
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import logging
from .dsp import _stft
# Reference cumulative distribution used by volume_normalize: a source
# file's cumulative position is looked up in this table to pick the anchor
# bin it is mapped to. 51 values rising from 0.0 to 1.0, one per edge in
# anchor_bins.
# NOTE(review): presumably measured on a reference corpus - confirm origin.
anchor_hist = np.array(
[
0.0,
0.00215827,
0.00354383,
0.00442313,
0.00490274,
0.00532907,
0.00602185,
0.00690115,
0.00810019,
0.00948574,
0.0120437,
0.01489475,
0.01873168,
0.02302158,
0.02872369,
0.03669065,
0.04636291,
0.05843325,
0.07700506,
0.11052491,
0.16802558,
0.25997868,
0.37942979,
0.50730083,
0.62006395,
0.71092459,
0.76877165,
0.80762057,
0.83458566,
0.85672795,
0.87660538,
0.89251266,
0.90578204,
0.91569411,
0.92541966,
0.93383959,
0.94162004,
0.94940048,
0.95539568,
0.96136424,
0.9670397,
0.97290168,
0.97705835,
0.98116174,
0.98465228,
0.98814282,
0.99152678,
0.99421796,
0.9965894,
0.99840128,
1.0,
]
)
# Bin edges (51 evenly spaced values) of the target RMS amplitude that
# volume_normalize maps each file onto; indexed in parallel with
# anchor_hist.
anchor_bins = np.array(
[
0.033976,
0.03529014,
0.03660428,
0.03791842,
0.03923256,
0.0405467,
0.04186084,
0.04317498,
0.04448912,
0.04580326,
0.0471174,
0.04843154,
0.04974568,
0.05105982,
0.05237396,
0.0536881,
0.05500224,
0.05631638,
0.05763052,
0.05894466,
0.0602588,
0.06157294,
0.06288708,
0.06420122,
0.06551536,
0.0668295,
0.06814364,
0.06945778,
0.07077192,
0.07208606,
0.0734002,
0.07471434,
0.07602848,
0.07734262,
0.07865676,
0.0799709,
0.08128504,
0.08259918,
0.08391332,
0.08522746,
0.0865416,
0.08785574,
0.08916988,
0.09048402,
0.09179816,
0.0931123,
0.09442644,
0.09574058,
0.09705472,
0.09836886,
0.099683,
]
)
# Number of histogram bins used by volume_normalize for the source RMS
# distribution; also the index of the last edge in the anchor tables.
hist_bins = 50
def amp_info(wav_file_path):
    """Collect amplitude statistics of one wav file via ``sox``.

    Returns a dict with the RMS, maximum and mean amplitude, the length in
    seconds and the file's basename.
    """
    stats = sox.file_info.stat(wav_file_path)
    summary = {
        "amp_rms": stats["RMS amplitude"],
        "amp_max": stats["Maximum amplitude"],
        "amp_mean": stats["Mean amplitude"],
        "length": stats["Length (seconds)"],
    }
    summary["basename"] = os.path.basename(wav_file_path)
    return summary
# TODO: multi-processing
def statistic_amplitude(src_wav_dir):
    """Gather ``amp_info`` for every ``*.wav`` in ``src_wav_dir``.

    The files are analysed in a pool of 8 worker processes. The resulting
    list of dicts is sorted by ascending RMS amplitude, and the average RMS
    is logged.
    """
    wav_paths = glob(os.path.join(src_wav_dir, "*.wav"))
    with ProcessPoolExecutor(max_workers=8) as executor, tqdm(
        total=len(wav_paths)
    ) as progress:
        jobs = []
        for path in wav_paths:
            job = executor.submit(amp_info, path)
            job.add_done_callback(lambda _done: progress.update())
            jobs.append(job)

        amp_info_lst = [job.result() for job in jobs]

    amp_info_lst.sort(key=lambda entry: entry["amp_rms"])

    rms_values = [entry["amp_rms"] for entry in amp_info_lst]
    logging.info("Average amplitude RMS : {}".format(np.mean(rms_values)))

    return amp_info_lst
# TODO: multi process
def volume_normalize(src_wav_dir, out_wav_dir):
    """Rescale every wav so the RMS distribution follows the anchor tables.

    The cumulative RMS histogram of the source files is matched against
    ``anchor_hist``/``anchor_bins``: each file's RMS is mapped to a target
    RMS and its samples are multiplied by ``target / source``.

    Args:
        src_wav_dir: Directory with the source ``*.wav`` files.
        out_wav_dir: Existing directory receiving the rescaled files under
            the same basenames.

    Returns:
        ``True`` (also when the source directory holds no wav files).
    """
    logging.info("Volume statistic proceeding...")
    amp_info_lst = statistic_amplitude(src_wav_dir)
    logging.info("Volume statistic done.")

    if not amp_info_lst:
        # Nothing to do; also avoids building a histogram from no data.
        logging.warning("No wav files found in %s", src_wav_dir)
        return True

    rms_amp_lst = [x["amp_rms"] for x in amp_info_lst]
    src_hist, src_bins = np.histogram(rms_amp_lst, bins=hist_bins, density=True)
    # Turn the density into a cumulative distribution with a leading 0 so
    # that it has one value per bin edge, like anchor_hist.
    src_hist = src_hist / np.sum(src_hist)
    src_hist = np.cumsum(src_hist)
    src_hist = np.insert(src_hist, 0, 0.0)

    int16_limits = np.iinfo(np.int16)

    logging.info("Volume normalization proceeding...")
    for info in tqdm(amp_info_lst):
        src_rms = info["amp_rms"]
        rms_amp = np.clip(src_rms, src_bins[0], src_bins[-1])

        src_idx = np.where(rms_amp >= src_bins)[0][-1]
        src_pos = src_hist[src_idx]
        anchor_idx = np.where(src_pos >= anchor_hist)[0][-1]

        if src_idx == hist_bins or anchor_idx == hist_bins:
            rms_amp = anchor_bins[-1]
        else:
            # Linear interpolation inside the matched source/anchor bins.
            rms_amp = (rms_amp - src_bins[src_idx]) / (
                src_bins[src_idx + 1] - src_bins[src_idx]
            ) * (anchor_bins[anchor_idx + 1] - anchor_bins[anchor_idx]) + anchor_bins[
                anchor_idx
            ]

        if src_rms > 0:
            scale = rms_amp / src_rms
        else:
            # A completely silent file cannot be rescaled; copy it as-is
            # instead of dividing by zero.
            scale = 1.0

        # NOTE(review): assumes the source files are 16-bit PCM - confirm.
        sr, data = wavfile.read(os.path.join(src_wav_dir, info["basename"]))
        # Saturate instead of letting the int16 cast wrap around when the
        # gain pushes samples out of range.
        scaled = np.clip(data * scale, int16_limits.min, int16_limits.max)
        wavfile.write(
            os.path.join(out_wav_dir, info["basename"]),
            sr,
            scaled.astype(np.int16),
        )
    logging.info("Volume normalization done.")

    return True
def interp_f0(f0_data):
    """Fill unvoiced frames of an f0 contour by linear interpolation.

    Values below 1 are treated as unvoiced and set to 0 **in place** in
    ``f0_data``. The gaps are then filled by interpolating between the
    neighbouring voiced frames (edges are held at the nearest voiced
    value).

    Returns:
        A new 1-D ``float32`` contour with ``f0_data.size`` entries. When
        no frame is voiced there is nothing to interpolate from and an
        all-zero contour is returned.
    """
    f0_data[f0_data < 1] = 0
    voiced = np.nonzero(f0_data)
    if voiced[0].size == 0:
        # np.interp rejects an empty set of sample points.
        return np.zeros(f0_data.size, dtype=np.float32)

    frame_index = np.arange(f0_data.size)
    contour_f0 = np.interp(frame_index, voiced[0], f0_data[voiced])
    return contour_f0.astype(np.float32)
def frame_nccf(x, y):
    """Return the normalized cross-correlation of two frames, mapped to [0, 1].

    The raw correlation in [-1, 1] is shifted and halved; a tiny constant
    keeps the normalization finite for all-zero frames.
    """
    energy_product = np.sum(x ** 2.0) * np.sum(y ** 2.0)
    norm_coef = (energy_product + 1e-30) ** 0.5
    correlation = np.sum(x * y) / norm_coef
    return (correlation + 1.0) / 2.0
def get_nccf(pcm_data, f0, min_f0=40, max_f0=800, fs=160, sr=16000):
    """Compute a per-frame normalised cross-correlation at the pitch lag.

    For every frame a window of ``sr / 200`` samples is compared with the
    window one pitch period later, both with their mean removed.

    Args:
        pcm_data: Waveform; int16 input is scaled to [-1, 1) first.
        f0: Per-frame F0 values; each is clipped to [min_f0, max_f0].
        min_f0: Lower F0 bound, also used to size the zero padding.
        max_f0: Upper F0 bound.
        fs: Frame shift in samples.
        sr: Sampling rate in Hz.

    Returns:
        float32 array with one value in [0, 1] per frame.
    """
    if pcm_data.dtype == np.int16:
        pcm_data = pcm_data.astype(np.float32) / 32768

    frame_len = int(sr / 200)
    frame_num = min(int(len(pcm_data) // fs), len(f0))

    # Pad both ends so the lagged window never runs off the signal.
    pad_len = int(sr / min_f0) + frame_len
    pad_zeros = np.zeros([pad_len], dtype=np.float32)
    data = np.hstack((pad_zeros, pcm_data.astype(np.float32), pad_zeros))

    nccf = np.zeros((frame_num), dtype=np.float32)
    for i in range(frame_num):
        curr_f0 = np.clip(f0[i], min_f0, max_f0)
        lag = int(sr / curr_f0 + 0.5)
        j = i * fs + pad_len - frame_len // 2

        # Slices are views into `data`: subtract the mean out of place so the
        # shared buffer (and any overlapping window) is left untouched.
        l_data = data[j : j + frame_len]
        l_data = l_data - l_data.mean()

        r_data = data[j + lag : j + lag + frame_len]
        r_data = r_data - r_data.mean()

        nccf[i] = frame_nccf(l_data, r_data)

    return nccf
def smooth(data, win_len):
    """Smooth a sequence with a normalised Hann window.

    Args:
        data: Array to smooth; it is flattened and left unmodified.
        win_len: Window length in samples. Even values are bumped to the
            next odd value so the window has a centre sample.

    Returns:
        Column vector of shape ``(N, 1)`` with the dtype of ``data``. The
        edges are handled by repeating the first and last sample.
    """
    if win_len % 2 == 0:
        win_len += 1
    hwin = win_len // 2

    win = np.hanning(win_len)
    win /= win.sum()

    flat = data.reshape([-1])
    pad_data = np.pad(flat, hwin, mode="edge")

    # The window is symmetric, so convolution equals the sliding dot product.
    # The result goes to a new array: `flat` may be a view of the caller's
    # data, which must not be overwritten.
    smoothed = np.convolve(pad_data, win, mode="valid").astype(flat.dtype)
    return smoothed.reshape([-1, 1])
# TODO: pysptk only supports two F0 estimation methods for now.
# supported: rapt, swipe
# unsupported: reaper, world (DIO)
def RAPT_FUNC(v1, v2, v3, v4, v5):
    """Run pysptk's RAPT F0 estimator.

    Args:
        v1: Waveform samples; cast to float32 before the call.
        v2: Forwarded as ``fs``.
        v3: Forwarded as ``hopsize``.
        v4: Forwarded as ``min``.
        v5: Forwarded as ``max``.

    Returns:
        Whatever ``pysptk.sptk.rapt`` returns for these arguments.
    """
    samples = v1.astype(np.float32)
    return pysptk.sptk.rapt(samples, fs=v2, hopsize=v3, min=v4, max=v5)
def SWIPE_FUNC(v1, v2, v3, v4, v5):
    """Run pysptk's SWIPE F0 estimator.

    Args:
        v1: Waveform samples; cast to float64 before the call.
        v2: Forwarded as ``fs``.
        v3: Forwarded as ``hopsize``.
        v4: Forwarded as ``min``.
        v5: Forwarded as ``max``.

    Returns:
        Whatever ``pysptk.sptk.swipe`` returns for these arguments.
    """
    samples = v1.astype(np.float64)
    return pysptk.sptk.swipe(samples, fs=v2, hopsize=v3, min=v4, max=v5)
def PYIN_FUNC(v1, v2, v3, v4, v5):
    """Run librosa's pYIN F0 estimator and zero out undefined frames.

    Args:
        v1: Waveform samples; cast to float32 before the call.
        v2: Forwarded as ``sr``.
        v3: Hop size; four times this value is forwarded as ``frame_length``.
        v4: Forwarded as ``fmin``.
        v5: Forwarded as ``fmax``.

    Returns:
        First element of the ``librosa.pyin`` result with NaN entries
        replaced by 0.0.
    """
    samples = v1.astype(np.float32)
    pyin_result = librosa.pyin(
        samples, sr=v2, frame_length=v3 * 4, fmin=v4, fmax=v5
    )
    f0_track = pyin_result[0]
    return np.where(np.isnan(f0_track), 0.0, f0_track)
def get_pitch(pcm_data, sampling_rate=16000, hop_length=160):
    """Estimate a smoothed F0 contour and voicing decision for a waveform.

    A first RAPT pass over 40-800 Hz is used to narrow the search range.
    RAPT and SWIPE are then run on that range; each usable track is turned
    into an interpolated log-F0 contour, the contours are combined with a
    per-frame median and smoothed with a 5-point window.

    Args:
        pcm_data: Waveform samples.
        sampling_rate: Sampling rate in Hz.
        hop_length: Frame shift in samples.

    Returns:
        ``None`` when no estimator produced a usable track, otherwise the
        tuple ``(f0, uv, f0 * uv)`` of equally long column vectors, where
        ``uv`` is 1.0 for voiced frames and 0.0 otherwise.
    """
    f0_floor, f0_ceil = 40, 800

    # Calibration pass: only used to tighten the search range below.
    coarse_f0 = pysptk.sptk.rapt(
        pcm_data.astype(np.float32),
        fs=sampling_rate,
        hopsize=hop_length,
        min=f0_floor,
        max=f0_ceil,
    )
    distinct_f0 = np.sort(np.unique(coarse_f0))
    if len(distinct_f0) > 20:
        f0_floor = max(distinct_f0[10] - 50, f0_floor)
        f0_ceil = min(distinct_f0[-10] + 50, f0_ceil)

    estimators = {"rapt": RAPT_FUNC, "swipe": SWIPE_FUNC}
    contours = []
    voicing = []
    for method, estimate in estimators.items():
        track = estimate(pcm_data, sampling_rate, hop_length, f0_floor, f0_ceil)
        if len(track) < 10 or track.max() < f0_floor:
            logging.error("{} method: calc F0 is too low.".format(method))
            continue
        voiced_mask = track > 0
        # Clip before the log so unvoiced (zero) frames stay finite.
        log_track = np.log(np.clip(track, 1e-30, f0_ceil))
        contours.append(interp_f0(log_track))
        voicing.append(voiced_mask)

    if len(contours) == 0:
        logging.error("F0 estimation failed.")
        return None

    # The estimators may disagree on the frame count: cut to the shortest.
    common_len = min(contour.shape[0] for contour in contours)
    stacked_log_f0 = np.zeros([len(contours), common_len], dtype=np.float32)
    stacked_uv = np.zeros([len(contours), common_len], dtype=np.float32)
    for row, (contour, mask) in enumerate(zip(contours, voicing)):
        stacked_log_f0[row, :] = contour[:common_len]
        stacked_uv[row, :] = mask[:common_len]

    log_f0 = smooth(np.median(stacked_log_f0, axis=0), 5)
    uv = (smooth(np.median(stacked_uv, axis=0), 5) > 0.5).astype(np.float32)
    f0 = np.exp(log_f0)

    # nccf = get_nccf(
    #     pcm_data, f0, min_f0=low, max_f0=high, fs=hop_length, sr=sampling_rate
    # )

    final_len = min(f0.shape[0], uv.shape[0])
    f0 = f0[:final_len]
    uv = uv[:final_len]
    return f0, uv, f0 * uv
# TODO: some DSP functions are not implemented.
def get_energy(pcm_data, hop_length, win_length, n_fft):
    """Return the per-frame spectral energy of a waveform.

    The energy of a frame is the L2 norm of its STFT magnitude column.

    Args:
        pcm_data: Waveform samples.
        hop_length: Forwarded to ``_stft``.
        win_length: Forwarded to ``_stft``.
        n_fft: Forwarded to ``_stft``.

    Returns:
        Column vector of shape ``(frames, 1)``.
    """
    spectrum = _stft(pcm_data, hop_length, win_length, n_fft)
    magnitude = librosa.magphase(spectrum)[0]
    frame_energy = np.sqrt(np.sum(magnitude ** 2, axis=0))
    return frame_energy.reshape((-1, 1))
def align_length(in_data, tgt_data, basename=None):
    """Pad or cut ``in_data`` along axis 0 to the length of ``tgt_data``.

    Args:
        in_data: Array to adjust; any number of dimensions.
        tgt_data: Array whose first-axis length is the target.
        basename: Identifier used in error messages only.

    Returns:
        ``in_data`` zero-padded at the end or truncated to the target
        length, or ``None`` when either input is ``None`` or the lengths
        differ by more than 20.
    """
    if in_data is None or tgt_data is None:
        logging.error("{}: Input data is None.".format(basename))
        return None

    in_len = in_data.shape[0]
    tgt_len = tgt_data.shape[0]
    if abs(in_len - tgt_len) > 20:
        logging.error(
            "{}: Input data length mismatches with target data length too much.".format(
                basename
            )
        )
        return None

    if in_len < tgt_len:
        # Pad only the first axis; build the spec from ndim so 1-D input
        # works just like the truncation branch already does.
        pad_width = [(0, tgt_len - in_len)] + [(0, 0)] * (in_data.ndim - 1)
        return np.pad(in_data, pad_width, "constant", constant_values=0.0)

    return in_data[:tgt_len]
def compute_mean(data_list, dims=80):
    """Compute the per-dimension mean over all frames of a feature list.

    Args:
        data_list: Iterable of feature arrays; ``None`` entries are skipped.
            Each array is reshaped to ``(-1, dims)``.
        dims: Feature dimensionality.

    Returns:
        Array of shape ``(1, dims)`` holding the mean of every dimension.
    """
    running_sum = np.zeros((1, dims))
    frame_total = 0
    for item in tqdm(data_list):
        if item is None:
            continue
        frames = item.reshape((-1, dims))
        running_sum += np.sum(frames, axis=0)
        frame_total += frames.shape[0]

    running_sum /= float(frame_total)
    return running_sum
def compute_std(data_list, mean_vector, dims=80):
    """Compute the per-dimension standard deviation of a feature list.

    Args:
        data_list: Iterable of feature arrays; ``None`` entries are skipped.
            Each array is reshaped to ``(-1, dims)``.
        mean_vector: Per-dimension mean, as produced by ``compute_mean``.
        dims: Feature dimensionality.

    Returns:
        Array of shape ``(1, dims)`` with the population standard deviation
        (squared deviations divided by the frame count).
    """
    squared_dev_sum = np.zeros((1, dims))
    frame_total = 0
    for item in tqdm(data_list):
        if item is None:
            continue
        frames = item.reshape((-1, dims))
        n_frames = frames.shape[0]
        repeated_mean = np.tile(mean_vector, (n_frames, 1))
        squared_dev_sum += np.sum((frames - repeated_mean) ** 2, axis=0)
        frame_total += n_frames

    squared_dev_sum /= float(frame_total)
    return squared_dev_sum ** 0.5
# Fixed ranges used by the min/max (de)normalisation helpers below, which
# map [MIN, MAX] linearly onto [-1, 1].
F0_MIN = 0.0
F0_MAX = 800.0
ENERGY_MIN = 0.0
ENERGY_MAX = 200.0
# Values at or below this floor are treated as zero by f0_norm_min_max and
# norm_log.
CLIP_FLOOR = 1e-3
def f0_norm_min_max(f0):
    """Map F0 values from [F0_MIN, F0_MAX] linearly onto [-1, 1].

    Entries at or below ``CLIP_FLOOR`` are set to 0.0 in the result
    (selected by their index along the first axis).
    """
    floor_rows = np.nonzero(f0 <= CLIP_FLOOR)[0]
    span = F0_MAX - F0_MIN
    normalized = (2 * f0 - F0_MIN - F0_MAX) / span
    normalized[floor_rows] = 0.0
    return normalized
def f0_denorm_min_max(f0):
    """Map normalised F0 values from [-1, 1] back onto [F0_MIN, F0_MAX].

    Entries that are exactly 0.0 stay 0.0 in the result (selected by their
    index along the first axis).
    """
    zero_rows = np.nonzero(f0 == 0.0)[0]
    span = F0_MAX - F0_MIN
    restored = (f0 * span + F0_MIN + F0_MAX) / 2
    restored[zero_rows] = 0.0
    return restored
def energy_norm_min_max(energy):
    """Map energy values from [ENERGY_MIN, ENERGY_MAX] linearly onto [-1, 1].

    Entries that are exactly 0.0 stay 0.0 in the result (selected by their
    index along the first axis).
    """
    zero_rows = np.nonzero(energy == 0.0)[0]
    span = ENERGY_MAX - ENERGY_MIN
    normalized = (2 * energy - ENERGY_MIN - ENERGY_MAX) / span
    normalized[zero_rows] = 0.0
    return normalized
def energy_denorm_min_max(energy):
    """Map normalised energy from [-1, 1] back onto [ENERGY_MIN, ENERGY_MAX].

    Entries that are exactly 0.0 stay 0.0 in the result (selected by their
    index along the first axis).
    """
    zero_rows = np.nonzero(energy == 0.0)[0]
    span = ENERGY_MAX - ENERGY_MIN
    restored = (energy * span + ENERGY_MIN + ENERGY_MAX) / 2
    restored[zero_rows] = 0.0
    return restored
def norm_log(x):
    """Return the natural log of ``x`` with near-zero entries mapped to 0.0.

    Entries at or below ``CLIP_FLOOR`` (selected by their index along the
    first axis) are replaced by 1.0 before the log, so they come out as 0.0.

    Args:
        x: Array of non-negative values. It is not modified.

    Returns:
        Array of the same shape holding ``log(x)``.
    """
    # Substitute on a copy: assigning into `x` itself would silently change
    # the caller's data.
    floored = np.array(x, copy=True)
    zero_idxs = np.where(floored <= CLIP_FLOOR)[0]
    floored[zero_idxs] = 1.0
    return np.log(floored)
def denorm_log(x):
    """Invert ``norm_log``: exponentiate, keeping exact zeros at 0.0.

    Entries of ``x`` that are exactly 0.0 (selected by their index along
    the first axis) are set to 0.0 in the result instead of ``exp(0) == 1``.
    """
    zero_rows = np.nonzero(x == 0.0)[0]
    restored = np.exp(x)
    restored[zero_rows] = 0.0
    return restored
def f0_norm_mean_std(x, mean, std):
    """Standardise F0 values, keeping exact zeros at 0.0.

    Args:
        x: Array of F0 values; entries equal to 0.0 are left at 0.0
            (selected by their index along the first axis).
        mean: Mean to subtract.
        std: Standard deviation to divide by.

    Returns:
        New array with ``(x - mean) / std`` for the non-zero entries.
    """
    zero_rows = np.nonzero(x == 0.0)[0]
    standardized = (x - mean) / std
    standardized[zero_rows] = 0.0
    return standardized
def norm_mean_std(x, mean, std):
    """Return ``x`` standardised as ``(x - mean) / std``."""
    return (x - mean) / std
# TODO: This is a hard-coded implementation for MIT-style interval labels.
# TODO: Try to implement a more general version.
def parse_interval_file(file_path, sampling_rate, hop_length):
    """Read phone durations (in frames) and labels from an interval file.

    The first 12 lines are skipped. The rest is read in groups of three
    lines: begin time, end time (both in seconds) and the phone label, whose
    first and last character are dropped.

    Args:
        file_path: Path of the interval file.
        sampling_rate: Sampling rate in Hz.
        hop_length: Frame shift in samples.

    Returns:
        ``(durations, phones)`` where ``durations`` is an integer array of
        frame counts and ``phones`` a list of labels, or ``None`` when the
        file holds no interval.
    """
    with open(file_path, "r") as f:
        lines = f.readlines()

    header_lines = 12
    # Duration of one frame in seconds.
    seconds_per_frame = 1.0 * hop_length / sampling_rate

    durations = []
    phones = []
    for first in range(header_lines, len(lines), 3):
        begin = float(lines[first])
        end = float(lines[first + 1])
        label = lines[first + 2].strip()[1:-1]
        durations.append(int(round((end - begin) / seconds_per_frame)))
        phones.append(label)

    if len(durations) == 0 or len(phones) == 0:
        return None

    return np.array(durations), phones
def average_by_duration(x, durs):
    """Average a frame-level feature over each symbol's duration.

    Args:
        x: Frame-level values, indexed along the first axis.
        durs: Integer array with the number of frames of every symbol.

    Returns:
        float32 array with one value per symbol: the mean of the non-zero
        frames of that symbol, or 0.0 when it has none. ``None`` when either
        input is ``None``.
    """
    if x is None or durs is None:
        return None

    # Frame boundaries: boundaries[k] .. boundaries[k + 1] spans symbol k.
    boundaries = np.cumsum(np.pad(durs, (1, 0), "constant"))

    n_symbols = durs.shape[0]
    averaged = np.zeros((n_symbols,), dtype=np.float32)
    for k in range(n_symbols):
        segment = x[boundaries[k] : boundaries[k + 1]]
        non_zero = segment[np.where(segment != 0.0)[0]]
        if len(non_zero) > 0:
            averaged[k] = np.mean(non_zero)
        else:
            averaged[k] = 0.0

    return averaged.astype(np.float32)
def encode_16bits(x):
    """Convert samples strictly inside (-1, 1) to 16-bit integers.

    Args:
        x: Array of samples.

    Returns:
        An int16 array scaled by 2**15 when every sample lies strictly
        between -1.0 and 1.0; otherwise ``x`` itself, unchanged.
    """
    inside_unit_range = x.min() > -1.0 and x.max() < 1.0
    if not inside_unit_range:
        return x

    full_scale = 2 ** 15
    return np.clip(x * full_scale, -full_scale, full_scale - 1).astype(np.int16)
if __name__ == "__main__":
    import sys

    # Manual smoke test: track the pitch of the wav file named by the first
    # command-line argument and print the raw result.
    # NOTE(review): the rate read from the file is not used; 24000 Hz with a
    # 240-sample hop is passed regardless -- confirm this is intended.
    sr, pcm_data = wavfile.read(sys.argv[1])
    print(get_pitch(pcm_data, 24000, 240))
import logging
import os
import sys
import argparse
import yaml
import time
import codecs
# Directory one level above the one containing this file.
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402
# Put ROOT_PATH's parent first on sys.path -- presumably so the `kantts`
# imports below resolve when this file is run as a script.
sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402
try:
from kantts.preprocess.audio_processor.audio_processor import AudioProcessor
from kantts.preprocess.se_processor.se_processor import SpeakerEmbeddingProcessor
from kantts.preprocess.script_convertor.TextScriptConvertor import (
TextScriptConvertor,
)
from kantts.preprocess.fp_processor import FpProcessor, is_fp_line
from kantts.preprocess.languages import languages
from kantts.datasets.dataset import AM_Dataset, Voc_Dataset
from kantts.utils.log import logging_to_file, get_git_revision_hash
except ImportError:
raise ImportError("Please install kantts.")
# Root logger: INFO level, timestamped records carrying the emitting file
# name and line number.
logging.basicConfig(
format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
# `languages` directory next to this file; process_data joins the
# per-language resource paths onto it.
LANGUAGES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "languages")
def gen_metafile(
    voice_output_dir,
    fp_enable=False,
    badlist=None,
    split_ratio=0.98,
):
    """Create the train/valid list files that are still missing.

    The vocoder lists (``train.lst`` / ``valid.lst``) and the acoustic-model
    lists (``am_train.lst`` / ``am_valid.lst``) are generated when at least
    one file of the pair does not exist. With ``fp_enable`` the same is done
    for the ``fpadd`` and ``fprm`` metafiles.

    Args:
        voice_output_dir: Directory holding the processed voice data; all
            list files are located here.
        fp_enable: Also generate the filled-pause list files.
        badlist: Forwarded to ``AM_Dataset.gen_metafile``.
        split_ratio: Forwarded to the dataset generators.
    """

    def _pair_missing(first, second):
        # A pair is regenerated as soon as one of its two files is absent.
        return not os.path.exists(first) or not os.path.exists(second)

    voc_train_meta = os.path.join(voice_output_dir, "train.lst")
    voc_valid_meta = os.path.join(voice_output_dir, "valid.lst")
    if _pair_missing(voc_train_meta, voc_valid_meta):
        Voc_Dataset.gen_metafile(
            os.path.join(voice_output_dir, "wav"),
            voice_output_dir,
            split_ratio,
        )
        logging.info("Voc metafile generated.")

    # (source metafile, train list, valid list, log message)
    am_jobs = [
        (
            "raw_metafile.txt",
            "am_train.lst",
            "am_valid.lst",
            "AM metafile generated.",
        ),
    ]
    if fp_enable:
        am_jobs.append(
            (
                "fpadd_metafile.txt",
                "am_fpadd_train.lst",
                "am_fpadd_valid.lst",
                "AM fpaddmetafile generated.",
            )
        )
        am_jobs.append(
            (
                "fprm_metafile.txt",
                "am_fprm_train.lst",
                "am_fprm_valid.lst",
                "AM fprmmetafile generated.",
            )
        )

    for metafile_name, train_name, valid_name, done_message in am_jobs:
        metafile = os.path.join(voice_output_dir, metafile_name)
        am_train_meta = os.path.join(voice_output_dir, train_name)
        am_valid_meta = os.path.join(voice_output_dir, valid_name)
        if _pair_missing(am_train_meta, am_valid_meta):
            AM_Dataset.gen_metafile(
                metafile,
                voice_output_dir,
                am_train_meta,
                am_valid_meta,
                badlist,
                split_ratio,
            )
            logging.info(done_message)
# TODO: Zh-CN as default
def process_data(
    voice_input_dir,
    voice_output_dir,
    audio_config,
    speaker_name=None,
    targetLang="PinYin",
    skip_script=False,
    se_model=None,
):
    """Run the whole preprocessing pipeline for one voice.

    Steps, in order: script conversion (unless skipped), filled-pause
    processing (when the prosody file carries FP labels), audio processing,
    speaker-embedding extraction (when enabled in the config) and generation
    of the train/valid list files.

    Args:
        voice_input_dir: Directory with the raw voice data (``prosody/``,
            optional ``text/`` and ``emotion_tag.txt``).
        voice_output_dir: Directory receiving all generated files.
        audio_config: Path of the YAML configuration file.
        speaker_name: Speaker name; defaults to the basename of
            ``voice_input_dir``.
        targetLang: Key into ``languages`` selecting the language resources.
        skip_script: Skip script conversion and filled-pause detection.
        se_model: Passed to ``SpeakerEmbeddingProcessor.process``.

    Raises:
        ValueError: If ``audio_config`` is ``None``.
    """
    foreignLang = "EnUS"

    # check if the voice is supported
    if not os.path.exists(os.path.join(voice_input_dir, "emotion_tag.txt")):
        emo_tag_path = None
    else:
        emo_tag_path = os.path.join(voice_input_dir, "emotion_tag.txt")

    phoneset_path = os.path.join(
        LANGUAGES_DIR, targetLang, languages[targetLang]["phoneset_path"]
    )
    posset_path = os.path.join(
        LANGUAGES_DIR, targetLang, languages[targetLang]["posset_path"]
    )
    f2t_map_path = os.path.join(
        LANGUAGES_DIR, targetLang, languages[targetLang]["f2t_map_path"]
    )
    s2p_map_path = os.path.join(
        LANGUAGES_DIR, targetLang, languages[targetLang]["s2p_map_path"]
    )

    # dir of plain text/sentences for training byte based model
    plain_text_dir = os.path.join(voice_input_dir, "text")

    if speaker_name is None:
        speaker_name = os.path.basename(voice_input_dir)

    # Everything below needs the parsed config; fail with a clear message
    # instead of an unbound-variable error further down.
    if audio_config is None:
        raise ValueError("audio_config is required.")

    # NOTE(security): yaml.Loader can construct arbitrary Python objects;
    # only feed trusted configuration files to this function.
    with open(audio_config, "r") as f:
        config = yaml.load(f, Loader=yaml.Loader)

    config["create_time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    config["git_revision_hash"] = get_git_revision_hash()

    se_enable = config["audio_config"].get("se_feature", False)

    with open(os.path.join(voice_output_dir, "audio_config.yaml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper, default_flow_style=None)

    if skip_script:
        logging.info("Skip script conversion")
    raw_metafile = None
    # Filled-pause detection only happens during script conversion, so this
    # default keeps the skip_script path from hitting an unbound variable.
    fp_enable = False

    # Script processor
    if not skip_script:
        if os.path.exists(plain_text_dir):
            TextScriptConvertor.turn_text_into_bytes(
                os.path.join(plain_text_dir, "text.txt"),
                os.path.join(voice_output_dir, "raw_metafile.txt"),
                speaker_name,
            )
            fp_enable = False
        else:
            tsc = TextScriptConvertor(
                phoneset_path,
                posset_path,
                targetLang,
                foreignLang,
                f2t_map_path,
                s2p_map_path,
                emo_tag_path,
                speaker_name,
            )
            tsc.process(
                os.path.join(voice_input_dir, "prosody", "prosody.txt"),
                os.path.join(voice_output_dir, "Script.xml"),
                os.path.join(voice_output_dir, "raw_metafile.txt"),
            )
            prosody = os.path.join(voice_input_dir, "prosody", "prosody.txt")
            # FP processor: the second prosody line tells whether the file
            # carries filled-pause labels.
            with codecs.open(prosody, "r", "utf-8") as f:
                lines = f.readlines()
                fp_enable = is_fp_line(lines[1])
        raw_metafile = os.path.join(voice_output_dir, "raw_metafile.txt")

    if fp_enable:
        FP = FpProcessor()
        FP.process(
            voice_output_dir,
            prosody,
            raw_metafile,
        )
        logging.info("Processing fp done.")

    # Audio processor
    ap = AudioProcessor(config["audio_config"])
    ap.process(
        voice_input_dir,
        voice_output_dir,
        raw_metafile,
    )
    logging.info("Processing audio done.")

    # SpeakerEmbedding processor
    if se_enable:
        sep = SpeakerEmbeddingProcessor()
        sep.process(
            voice_output_dir,
            se_model,
        )
        logging.info("Processing speaker embedding done.")

    logging.info("Processing done.")

    # Generate Voc&AM metafile
    # TODO: train/valid ratio setting
    gen_metafile(voice_output_dir, fp_enable, ap.badcase_list)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, mirror the log into the
    # output directory and run the preprocessing pipeline.
    parser = argparse.ArgumentParser(description="Dataset preprocessor")
    parser.add_argument("--voice_input_dir", type=str, required=True)
    parser.add_argument("--voice_output_dir", type=str, required=True)
    parser.add_argument("--audio_config", type=str, required=True)
    parser.add_argument("--speaker", type=str, default=None, help="speaker")
    parser.add_argument("--lang", type=str, default="PinYin", help="target language")
    parser.add_argument(
        "--se_model",
        type=str,
        default="../pre_data/speaker_embeddding/se.*",
        help="speaker embedding extractor model",
    )
    parser.add_argument(
        "--skip_script", action="store_true", help="skip script converting"
    )
    args = parser.parse_args()

    os.makedirs(args.voice_output_dir, exist_ok=True)
    logging_to_file(os.path.join(args.voice_output_dir, "data_process_stdout.log"))

    try:
        process_data(
            args.voice_input_dir,
            args.voice_output_dir,
            args.audio_config,
            args.speaker,
            args.lang,
            args.skip_script,
            args.se_model,
        )
    except (Exception, KeyboardInterrupt) as e:
        # Keep the traceback in the log, then report the failure to the
        # caller: without a non-zero status a failed run looks successful.
        logging.error(e, exc_info=True)
        sys.exit(1)
import os
import logging
import random
def is_fp_line(line):
    """Tell whether a line consists solely of filled-pause labels.

    The line is stripped and split on single spaces; it qualifies when every
    resulting token is one of ``FP``, ``I``, ``N`` or ``Q``.
    """
    fp_category_list = ["FP", "I", "N", "Q"]
    return all(token in fp_category_list for token in line.strip().split(" "))
class FpProcessor:
    """Derive the filled-pause ("FP") variants of a raw metafile.

    ``addfp`` writes ``fpadd_metafile.txt``, in which the symbols of
    filled-pause syllables carry ``emotion_disgust`` and all others
    ``emotion_neutral``. ``removefp`` writes ``fprm_metafile.txt``, in which
    the symbols tagged that way are dropped from the raw sequences.
    """

    def __init__(self):
        # TODO: Add more audio processing methods.
        self.res = []

    @staticmethod
    def is_fp_line(line):
        """Tell whether every space-separated token of ``line`` is an FP label."""
        fp_category_list = ["FP", "I", "N", "Q"]
        return all(ele in fp_category_list for ele in line.strip().split(" "))

    # TODO: adjust idx judgment rule
    def addfp(self, voice_output_dir, prosody, raw_metafile_lines):
        """Write ``fpadd_metafile.txt`` and return its path.

        Args:
            voice_output_dir: Directory receiving ``fpadd_metafile.txt``.
            prosody: Path of the prosody file providing the FP labels.
            raw_metafile_lines: Lines of the raw metafile, each
                ``<uttname>\\t<symbols separated by spaces>``; lines of any
                other shape are skipped.

        Returns:
            Path of the written metafile.
        """
        fp_category_list = ["FP", "I", "N"]

        # The prosody file is read as UTF-8 elsewhere in the preprocessing
        # pipeline, so do not depend on the platform default encoding here.
        with open(prosody, encoding="utf-8") as f:
            prosody_lines = f.readlines()

        # Collect one label list per utterance id.
        idx = ""
        fp = ""
        fp_label_dict = {}
        i = 0
        while i < len(prosody_lines):
            fields = prosody_lines[i].strip().split("\t")
            if len(fields) == 2:
                # "<id>\t<text>" header line of the next utterance.
                idx = fields[0]
                i += 1
                continue

            if self.is_fp_line(prosody_lines[i]):
                fp = fields[0].split(" ")
                for label in fp:
                    if label not in fp_category_list:
                        logging.warning("fp label not in fp_category_list")
                        break
                i += 4
            else:
                # No explicit labels: one "N" per space-separated token once
                # the "/ " and ". " markers are removed.
                token_count = len(
                    fields[0].replace("/ ", "").replace(". ", "").split(" ")
                )
                fp = ["N"] * token_count
                i += 1
            fp_label_dict[idx] = fp

        fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt")
        with open(fpadd_metafile, "w") as f_out:
            for line in raw_metafile_lines:
                tokens = line.strip().split("\t")
                if len(tokens) != 2:
                    continue
                uttname = tokens[0]
                fp_labels = fp_label_dict[uttname]

                error_flag = False
                idx = 0
                out_symbols = []
                for this_symbol_sequence in tokens[1].split(" "):
                    # Field 4 of a "$"-separated symbol is its emotion tag.
                    emotion = this_symbol_sequence.split("$")[4]
                    this_symbol_sequence = this_symbol_sequence.replace(
                        emotion, "emotion_neutral"
                    )

                    if idx < len(fp_labels):
                        if (
                            fp_labels[idx] == "FP"
                            and "none" not in this_symbol_sequence
                        ):
                            this_symbol_sequence = this_symbol_sequence.replace(
                                "emotion_neutral", "emotion_disgust"
                            )
                        # Field 2 marks the position inside the syllable;
                        # move to the next label once a syllable ends.
                        syllable_label = this_symbol_sequence.split("$")[2]
                        if syllable_label in ("s_both", "s_end"):
                            idx += 1
                    elif idx > len(fp_labels):
                        # NOTE(review): idx only grows while it is below the
                        # label count, so this branch looks unreachable; the
                        # disabled check below suggests a length comparison
                        # was intended -- confirm.
                        logging.warning(uttname + " not match")
                        error_flag = True
                    out_symbols.append(this_symbol_sequence)

                # if idx != len(fp_label_dict[uttname]):
                #     logging.warning(
                #         "{} length mismatch, length: {} ".format(
                #             idx, len(fp_label_dict[uttname])
                #         )
                #     )

                if not error_flag:
                    out_str = uttname + "\t" + " ".join(out_symbols)
                    f_out.write(out_str.strip() + "\n")

        return fpadd_metafile

    def removefp(self, voice_output_dir, fpadd_metafile, raw_metafile_lines):
        """Write ``fprm_metafile.txt`` with the filled-pause symbols removed.

        Each raw symbol whose counterpart in ``fpadd_metafile`` contains
        ``$emotion_disgust`` is dropped, together with the following symbol
        when that one contains ``none``.

        Args:
            voice_output_dir: Directory receiving ``fprm_metafile.txt``.
            fpadd_metafile: Path of the file written by ``addfp`` for the
                same ``raw_metafile_lines``.
            raw_metafile_lines: Lines of the raw metafile, in the order they
                were passed to ``addfp``.
        """
        with open(fpadd_metafile) as f:
            fpadd_metafile_lines = f.readlines()

        fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt")
        # addfp writes nothing for lines it skips, so the two files cannot be
        # paired by index; walk the fpadd lines with their own cursor.
        fpadd_pos = 0
        with open(fprm_metafile, "w") as f_out:
            for raw_line in raw_metafile_lines:
                tokens = raw_line.strip().split("\t")
                if len(tokens) != 2:
                    continue
                if fpadd_pos >= len(fpadd_metafile_lines):
                    break
                fpadd_tokens = fpadd_metafile_lines[fpadd_pos].strip().split("\t")
                if len(fpadd_tokens) != 2 or fpadd_tokens[0] != tokens[0]:
                    logging.warning(tokens[0] + " has no fpadd entry, skipped")
                    continue
                fpadd_pos += 1

                symbol_sequences = tokens[1].split(" ")
                fpadd_symbol_sequences = fpadd_tokens[1].split(" ")

                kept_symbols = []
                idx = 0
                length = len(symbol_sequences)
                while idx < length:
                    if "$emotion_disgust" in fpadd_symbol_sequences[idx]:
                        if (
                            idx + 1 < length
                            and "none" in fpadd_symbol_sequences[idx + 1]
                        ):
                            idx += 2
                        else:
                            idx += 1
                        continue
                    kept_symbols.append(symbol_sequences[idx])
                    idx += 1

                out_str = tokens[0] + "\t" + " ".join(kept_symbols)
                f_out.write(out_str.strip() + "\n")

    def process(self, voice_output_dir, prosody, raw_metafile):
        """Shuffle the raw metafile lines and write both FP metafiles.

        Args:
            voice_output_dir: Directory receiving the generated metafiles.
            prosody: Path of the prosody file.
            raw_metafile: Path of the raw metafile.
        """
        with open(raw_metafile, "r") as f:
            lines = f.readlines()
        random.shuffle(lines)

        fpadd_metafile = self.addfp(voice_output_dir, prosody, lines)
        self.removefp(voice_output_dir, fpadd_metafile, lines)
<?xml version="1.0" encoding="utf-8"?>
<phoneSet xmlns="http://schemas.alibaba-inc.com/tts">
<phone>
<id>0</id>
<name>a_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>1</id>
<name>ai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>2</id>
<name>an_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>3</id>
<name>ang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>4</id>
<name>ao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>5</id>
<name>b_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>6</id>
<name>c_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>7</id>
<name>ch_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>8</id>
<name>d_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>9</id>
<name>e_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>10</id>
<name>ei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>11</id>
<name>en_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>12</id>
<name>eng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>13</id>
<name>er_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>14</id>
<name>f_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>15</id>
<name>g_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>16</id>
<name>h_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>17</id>
<name>i_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>18</id>
<name>ia_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>19</id>
<name>ian_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>20</id>
<name>iang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>21</id>
<name>iao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>22</id>
<name>ie_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>23</id>
<name>ih_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>24</id>
<name>ii_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>25</id>
<name>in_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>26</id>
<name>ing_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>27</id>
<name>io_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>28</id>
<name>iong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>29</id>
<name>iou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>30</id>
<name>j_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>31</id>
<name>k_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>32</id>
<name>l_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>33</id>
<name>m_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>34</id>
<name>n_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>35</id>
<name>o_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>36</id>
<name>ong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>37</id>
<name>ou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>38</id>
<name>p_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>39</id>
<name>q_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>40</id>
<name>r_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>41</id>
<name>s_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>42</id>
<name>sh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>43</id>
<name>t_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>44</id>
<name>u_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>45</id>
<name>ua_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>46</id>
<name>uai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>47</id>
<name>uan_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>48</id>
<name>uang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>49</id>
<name>uei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>50</id>
<name>uen_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>51</id>
<name>ueng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>52</id>
<name>uo_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>53</id>
<name>v_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>54</id>
<name>van_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>55</id>
<name>ve_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>56</id>
<name>vn_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>57</id>
<name>xx_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>58</id>
<name>z_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>59</id>
<name>zh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>60</id>
<name>w_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>61</id>
<name>y_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>62</id>
<name>ga</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>63</id>
<name>ge</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>64</id>
<name>go</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>65</id>
<name>aa</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>66</id>
<name>ae</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>67</id>
<name>ah</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>68</id>
<name>ao</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>69</id>
<name>aw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>70</id>
<name>ay</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>71</id>
<name>b</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>72</id>
<name>ch</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>73</id>
<name>d</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>74</id>
<name>dh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>75</id>
<name>eh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>76</id>
<name>er</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>77</id>
<name>ey</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>78</id>
<name>f</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>79</id>
<name>g</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>80</id>
<name>hh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>81</id>
<name>ih</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>82</id>
<name>iy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>83</id>
<name>jh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>84</id>
<name>k</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>85</id>
<name>l</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>86</id>
<name>m</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>nasal</am>
</phone>
<phone>
<id>87</id>
<name>n</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>nasal</am>
</phone>
<phone>
<id>88</id>
<name>ng</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>89</id>
<name>ow</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>90</id>
<name>oy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>91</id>
<name>p</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>92</id>
<name>r</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>93</id>
<name>s</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>94</id>
<name>sh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>95</id>
<name>t</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>96</id>
<name>th</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>97</id>
<name>uh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>98</id>
<name>uw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>99</id>
<name>v</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>100</id>
<name>w</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>101</id>
<name>y</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>102</id>
<name>z</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>103</id>
<name>zh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>104</id>
<name>air_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>105</id>
<name>angr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>106</id>
<name>anr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>107</id>
<name>aor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>108</id>
<name>ar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>109</id>
<name>eir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>110</id>
<name>engr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>111</id>
<name>enr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>112</id>
<name>iangr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>113</id>
<name>ianr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>114</id>
<name>iaor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>115</id>
<name>iar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>116</id>
<name>ier_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>117</id>
<name>ihr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>118</id>
<name>iir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>119</id>
<name>ingr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>120</id>
<name>inr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>121</id>
<name>iongr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>122</id>
<name>iour_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>123</id>
<name>ir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>124</id>
<name>ongr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>125</id>
<name>or_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>126</id>
<name>our_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>127</id>
<name>uair_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>128</id>
<name>uangr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>129</id>
<name>uanr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>130</id>
<name>uar_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>131</id>
<name>ueir_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>132</id>
<name>uenr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>133</id>
<name>uor_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>134</id>
<name>ur_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>135</id>
<name>vanr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>136</id>
<name>ver_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>137</id>
<name>vnr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>138</id>
<name>vr_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>146</id>
<name>pau</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
</phoneSet>
<?xml version="1.0" encoding="utf-8"?>
<posSet xmlns="http://schemas.alibaba-inc.com/tts">
<pos>
<id>1</id>
<name>a</name>
<desc>todo</desc>
</pos>
<pos>
<id>2</id>
<name>b</name>
<desc>todo</desc>
</pos>
<pos>
<id>3</id>
<name>c</name>
<desc>todo</desc>
</pos>
<pos>
<id>4</id>
<name>d</name>
<desc>todo</desc>
</pos>
<pos>
<id>5</id>
<name>e</name>
<desc>todo</desc>
</pos>
<pos>
<id>6</id>
<name>f</name>
<desc>todo</desc>
</pos>
<pos>
<id>7</id>
<name>g</name>
<desc>todo</desc>
<sub>
<pos>
<id>8</id>
<name>gb</name>
<desc>todo</desc>
</pos>
</sub>
</pos>
<pos>
<id>9</id>
<name>h</name>
<desc>todo</desc>
</pos>
<pos>
<id>10</id>
<name>i</name>
<desc>todo</desc>
</pos>
<pos>
<id>11</id>
<name>j</name>
<desc>todo</desc>
</pos>
<pos>
<id>12</id>
<name>k</name>
<desc>todo</desc>
</pos>
<pos>
<id>13</id>
<name>l</name>
<desc>todo</desc>
</pos>
<pos>
<id>14</id>
<name>m</name>
<desc>todo</desc>
</pos>
<pos>
<id>15</id>
<name>n</name>
<desc>todo</desc>
<sub>
<pos>
<id>16</id>
<name>nz</name>
<desc>todo</desc>
</pos>
</sub>
</pos>
<pos>
<id>17</id>
<name>o</name>
<desc>todo</desc>
</pos>
<pos>
<id>18</id>
<name>p</name>
<desc>todo</desc>
</pos>
<pos>
<id>19</id>
<name>q</name>
<desc>todo</desc>
</pos>
<pos>
<id>20</id>
<name>r</name>
<desc>todo</desc>
</pos>
<pos>
<id>21</id>
<name>s</name>
<desc>todo</desc>
</pos>
<pos>
<id>22</id>
<name>t</name>
<desc>todo</desc>
</pos>
<pos>
<id>23</id>
<name>u</name>
<desc>todo</desc>
</pos>
<pos>
<id>24</id>
<name>v</name>
<desc>todo</desc>
</pos>
<pos>
<id>25</id>
<name>w</name>
<desc>todo</desc>
</pos>
<pos>
<id>26</id>
<name>x</name>
<desc>todo</desc>
</pos>
<pos>
<id>27</id>
<name>y</name>
<desc>todo</desc>
</pos>
<pos>
<id>28</id>
<name>z</name>
<desc>todo</desc>
</pos>
</posSet>
a ga a_c
ai ga ai_c
an ga an_c
ang ga ang_c
ao ga ao_c
ba b_c a_c
bai b_c ai_c
ban b_c an_c
bang b_c ang_c
bao b_c ao_c
bei b_c ei_c
ben b_c en_c
beng b_c eng_c
bi b_c i_c
bian b_c ian_c
biao b_c iao_c
bie b_c ie_c
bin b_c in_c
bing b_c ing_c
bo b_c o_c
bu b_c u_c
ca c_c a_c
cai c_c ai_c
can c_c an_c
cang c_c ang_c
cao c_c ao_c
ce c_c e_c
cen c_c en_c
ceng c_c eng_c
cha ch_c a_c
chai ch_c ai_c
chan ch_c an_c
chang ch_c ang_c
chao ch_c ao_c
che ch_c e_c
chen ch_c en_c
cheng ch_c eng_c
chi ch_c ih_c
chong ch_c ong_c
chou ch_c ou_c
chu ch_c u_c
chua ch_c ua_c
chuai ch_c uai_c
chuan ch_c uan_c
chuang ch_c uang_c
chui ch_c uei_c
chun ch_c uen_c
chuo ch_c uo_c
ci c_c ii_c
cong c_c ong_c
cou c_c ou_c
cu c_c u_c
cuan c_c uan_c
cui c_c uei_c
cun c_c uen_c
cuo c_c uo_c
da d_c a_c
dai d_c ai_c
dan d_c an_c
dang d_c ang_c
dao d_c ao_c
de d_c e_c
dei d_c ei_c
den d_c en_c
deng d_c eng_c
di d_c i_c
dia d_c ia_c
dian d_c ian_c
diao d_c iao_c
die d_c ie_c
ding d_c ing_c
diu d_c iou_c
dong d_c ong_c
dou d_c ou_c
du d_c u_c
duan d_c uan_c
dui d_c uei_c
dun d_c uen_c
duo d_c uo_c
e ge e_c
ei ge ei_c
en ge en_c
eng ge eng_c
er ge er_c
fa f_c a_c
fan f_c an_c
fang f_c ang_c
fei f_c ei_c
fen f_c en_c
feng f_c eng_c
fo f_c o_c
fou f_c ou_c
fu f_c u_c
ga g_c a_c
gai g_c ai_c
gan g_c an_c
gang g_c ang_c
gao g_c ao_c
ge g_c e_c
gei g_c ei_c
gen g_c en_c
geng g_c eng_c
gong g_c ong_c
gou g_c ou_c
gu g_c u_c
gua g_c ua_c
guai g_c uai_c
guan g_c uan_c
guang g_c uang_c
gui g_c uei_c
gun g_c uen_c
guo g_c uo_c
ha h_c a_c
hai h_c ai_c
han h_c an_c
hang h_c ang_c
hao h_c ao_c
he h_c e_c
hei h_c ei_c
hen h_c en_c
heng h_c eng_c
hong h_c ong_c
hou h_c ou_c
hu h_c u_c
hua h_c ua_c
huai h_c uai_c
huan h_c uan_c
huang h_c uang_c
hui h_c uei_c
hun h_c uen_c
huo h_c uo_c
ji j_c i_c
jia j_c ia_c
jian j_c ian_c
jiang j_c iang_c
jiao j_c iao_c
jie j_c ie_c
jin j_c in_c
jing j_c ing_c
jiong j_c iong_c
jiu j_c iou_c
jv j_c v_c
jvan j_c van_c
jve j_c ve_c
jvn j_c vn_c
ka k_c a_c
kai k_c ai_c
kan k_c an_c
kang k_c ang_c
kao k_c ao_c
ke k_c e_c
kei k_c ei_c
ken k_c en_c
keng k_c eng_c
kong k_c ong_c
kou k_c ou_c
ku k_c u_c
kua k_c ua_c
kuai k_c uai_c
kuan k_c uan_c
kuang k_c uang_c
kui k_c uei_c
kun k_c uen_c
kuo k_c uo_c
la l_c a_c
lai l_c ai_c
lan l_c an_c
lang l_c ang_c
lao l_c ao_c
le l_c e_c
lei l_c ei_c
leng l_c eng_c
li l_c i_c
lia l_c ia_c
lian l_c ian_c
liang l_c iang_c
liao l_c iao_c
lie l_c ie_c
lin l_c in_c
ling l_c ing_c
liu l_c iou_c
lo l_c o_c
long l_c ong_c
lou l_c ou_c
lu l_c u_c
luan l_c uan_c
lun l_c uen_c
luo l_c uo_c
lv l_c v_c
lve l_c ve_c
ma m_c a_c
mai m_c ai_c
man m_c an_c
mang m_c ang_c
mao m_c ao_c
me m_c e_c
mei m_c ei_c
men m_c en_c
meng m_c eng_c
mi m_c i_c
mian m_c ian_c
miao m_c iao_c
mie m_c ie_c
min m_c in_c
ming m_c ing_c
miu m_c iou_c
mo m_c o_c
mou m_c ou_c
mu m_c u_c
na n_c a_c
nai n_c ai_c
nan n_c an_c
nang n_c ang_c
nao n_c ao_c
ne n_c e_c
nei n_c ei_c
nen n_c en_c
neng n_c eng_c
ni n_c i_c
nian n_c ian_c
niang n_c iang_c
niao n_c iao_c
nie n_c ie_c
nin n_c in_c
ning n_c ing_c
niu n_c iou_c
nong n_c ong_c
nou n_c ou_c
nu n_c u_c
nuan n_c uan_c
nun n_c uen_c
nuo n_c uo_c
nv n_c v_c
nve n_c ve_c
o go o_c
ou go ou_c
pa p_c a_c
pai p_c ai_c
pan p_c an_c
pang p_c ang_c
pao p_c ao_c
pei p_c ei_c
pen p_c en_c
peng p_c eng_c
pi p_c i_c
pian p_c ian_c
piao p_c iao_c
pie p_c ie_c
pin p_c in_c
ping p_c ing_c
po p_c o_c
pou p_c ou_c
pu p_c u_c
qi q_c i_c
qia q_c ia_c
qian q_c ian_c
qiang q_c iang_c
qiao q_c iao_c
qie q_c ie_c
qin q_c in_c
qing q_c ing_c
qiong q_c iong_c
qiu q_c iou_c
qv q_c v_c
qvan q_c van_c
qve q_c ve_c
qvn q_c vn_c
ran r_c an_c
rang r_c ang_c
rao r_c ao_c
re r_c e_c
ren r_c en_c
reng r_c eng_c
ri r_c ih_c
rong r_c ong_c
rou r_c ou_c
ru r_c u_c
ruan r_c uan_c
rui r_c uei_c
run r_c uen_c
ruo r_c uo_c
sa s_c a_c
sai s_c ai_c
san s_c an_c
sang s_c ang_c
sao s_c ao_c
se s_c e_c
sen s_c en_c
seng s_c eng_c
sha sh_c a_c
shai sh_c ai_c
shan sh_c an_c
shang sh_c ang_c
shao sh_c ao_c
she sh_c e_c
shei sh_c ei_c
shen sh_c en_c
sheng sh_c eng_c
shi sh_c ih_c
shou sh_c ou_c
shu sh_c u_c
shua sh_c ua_c
shuai sh_c uai_c
shuan sh_c uan_c
shuang sh_c uang_c
shui sh_c uei_c
shun sh_c uen_c
shuo sh_c uo_c
si s_c ii_c
song s_c ong_c
sou s_c ou_c
su s_c u_c
suan s_c uan_c
sui s_c uei_c
sun s_c uen_c
suo s_c uo_c
ta t_c a_c
tai t_c ai_c
tan t_c an_c
tang t_c ang_c
tao t_c ao_c
te t_c e_c
tei t_c ei_c
teng t_c eng_c
ti t_c i_c
tian t_c ian_c
tiao t_c iao_c
tie t_c ie_c
ting t_c ing_c
tong t_c ong_c
tou t_c ou_c
tu t_c u_c
tuan t_c uan_c
tui t_c uei_c
tun t_c uen_c
tuo t_c uo_c
wa w_c a_c
wai w_c ai_c
wan w_c an_c
wang w_c ang_c
wei w_c ei_c
wen w_c en_c
weng w_c eng_c
wo w_c o_c
wu w_c u_c
xi xx_c i_c
xia xx_c ia_c
xian xx_c ian_c
xiang xx_c iang_c
xiao xx_c iao_c
xie xx_c ie_c
xin xx_c in_c
xing xx_c ing_c
xiong xx_c iong_c
xiu xx_c iou_c
xv xx_c v_c
xvan xx_c van_c
xve xx_c ve_c
xvn xx_c vn_c
ya y_c a_c
yan y_c an_c
yang y_c ang_c
yao y_c ao_c
ye y_c e_c
yi y_c i_c
yin y_c in_c
ying y_c ing_c
yo y_c o_c
yong y_c ong_c
you y_c ou_c
yv y_c v_c
yvan y_c van_c
yve y_c ve_c
yvn y_c vn_c
za z_c a_c
zai z_c ai_c
zan z_c an_c
zang z_c ang_c
zao z_c ao_c
ze z_c e_c
zei z_c ei_c
zen z_c en_c
zeng z_c eng_c
zha zh_c a_c
zhai zh_c ai_c
zhan zh_c an_c
zhang zh_c ang_c
zhao zh_c ao_c
zhe zh_c e_c
zhei zh_c ei_c
zhen zh_c en_c
zheng zh_c eng_c
zhi zh_c ih_c
zhong zh_c ong_c
zhou zh_c ou_c
zhu zh_c u_c
zhua zh_c ua_c
zhuai zh_c uai_c
zhuan zh_c uan_c
zhuang zh_c uang_c
zhui zh_c uei_c
zhun zh_c uen_c
zhuo zh_c uo_c
zi z_c ii_c
zong z_c ong_c
zou z_c ou_c
zu z_c u_c
zuan z_c uan_c
zui z_c uei_c
zun z_c uen_c
zuo z_c uo_c
bangr b_c angr_c
banr b_c anr_c
baor b_c aor_c
bar b_c ar_c
beir b_c eir_c
bengr b_c engr_c
benr b_c enr_c
bianr b_c ianr_c
biaor b_c iaor_c
bingr b_c ingr_c
bir b_c ir_c
bor b_c or_c
bur b_c ur_c
caor c_c aor_c
car c_c ar_c
changr ch_c angr_c
chaor ch_c aor_c
char ch_c ar_c
chengr ch_c engr_c
cher ch_c er_c
chir ch_c ihr_c
chongr ch_c ongr_c
chour ch_c our_c
chuangr ch_c uangr_c
chuanr ch_c uanr_c
chuir ch_c ueir_c
chunr ch_c uenr_c
chuor ch_c uor_c
chur ch_c ur_c
cir c_c iir_c
congr c_c ongr_c
cuir c_c ueir_c
cunr c_c uenr_c
cuor c_c uor_c
dair d_c air_c
danr d_c anr_c
dangr d_c angr_c
daor d_c aor_c
dengr d_c engr_c
dianr d_c ianr_c
diaor d_c iaor_c
dier d_c ier_c
dingr d_c ingr_c
dir d_c ir_c
dongr d_c ongr_c
dour d_c our_c
duanr d_c uanr_c
duir d_c ueir_c
dunr d_c uenr_c
duor d_c uor_c
dur d_c ur_c
fangr f_c angr_c
fanr f_c anr_c
far f_c ar_c
fengr f_c engr_c
fenr f_c enr_c
fur f_c ur_c
gair g_c air_c
ganr g_c anr_c
gaor g_c aor_c
gengr g_c engr_c
genr g_c enr_c
ger g_c er_c
gongr g_c ongr_c
gour g_c our_c
guair g_c uair_c
guanr g_c uanr_c
guar g_c uar_c
guir g_c ueir_c
gunr g_c uenr_c
guor g_c uor_c
gur g_c ur_c
hair h_c air_c
hanr h_c anr_c
haor h_c aor_c
heir h_c eir_c
her h_c er_c
hour h_c our_c
huanr h_c uanr_c
huangr h_c uangr_c
huar h_c uar_c
huir h_c ueir_c
hunr h_c uenr_c
huor h_c uor_c
hur h_c ur_c
jianr j_c ianr_c
jiaor j_c iaor_c
jiar j_c iar_c
jier j_c ier_c
jingr j_c ingr_c
jinr j_c inr_c
jir j_c ir_c
jiur j_c iour_c
jvanr j_c vanr_c
jver j_c ver_c
jvnr j_c vnr_c
kair k_c air_c
kanr k_c anr_c
kaor k_c aor_c
kengr k_c engr_c
ker k_c er_c
kongr k_c ongr_c
kour k_c our_c
kuair k_c uair_c
kuangr k_c uangr_c
kuanr k_c uanr_c
kunr k_c uenr_c
lanr l_c anr_c
laor l_c aor_c
lar l_c ar_c
leir l_c eir_c
lengr l_c engr_c
ler l_c er_c
liangr l_c iangr_c
lianr l_c ianr_c
liaor l_c iaor_c
liar l_c iar_c
lingr l_c ingr_c
lir l_c ir_c
liur l_c iour_c
lour l_c our_c
luor l_c uor_c
lunr l_c uenr_c
lur l_c ur_c
lvr l_c vr_c
mair m_c air_c
manr m_c anr_c
mangr m_c angr_c
maor m_c aor_c
mar m_c ar_c
meir m_c eir_c
menr m_c enr_c
mianr m_c ianr_c
miaor m_c iaor_c
mingr m_c ingr_c
mir m_c ir_c
mor m_c or_c
naor n_c aor_c
nar n_c ar_c
niangr n_c iangr_c
nianr n_c ianr_c
niaor n_c iaor_c
ningr n_c ingr_c
nir n_c ir_c
niur n_c iour_c
nvr n_c vr_c
pair p_c air_c
pangr p_c angr_c
panr p_c anr_c
paor p_c aor_c
penr p_c enr_c
pianr p_c ianr_c
piaor p_c iaor_c
pier p_c ier_c
pingr p_c ingr_c
pir p_c ir_c
por p_c or_c
pur p_c ur_c
qianr q_c ianr_c
qiaor q_c iaor_c
qingr q_c ingr_c
qir q_c ir_c
qiur q_c iour_c
qvanr q_c vanr_c
qvnr q_c vnr_c
qvr q_c vr_c
sar s_c ar_c
rangr r_c angr_c
renr r_c enr_c
sair s_c air_c
sanr s_c anr_c
shair sh_c air_c
shaor sh_c aor_c
shengr sh_c engr_c
shenr sh_c enr_c
shir sh_c ihr_c
shuair sh_c uair_c
shour sh_c our_c
shuar sh_c uar_c
shuir sh_c ueir_c
shunr sh_c uenr_c
shuor sh_c uor_c
shur sh_c ur_c
sir s_c iir_c
suir s_c ueir_c
sunr s_c uenr_c
tair t_c air_c
tangr t_c angr_c
tanr t_c anr_c
taor t_c aor_c
ter t_c er_c
tianr t_c ianr_c
tiaor t_c iaor_c
tir t_c ir_c
tingr t_c ingr_c
tongr t_c ongr_c
tour t_c our_c
tuanr t_c uanr_c
tuir t_c ueir_c
tuor t_c uor_c
tur t_c ur_c
wanr w_c anr_c
war w_c ar_c
weir w_c eir_c
wenr w_c enr_c
wengr w_c engr_c
wor w_c or_c
wur w_c ur_c
xiangr xx_c iangr_c
xianr xx_c ianr_c
xiar xx_c iar_c
xier xx_c ier_c
xingr xx_c ingr_c
xir xx_c ir_c
xinr xx_c inr_c
xiongr xx_c iongr_c
xiur xx_c iour_c
yangr y_c angr_c
yanr y_c anr_c
yaor y_c aor_c
yar y_c ar_c
yer y_c er_c
yingr y_c ingr_c
yinr y_c inr_c
yir y_c ir_c
your y_c our_c
yvanr y_c vanr_c
zair z_c air_c
yvr y_c vr_c
yver y_c ver_c
zaor z_c aor_c
zar z_c ar_c
zhangr zh_c angr_c
zhanr zh_c anr_c
zhaor zh_c aor_c
zhar zh_c ar_c
zhenr zh_c enr_c
zher zh_c er_c
zhir zh_c ihr_c
zhongr zh_c ongr_c
zhour zh_c our_c
zhuar zh_c uar_c
zhuanr zh_c uanr_c
zhunr zh_c uenr_c
zhuor zh_c uor_c
zhur zh_c ur_c
zir z_c iir_c
zuanr z_c uanr_c
zuir z_c ueir_c
zuor z_c uor_c
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?>
<phoneSet xmlns="http://schemas.alibaba-inc.com/tts">
<phone>
<id>0</id>
<name>a_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>1</id>
<name>ai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>2</id>
<name>an_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>3</id>
<name>ang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>4</id>
<name>ao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>5</id>
<name>b_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>6</id>
<name>c_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>7</id>
<name>ch_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>8</id>
<name>d_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>9</id>
<name>e_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>10</id>
<name>ei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>11</id>
<name>en_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>12</id>
<name>eng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>13</id>
<name>er_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>14</id>
<name>f_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>15</id>
<name>g_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>16</id>
<name>h_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>17</id>
<name>i_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>18</id>
<name>ia_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>19</id>
<name>ian_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>20</id>
<name>iang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>21</id>
<name>iao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>22</id>
<name>ie_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>23</id>
<name>ih_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>24</id>
<name>ii_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>25</id>
<name>in_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>26</id>
<name>ing_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>27</id>
<name>ioo_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>28</id>
<name>iong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>29</id>
<name>iou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>30</id>
<name>j_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>31</id>
<name>k_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>32</id>
<name>l_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>33</id>
<name>m_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>34</id>
<name>n_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>35</id>
<name>o_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>36</id>
<name>ong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>37</id>
<name>ou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>38</id>
<name>p_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>39</id>
<name>q_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>40</id>
<name>r_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>41</id>
<name>s_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>42</id>
<name>sh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>43</id>
<name>t_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>44</id>
<name>u_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>45</id>
<name>ua_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>46</id>
<name>uai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>47</id>
<name>uan_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>48</id>
<name>uang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>49</id>
<name>uei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>50</id>
<name>uen_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>51</id>
<name>ueng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>52</id>
<name>uo_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>53</id>
<name>v_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>54</id>
<name>van_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>55</id>
<name>ve_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>56</id>
<name>vn_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>57</id>
<name>xx_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>58</id>
<name>z_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>59</id>
<name>zh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>60</id>
<name>w_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>61</id>
<name>y_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>62</id>
<name>ng_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>63</id>
<name>iai_c</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>64</id>
<name>io_c</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>65</id>
<name>ue_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>66</id>
<name>ga</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>67</id>
<name>ge</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>68</id>
<name>go</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>69</id>
<name>aa</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>70</id>
<name>ae</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>71</id>
<name>ah</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>72</id>
<name>ao</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>73</id>
<name>aw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>74</id>
<name>ay</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>75</id>
<name>b</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>76</id>
<name>ch</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>77</id>
<name>d</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>78</id>
<name>dh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>79</id>
<name>eh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>80</id>
<name>er</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>81</id>
<name>ey</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>82</id>
<name>f</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>83</id>
<name>g</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>84</id>
<name>hh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>85</id>
<name>ih</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>86</id>
<name>iy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>87</id>
<name>jh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>88</id>
<name>k</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>89</id>
<name>l</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>90</id>
<name>m</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>nasal</am>
</phone>
<phone>
<id>91</id>
<name>n</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>nasal</am>
</phone>
<phone>
<id>92</id>
<name>ng</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>93</id>
<name>ow</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>94</id>
<name>oy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>95</id>
<name>p</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>96</id>
<name>r</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>97</id>
<name>s</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>98</id>
<name>sh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>99</id>
<name>t</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>100</id>
<name>th</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>101</id>
<name>uh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>102</id>
<name>uw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>103</id>
<name>v</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>104</id>
<name>w</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>105</id>
<name>y</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>106</id>
<name>z</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>107</id>
<name>zh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>146</id>
<name>pau</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
</phoneSet>
<?xml version="1.0" encoding="utf-8"?>
<posSet xmlns="http://schemas.alibaba-inc.com/tts">
<pos>
<id>1</id>
<name>a</name>
<desc>todo</desc>
</pos>
<pos>
<id>2</id>
<name>b</name>
<desc>todo</desc>
</pos>
<pos>
<id>3</id>
<name>c</name>
<desc>todo</desc>
</pos>
<pos>
<id>4</id>
<name>d</name>
<desc>todo</desc>
</pos>
<pos>
<id>5</id>
<name>e</name>
<desc>todo</desc>
</pos>
<pos>
<id>6</id>
<name>f</name>
<desc>todo</desc>
</pos>
<pos>
<id>7</id>
<name>g</name>
<desc>todo</desc>
<sub>
<pos>
<id>8</id>
<name>gb</name>
<desc>todo</desc>
</pos>
</sub>
</pos>
<pos>
<id>9</id>
<name>h</name>
<desc>todo</desc>
</pos>
<pos>
<id>10</id>
<name>i</name>
<desc>todo</desc>
</pos>
<pos>
<id>11</id>
<name>j</name>
<desc>todo</desc>
</pos>
<pos>
<id>12</id>
<name>k</name>
<desc>todo</desc>
</pos>
<pos>
<id>13</id>
<name>l</name>
<desc>todo</desc>
</pos>
<pos>
<id>14</id>
<name>m</name>
<desc>todo</desc>
</pos>
<pos>
<id>15</id>
<name>n</name>
<desc>todo</desc>
<sub>
<pos>
<id>16</id>
<name>nz</name>
<desc>todo</desc>
</pos>
</sub>
</pos>
<pos>
<id>17</id>
<name>o</name>
<desc>todo</desc>
</pos>
<pos>
<id>18</id>
<name>p</name>
<desc>todo</desc>
</pos>
<pos>
<id>19</id>
<name>q</name>
<desc>todo</desc>
</pos>
<pos>
<id>20</id>
<name>r</name>
<desc>todo</desc>
</pos>
<pos>
<id>21</id>
<name>s</name>
<desc>todo</desc>
</pos>
<pos>
<id>22</id>
<name>t</name>
<desc>todo</desc>
</pos>
<pos>
<id>23</id>
<name>u</name>
<desc>todo</desc>
</pos>
<pos>
<id>24</id>
<name>v</name>
<desc>todo</desc>
</pos>
<pos>
<id>25</id>
<name>w</name>
<desc>todo</desc>
</pos>
<pos>
<id>26</id>
<name>x</name>
<desc>todo</desc>
</pos>
<pos>
<id>27</id>
<name>y</name>
<desc>todo</desc>
</pos>
<pos>
<id>28</id>
<name>z</name>
<desc>todo</desc>
</pos>
</posSet>
a ga a_c
ai ga ai_c
an ga an_c
ao ga ao_c
e ge e_c
er ge er_c
o go o_c
ong go ong_c
ba b_c a_c
bai b_c ai_c
ban b_c an_c
bang b_c ang_c
bao b_c ao_c
be b_c e_c
bei b_c ei_c
ben b_c en_c
bi b_c i_c
bia b_c ia_c
bian b_c ian_c
biao b_c iao_c
bie b_c ie_c
bin b_c in_c
bo b_c o_c
bong b_c ong_c
bu b_c u_c
ca c_c a_c
cai c_c ai_c
can c_c an_c
cang c_c ang_c
cao c_c ao_c
ce c_c e_c
cen c_c en_c
ceng c_c eng_c
ci c_c ii_c
co c_c o_c
cong c_c ong_c
cou c_c ou_c
cu c_c u_c
cuai c_c uai_c
cuan c_c uan_c
cuang c_c uang_c
cui c_c uei_c
cun c_c uen_c
da d_c a_c
dai d_c ai_c
dan d_c an_c
dang d_c ang_c
dao d_c ao_c
de d_c e_c
dei d_c ei_c
den d_c en_c
deng d_c eng_c
di d_c i_c
dian d_c ian_c
diao d_c iao_c
die d_c ie_c
din d_c in_c
ding d_c ing_c
diu d_c iou_c
do d_c o_c
dong d_c ong_c
dou d_c ou_c
du d_c u_c
duan d_c uan_c
dui d_c uei_c
dun d_c uen_c
fa f_c a_c
fai f_c ai_c
fan f_c an_c
fang f_c ang_c
fei f_c ei_c
fen f_c en_c
feng f_c eng_c
fong f_c ong_c
fu f_c u_c
ga g_c a_c
gai g_c ai_c
gan g_c an_c
gang g_c ang_c
gao g_c ao_c
gua g_c ua_c
ge g_c e_c
gen g_c en_c
go g_c o_c
gon g_c iai_c
gong g_c ong_c
gou g_c ou_c
gu g_c u_c
guai g_c uai_c
guan g_c uan_c
guang g_c uang_c
gui g_c uei_c
gun g_c uen_c
ha h_c a_c
hai h_c ai_c
han h_c an_c
hang h_c ang_c
hao h_c ao_c
he h_c e_c
hen h_c en_c
ho h_c o_c
hong h_c ong_c
hou h_c ou_c
hu h_c u_c
hua h_c ua_c
huai h_c uai_c
huan h_c uan_c
huang h_c uang_c
hui h_c uei_c
hun h_c uen_c
huo h_c uo_c
ji j_c i_c
jia j_c ia_c
jiai j_c ia_c
jian j_c ian_c
jiang j_c iang_c
jiao j_c iao_c
jie j_c ie_c
jin j_c in_c
jiu j_c iou_c
ju j_c u_c
juan j_c van_c
jue j_c ve_c
juo j_c uo_c
ka k_c a_c
kai k_c ai_c
kan k_c an_c
kang k_c ang_c
kao k_c ao_c
ke k_c e_c
ken k_c en_c
ko k_c o_c
kong k_c ong_c
kou k_c ou_c
ku k_c u_c
kua k_c ua_c
kuai k_c uai_c
kuan k_c uan_c
kuang k_c uang_c
kue k_c ve_c
kui k_c uei_c
kun k_c uen_c
la l_c a_c
na n_c a_c
lai l_c ai_c
nai n_c ai_c
lan l_c an_c
nan n_c an_c
lang l_c ang_c
nang n_c ang_c
lao l_c ao_c
nao n_c ao_c
len l_c en_c
nen n_c en_c
li l_c i_c
ni n_c i_c
lian l_c ian_c
nian n_c ian_c
liang l_c iang_c
niang n_c iang_c
liao l_c iao_c
niao n_c iao_c
lie l_c ie_c
nie n_c ie_c
lin l_c in_c
nin n_c in_c
liu l_c iou_c
niu n_c iou_c
lo l_c o_c
no n_c o_c
long l_c ong_c
nong n_c ong_c
lou l_c ou_c
nou n_c ou_c
lu l_c u_c
nu n_c u_c
luan l_c uan_c
nuan n_c uan_c
lue l_c ve_c
nue n_c ve_c
lui l_c uei_c
nui n_c uei_c
lun l_c uen_c
nun n_c uen_c
luo l_c uo_c
nuo n_c uo_c
lv l_c v_c
nv n_c v_c
ma m_c a_c
mai m_c ai_c
man m_c an_c
mang m_c ang_c
mao m_c ao_c
me m_c e_c
mei m_c ei_c
men m_c en_c
meng m_c eng_c
mi m_c i_c
mian m_c ian_c
miao m_c iao_c
mie m_c ie_c
min m_c in_c
mo m_c o_c
mong m_c ong_c
mu m_c u_c
ne n_c e_c
nei n_c ei_c
pa p_c a_c
pai p_c ai_c
pan p_c an_c
pang p_c ang_c
pao p_c ao_c
pe p_c e_c
pei p_c ei_c
pen p_c en_c
peng p_c eng_c
pi p_c i_c
pian p_c ian_c
piao p_c iao_c
pie p_c ie_c
pin p_c in_c
po p_c o_c
pong p_c ong_c
pu p_c u_c
qi q_c i_c
qia q_c ia_c
qian q_c ian_c
qiang q_c iang_c
qiao q_c iao_c
qie q_c ie_c
qin q_c in_c
qing q_c ing_c
qiong q_c iong_c
qiu q_c iou_c
qu q_c u_c
quan q_c van_c
que q_c ve_c
qun q_c vn_c
quo q_c uo_c
ran r_c an_c
rang r_c ang_c
rao r_c ao_c
re r_c e_c
ren r_c en_c
ri r_c ih_c
rong r_c ong_c
rou r_c ou_c
ru r_c u_c
rua r_c ua_c
ruan r_c uan_c
sa s_c a_c
sai s_c ai_c
san s_c an_c
sang s_c ang_c
sao s_c ao_c
se s_c e_c
sen s_c en_c
si s_c ii_c
so s_c o_c
song s_c ong_c
sou s_c ou_c
su s_c u_c
sua s_c ua_c
suai s_c uai_c
suan s_c uan_c
suang s_c uang_c
sui s_c uei_c
sun s_c uen_c
ta t_c a_c
tai t_c ai_c
tan t_c an_c
tang t_c ang_c
tao t_c ao_c
ten t_c en_c
ti t_c i_c
tian t_c ian_c
tiao t_c iao_c
tie t_c ie_c
tin t_c in_c
to t_c o_c
tong t_c ong_c
tou t_c ou_c
tu t_c u_c
tuan t_c uan_c
tui t_c uei_c
tuo t_c uo_c
wa w_c a_c
wai w_c ai_c
wan w_c an_c
wang w_c ang_c
wei w_c ei_c
wen w_c en_c
wo w_c o_c
wu w_c u_c
xi xx_c i_c
xia xx_c ia_c
xian xx_c ian_c
xiang xx_c iang_c
xiao xx_c iao_c
xie xx_c ie_c
xin xx_c in_c
xing xx_c ing_c
xiong xx_c iong_c
xiu xx_c iou_c
xu xx_c u_c
xuan xx_c van_c
xue xx_c ve_c
xun xx_c vn_c
ya y_c a_c
yan y_c an_c
yang y_c ang_c
yao y_c ao_c
ye y_c e_c
yi y_c i_c
yin y_c in_c
yo y_c o_c
yong y_c ong_c
you y_c ou_c
yu y_c u_c
yuan y_c van_c
yue y_c ve_c
yun y_c vn_c
yuo y_c uo_c
za z_c a_c
zai z_c ai_c
zan z_c an_c
zang z_c ang_c
zao z_c ao_c
ze z_c e_c
zei z_c ei_c
zen z_c en_c
zi z_c ii_c
zo z_c o_c
zong z_c ong_c
zou z_c ou_c
zu z_c u_c
zua z_c ua_c
zuai z_c uai_c
zuan z_c uan_c
zuang z_c uang_c
zui z_c uei_c
zuo z_c uo_c
bing b_c ing_c
cer c_c er_c
ei ge ei_c
en ge en_c
fou f_c ou_c
gei g_c ei_c
geng g_c eng_c
heng h_c eng_c
huar h_c ua_c
huei h_c uei_c
jing j_c ing_c
jo j_c o_c
keng k_c eng_c
kuei k_c uei_c
le l_c e_c
leng l_c eng_c
neng n_c eng_c
ling l_c ing_c
ning n_c ing_c
ming m_c ing_c
nar n_c a_c
ngai ng_c ai_c
ngan ng_c an_c
ngao ng_c ao_c
ngen ng_c en_c
ngo ng_c o_c
xou xx_c ou_c
ping p_c ing_c
reng r_c eng_c
ro r_c o_c
run r_c uen_c
sei s_c ei_c
seng s_c eng_c
te t_c e_c
teng t_c eng_c
ting t_c ing_c
tun t_c uen_c
wong w_c ong_c
ying y_c ing_c
zeng z_c eng_c
zun z_c uen_c
ang ga ang_c
ou go ou_c
banr b_c an_c
benr b_c en_c
bianr b_c ian_c
dianr d_c ian_c
dunr d_c uen_c
fenr f_c en_c
fo f_c o_c
fur f_c u_c
gunr g_c uen_c
guo g_c uo_c
hair h_c ai_c
har h_c a_c
hei h_c ei_c
huir h_c uei_c
jianr j_c ian_c
jingr j_c ing_c
jiong j_c iong_c
kanr k_c an_c
kei k_c ei_c
kuo k_c uo_c
lar l_c a_c
lei l_c ei_c
lianr l_c ian_c
nianr n_c ian_c
luei l_c uei_c
nuei n_c uei_c
maor m_c ao_c
menr m_c en_c
mou m_c ou_c
nga ng_c a_c
ngang ng_c ang_c
ngei ng_c ei_c
nger ng_c er_c
ngong ng_c ong_c
ngou ng_c ou_c
ningr n_c ing_c
niur n_c iou_c
nvr n_c v_c
qio q_c io_c
qo q_c o_c
rui r_c uei_c
sengr s_c eng_c
ter t_c er_c
tour t_c ou_c
wanr w_c an_c
war w_c a_c
weng w_c eng_c
wenr w_c en_c
xingr xx_c ing_c
xo xx_c o_c
yangr y_c ang_c
yanr y_c an_c
yar y_c a_c
yuanr y_c van_c
yuer y_c ve_c
zeir z_c ei_c
zer z_c er_c
jun j_c vn_c
beir b_c ei_c
cei c_c ei_c
dengr d_c eng_c
far f_c a_c
genr g_c en_c
hor h_c o_c
kor k_c o_c
miu m_c iou_c
nia n_c ia_c
penr p_c en_c
xianr xx_c ian_c
gue g_c ve_c
hue h_c ve_c
bangr b_c ang_c
baor b_c ao_c
bar b_c a_c
bingr b_c ing_c
cangr c_c ang_c
car c_c a_c
cengr c_c eng_c
cuanr c_c uan_c
cuir c_c uei_c
cunr c_c uen_c
danr d_c an_c
dar d_c a_c
dour d_c ou_c
duir d_c uei_c
feir f_c ei_c
fengr f_c eng_c
ganr g_c an_c
gaor g_c ao_c
gar g_c a_c
gengr g_c eng_c
gor g_c o_c
gour g_c ou_c
guanr g_c uan_c
guar g_c ua_c
hanr h_c an_c
hunr h_c uen_c
hur h_c u_c
jiaor j_c iao_c
jiar j_c ia_c
juanr j_c van_c
junr j_c vn_c
kar k_c a_c
kour k_c ou_c
kuair k_c uai_c
laor l_c ao_c
naor n_c ao_c
leir l_c ei_c
neir n_c ei_c
liur l_c iou_c
lur l_c u_c
nur n_c u_c
mianr m_c ian_c
miaor m_c iao_c
mingr m_c ing_c
minr m_c in_c
mur m_c u_c
nge ng_c e_c
niaor n_c iao_c
or go o_c
pair p_c ai_c
paor p_c ao_c
pianr p_c ian_c
piaor p_c iao_c
pon p_c iai_c
pur p_c u_c
qianr q_c ian_c
qir q_c i_c
qiur q_c iou_c
quanr q_c van_c
rei r_c ei_c
ruo r_c uo_c
sir s_c ii_c
sour s_c ou_c
sunr s_c uen_c
suo s_c uo_c
tair t_c ai_c
tanr t_c an_c
tei t_c ei_c
tianr t_c ian_c
tir t_c i_c
wangr w_c ang_c
weir w_c ei_c
xiar xx_c ia_c
yei y_c ei_c
yingr y_c ing_c
zengr z_c eng_c
zir z_c ii_c
zuanr z_c uan_c
zuir z_c uei_c
zur z_c u_c
beng b_c eng_c
cua c_c ua_c
dia d_c ia_c
duo d_c uo_c
eng ge eng_c
pou p_c ou_c
xuo xx_c uo_c
shao sh_c ao_c
zhen zh_c en_c
shi sh_c i_c
zhe zh_c e_c
lia l_c ia_c
hiang h_c iang_c
cuo c_c uo_c
ngeng ng_c eng_c
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment