Commit ee10550a authored by liugh5's avatar liugh5
Browse files

Initial commit

parents
Pipeline #790 canceled with stages
import logging
import os
from tqdm import tqdm
from .core.Script import Script
from .core.ScriptItem import ScriptItem
from .core.ScriptSentence import ScriptSentence
from .core.SyllableFormatter import (
ZhCNSyllableFormatter,
ZhHKSyllableFormatter,
PinYinSyllableFormatter,
WuuShanghaiSyllableFormatter,
SichuanSyllableFormatter,
EnXXSyllableFormatter,
)
from .core.ScriptWord import SpokenWord, SpokenMark, WrittenWord, WrittenMark
from .core.PhoneSet import PhoneSet
from .core.PosSet import PosSet
from .core.core_types import Language, BreakLevel
from .core.utils import (
RegexID,
RegexSentence,
RegexForeignLang,
RegexNeutralTone,
format_prosody,
)
import argparse
import re
from bitstring import BitArray
# Configure the root logger as soon as this module is imported: DEBUG level,
# with timestamp, level name and source location in front of every message.
logging.basicConfig(
    level=logging.DEBUG,
    datefmt="%Y-%m-%d:%H:%M:%S",
    format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
)
class TextScriptConvertor:
    """Convert a prosody-annotated text script into a script XML and a metafile.

    The formatted script is consumed as alternating lines: a sentence line
    (ID, words and break/POS/punctuation annotations) followed by the line
    holding its pronunciation.  Successfully parsed items are collected into a
    ``Script`` and written out by :meth:`process`.
    """

    def __init__(
        self,
        phoneset_path,
        posset_path,
        target_lang,
        foreign_lang,
        f2t_map_path,
        s2p_map_path,
        m_emo_tag_path,
        m_speaker,
    ):
        """Load every resource needed for conversion.

        Args:
            phoneset_path: Path handed to ``PhoneSet``.
            posset_path: Path handed to ``PosSet``.
            target_lang: Target language name, parsed with ``Language.parse``.
            foreign_lang: Foreign language name, parsed with ``Language.parse``.
            f2t_map_path: Tab-separated two-column map file (see LoadF2TMap).
            s2p_map_path: Tab-separated syllable-to-phone map file.
            m_emo_tag_path: Optional "<id> <emotion>" file; ``None`` disables
                emotion tagging (every line then gets ``emotion_neutral``).
            m_speaker: Speaker tag appended to every metafile entry.
        """
        self.m_f2p_map = {}
        self.m_s2p_map = {}
        self.m_phoneset = PhoneSet(phoneset_path)
        self.m_posset = PosSet(posset_path)
        self.m_target_lang = Language.parse(target_lang)
        self.m_foreign_lang = Language.parse(foreign_lang)
        self.m_emo_tag_path = m_emo_tag_path
        self.m_speaker = m_speaker

        self.LoadF2TMap(f2t_map_path)
        self.LoadS2PMap(s2p_map_path)

        if m_emo_tag_path is not None:
            self.m_emo_dict = self.parse_emo_tag(m_emo_tag_path)
        else:
            self.m_emo_dict = {}

        # The formatters depend on the maps loaded above.
        self.m_target_lang_syllable_formatter = self.InitSyllableFormatter(
            self.m_target_lang
        )
        self.m_foreign_lang_syllable_formatter = self.InitSyllableFormatter(
            self.m_foreign_lang
        )

    @staticmethod
    def _read_pairs(file_path, sep, owner):
        """Yield ``(key, value)`` for every well-formed line of a two-column file.

        Lines are stripped and split on ``sep`` (``None`` means any
        whitespace).  Lines that do not split into exactly two fields are
        logged as errors under the name ``owner`` and skipped.
        """
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                elements = line.split(sep)
                if len(elements) != 2:
                    logging.error(
                        "TextScriptConvertor.%s: invalid line: %s", owner, line
                    )
                    continue
                yield elements[0], elements[1]

    def parse_emo_tag(self, emo_tag_path):
        """Read a whitespace-separated "<id> <emotion>" file into a dict.

        Duplicate ids are logged as warnings; the last occurrence wins.
        """
        emo_tag = {}
        for key, value in self._read_pairs(emo_tag_path, None, "parse_emo_tag"):
            if key in emo_tag:
                logging.warning(
                    "TextScriptConvertor.parse_emo_tag: duplicate key: %s", key
                )
            emo_tag[key] = value
        return emo_tag

    def parse_sentence(self, sentence, line_num):
        """Parse one annotated sentence line into a ``ScriptItem``.

        Args:
            sentence: The raw sentence line, starting with its ID.
            line_num: Line index, used only in log messages.

        Returns:
            The populated ``ScriptItem``, or ``None`` when the line has no ID
            or contains a match that belongs to no known annotation group.
        """
        script_item = ScriptItem(self.m_phoneset, self.m_posset)
        script_sentence = ScriptSentence(self.m_phoneset, self.m_posset)
        script_item.m_scriptSentence_list.append(script_sentence)

        written_sentence = script_sentence.m_writtenSentence
        spoken_sentence = script_sentence.m_spokenSentence

        sentence = sentence.strip()

        # Get ID
        match = re.search(RegexID, sentence)
        if match is None:
            logging.error(
                "TextScriptConvertor.parse_sentence: invalid line: %s, "
                "line ID is needed",
                line_num,
            )
            return None
        script_item.m_id = match.group("ID")
        position = match.end()

        prevSpokenWord = SpokenWord()
        prevWord = False
        lastBreak = False

        for m in re.finditer(RegexSentence, sentence[position:]):
            if m.group("Word") is not None:
                wordName = m.group("Word")
                written_word = WrittenWord()
                written_word.m_name = wordName
                written_sentence.AddHost(written_word)

                spoken_word = SpokenWord()
                spoken_word.m_name = wordName
                prevSpokenWord = spoken_word
                prevWord = True
                lastBreak = False
            elif m.group("Break") is not None:
                breakText = m.group("BreakLevel")
                if len(breakText) == 0:
                    breakLevel = BreakLevel.L1
                else:
                    breakLevel = BreakLevel.parse(breakText)

                # The pending word is committed once its break is known.
                if prevWord:
                    prevSpokenWord.m_breakText = breakText
                    spoken_sentence.AddHost(prevSpokenWord)

                if breakLevel != BreakLevel.L1:
                    spokenMark = SpokenMark()
                    spokenMark.m_breakLevel = breakLevel
                    spoken_sentence.AddAccompany(spokenMark)

                lastBreak = True
            elif m.group("PhraseTone") is not None:
                # TODO: PhraseTonePattern
                pass
            elif m.group("POS") is not None:
                POSClass = m.group("POSClass")
                if prevWord:
                    # SpokenWord serialises ``m_POS``; the lower-case
                    # attribute alone was never written out, so the POS
                    # annotation was silently dropped.  ``m_pos`` is still
                    # set for any code that reads the legacy attribute.
                    prevSpokenWord.m_pos = POSClass
                    if POSClass is not None:
                        prevSpokenWord.m_POS = POSClass
                prevWord = False
            elif m.group("Mark") is not None:
                writtenMark = WrittenMark()
                writtenMark.m_punctuation = m.group("Mark")
                written_sentence.AddAccompany(writtenMark)
            else:
                logging.error(
                    "TextScriptConvertor.parse_sentence: invalid line: %s, "
                    "matched pattern is unrecognized",
                    line_num,
                )
                return None

        # A sentence that does not end with a break gets a final "4" break.
        if not lastBreak:
            prevSpokenWord.m_breakText = "4"
            spoken_sentence.AddHost(prevSpokenWord)

        # If the last word has no accompanying mark yet, close with an L4 mark.
        spoken_word_cnt = len(spoken_sentence.m_spoken_word_list)
        spoken_mark_cnt = len(spoken_sentence.m_spoken_mark_list)
        if (
            spoken_word_cnt > 0
            and spoken_sentence.m_align_list[spoken_word_cnt - 1] == spoken_mark_cnt
        ):
            spokenMark = SpokenMark()
            spokenMark.m_breakLevel = BreakLevel.L4
            spoken_sentence.AddAccompany(spokenMark)

        written_sentence.BuildSequence()
        spoken_sentence.BuildSequence()
        written_sentence.BuildText()
        spoken_sentence.BuildText()

        script_sentence.m_text = written_sentence.m_text
        script_item.m_text = written_sentence.m_text

        return script_item

    def FormatSyllable(self, pron, syllable_list):
        """Append the syllable(s) for ``pron`` to ``syllable_list``.

        The foreign-language formatter is used when one exists and ``pron``
        matches ``RegexForeignLang``; otherwise the target-language formatter.

        Returns:
            The formatter's result, or ``False`` when no formatter is
            available for the pronunciation.
        """
        isForeign = RegexForeignLang.search(pron) is not None
        if self.m_foreign_lang_syllable_formatter is not None and isForeign:
            formatter = self.m_foreign_lang_syllable_formatter
        else:
            formatter = self.m_target_lang_syllable_formatter

        if formatter is None:
            # InitSyllableFormatter returns None for unsupported languages or
            # an empty map; report it instead of failing on ``None.Format``.
            logging.error(
                "TextScriptConvertor.FormatSyllable: no syllable formatter "
                "available for pronunciation: %s",
                pron,
            )
            return False
        return formatter.Format(self.m_phoneset, pron, syllable_list)

    def GetWordProns(self, pronText):
        """Split a pronunciation line into per-syllable / per-word entries.

        The line is split on "/".  A part matching ``RegexForeignLang`` is
        kept whole; any other part is split further on single spaces.
        """
        res = []
        for pron in pronText.split("/"):
            if re.search(RegexForeignLang, pron):
                res.append(pron.strip())
            else:
                res.extend(pron.strip().split(" "))
        return res

    def IsErHuaYin(self, pron):
        """Return True when ``pron`` (minus its last char) ends in "r" but is not "er".

        ``RegexNeutralTone`` matches are first replaced by "5"; the final
        character is then dropped before the check.
        """
        pron = RegexNeutralTone.sub("5", pron)
        base = pron[:-1]
        # ``endswith`` is safe on an empty string, unlike ``base[-1]``.
        return base.endswith("r") and base != "er"

    def _strip_erhua(self, spoken_word, language, pron):
        """Remove "儿" from the word's name when ``pron`` is an erhua syllable.

        Applies only to ZhCN, PinYin and Sichuan.  Returns True when the name
        was changed, so the caller can expect one syllable fewer.
        """
        if (
            language in [Language.ZhCN, Language.PinYin, Language.Sichuan]
            and self.IsErHuaYin(pron)
            and "儿" in spoken_word.m_name
        ):
            spoken_word.m_name = spoken_word.m_name.replace("儿", "")
            return True
        return False

    def parse_pronunciation(self, script_item, pronunciation, line_num):
        """Attach the syllables of a pronunciation line to the item's spoken words.

        English entries map one pronunciation to one word.  For the Chinese
        variants one pronunciation is consumed per character of the word's
        name (minus one per stripped erhua "儿").

        Returns:
            True when every pronunciation was consumed and every spoken word
            received syllables; False (after logging) otherwise.
        """
        spoken_sentence = script_item.m_scriptSentence_list[0].m_spokenSentence
        spoken_words = spoken_sentence.m_spoken_word_list

        wordProns = self.GetWordProns(pronunciation)

        wordIndex = 0
        pronIndex = 0
        pron = ""

        while pronIndex < len(wordProns):
            syllable_list = []
            pron = wordProns[pronIndex].strip()

            if not self.FormatSyllable(pron, syllable_list):
                logging.error(
                    "TextScriptConvertor.parse_pronunciation: invalid line: %s, "
                    "error pronunciation: %s, syllable format error",
                    line_num,
                    pron,
                )
                return False

            language = syllable_list[0].m_language

            if wordIndex >= len(spoken_words):
                logging.error(
                    "TextScriptConvertor.parse_pronunciation: invalid line: %s, "
                    "error pronunciation: %s, word index is out of range",
                    line_num,
                    pron,
                )
                return False

            word = spoken_words[wordIndex]

            if language in [Language.EnGB, Language.EnUS]:
                word.m_syllable_list.extend(syllable_list)
                wordIndex += 1
                pronIndex += 1
                continue

            if language not in [
                Language.ZhCN,
                Language.PinYin,
                Language.ZhHK,
                Language.WuuShanghai,
                Language.Sichuan,
            ]:
                logging.error(
                    "TextScriptConvertor.parse_pronunciation: invalid line: %s, "
                    "unsupported language: %s",
                    line_num,
                    language.name,
                )
                return False

            charCount = len(word.m_name)
            if self._strip_erhua(word, language, pron):
                charCount -= 1

            # FIXME(Jin): Just skip the first char then match the rest char.
            # The first character's syllable is already in syllable_list;
            # consume one more pronunciation per remaining character.
            i = 1
            while i < charCount:
                pronIndex += 1
                if pronIndex >= len(wordProns):
                    logging.error(
                        "TextScriptConvertor.parse_pronunciation: invalid line: %s, "
                        "error pronunciation: %s, Word count mismatch with Pron count",
                        line_num,
                        pron,
                    )
                    return False

                pron = wordProns[pronIndex].strip()
                if not self.FormatSyllable(pron, syllable_list):
                    logging.error(
                        "TextScriptConvertor.parse_pronunciation: invalid line: %s, "
                        "error pronunciation: %s, syllable format error",
                        line_num,
                        pron,
                    )
                    return False

                # NOTE: ``language`` is still that of the word's first syllable.
                if self._strip_erhua(word, language, pron):
                    charCount -= 1
                i += 1

            word.m_syllable_list.extend(syllable_list)
            wordIndex += 1
            pronIndex += 1

        if pronIndex != len(wordProns):
            logging.error(
                "TextScriptConvertor.parse_pronunciation: invalid line: %s, "
                "error pronunciation: %s, pron count mismatch with word count",
                line_num,
                pron,
            )
            return False

        if wordIndex != len(spoken_words):
            logging.error(
                "TextScriptConvertor.parse_pronunciation: invalid line: %s, "
                "error pronunciation: %s, word count mismatch with word index",
                line_num,
                pron,
            )
            return False

        return True

    def LoadF2TMap(self, file_path):
        """Load the tab-separated F2T map into ``self.m_f2p_map``.

        Each value is split on single spaces into a list.  Duplicate keys are
        logged as errors; the last occurrence wins.
        """
        for key, value in self._read_pairs(file_path, "\t", "LoadF2TMap"):
            if key in self.m_f2p_map:
                logging.error(
                    "TextScriptConvertor.LoadF2TMap: duplicate key: %s", key
                )
            self.m_f2p_map[key] = value.split(" ")

    def LoadS2PMap(self, file_path):
        """Load the tab-separated syllable-to-phone map into ``self.m_s2p_map``.

        Duplicate keys are logged as errors; the last occurrence wins.
        """
        for key, value in self._read_pairs(file_path, "\t", "LoadS2PMap"):
            if key in self.m_s2p_map:
                logging.error(
                    "TextScriptConvertor.LoadS2PMap: duplicate key: %s", key
                )
            self.m_s2p_map[key] = value

    def InitSyllableFormatter(self, targetLang):
        """Build the syllable formatter for ``targetLang``.

        Returns:
            A formatter instance, or ``None`` (after logging an error) when
            the language is unsupported or a map-based language is requested
            while the syllable-to-phone map is empty.
        """
        # Languages whose formatter is driven by the syllable-to-phone map.
        map_based = [
            (Language.ZhCN, "ZhCN", ZhCNSyllableFormatter),
            (Language.PinYin, "PinYin", PinYinSyllableFormatter),
            (Language.ZhHK, "ZhHK", ZhHKSyllableFormatter),
            (Language.WuuShanghai, "WuuShanghai", WuuShanghaiSyllableFormatter),
            (Language.Sichuan, "Sichuan", SichuanSyllableFormatter),
        ]
        for lang, label, formatter_cls in map_based:
            if targetLang == lang:
                if len(self.m_s2p_map) == 0:
                    logging.error(
                        "TextScriptConvertor.InitSyllableFormatter: "
                        "%s syllable to phone map is empty",
                        label,
                    )
                    return None
                return formatter_cls(self.m_s2p_map)

        for lang in (Language.EnGB, Language.EnUS):
            if targetLang == lang:
                formatter = EnXXSyllableFormatter(lang)
                if len(self.m_f2p_map) != 0:
                    formatter.m_f2t_map = self.m_f2p_map
                return formatter

        logging.error(
            "TextScriptConvertor.InitSyllableFormatter: unsupported language: %s",
            targetLang,
        )
        return None

    def process(self, textScriptPath, outputXMLPath, outputMetafile):
        """Convert a text script and write the XML script and tagged metafile.

        Lines returned by ``format_prosody(textScriptPath)`` are read in
        pairs: even-indexed lines are sentences, odd-indexed lines are their
        pronunciations.  Items failing either parse step are dropped (the
        parsers log the reason).  Every metafile entry is suffixed with the
        emotion looked up by line id (default ``emotion_neutral``) and the
        speaker tag.
        """
        script = Script(self.m_phoneset, self.m_posset)
        formatted_lines = format_prosody(textScriptPath)

        item = None
        for line_num, line in enumerate(tqdm(formatted_lines)):
            if line_num % 2 == 0:
                item = self.parse_sentence(line.strip(), line_num)
            elif item is not None:
                if self.parse_pronunciation(item, line.strip(), line_num):
                    script.m_items.append(item)

        script.Save(outputXMLPath)
        logging.info("TextScriptConvertor.process:\nSave script to: %s", outputXMLPath)

        speaker = self.m_speaker
        meta_lines_tagged = []
        for line in script.SaveMetafile():
            line_id, line_text = line.split("\t")
            emo = self.m_emo_dict.get(line_id, "emotion_neutral")
            # Each entry ends with "}"; re-open it to append emotion and speaker.
            syll_items_tagged = [
                syll_item[:-1] + "$" + emo + "$" + speaker + "}"
                for syll_item in line_text.split(" ")
            ]
            meta_lines_tagged.append(line_id + "\t" + " ".join(syll_items_tagged))

        with open(outputMetafile, "w", encoding="utf-8") as f:
            f.writelines(line + "\n" for line in meta_lines_tagged)
        logging.info(
            "TextScriptConvertor.process:\nSave metafile to: %s", outputMetafile
        )

    @staticmethod
    def turn_text_into_bytes(plain_text_path, output_meta_file_path, speaker):
        """Write a byte-level metafile for a "<id>\\t<sentence>" text file.

        Every UTF-8 byte of the sentence becomes one
        ``{<byte>$emotion_neutral$<speaker>}`` entry.  When the last byte is
        not "!", "." or "?" (33, 46, 63) a "." entry is appended.  Blank lines
        are skipped; lines without an id/sentence pair are logged and skipped.
        """
        sentence_end_bytes = (33, 46, 63)  # "!", ".", "?"
        meta_lines = []

        with open(plain_text_path, "r", encoding="utf-8") as in_file:
            for text_line in in_file:
                text_line = text_line.strip()
                if not text_line:
                    continue

                # Split on the first tab only, so a tab inside the sentence
                # cannot break the unpacking.
                sentence_id, sep, sentence = text_line.partition("\t")
                if not sep or not sentence:
                    logging.error(
                        "TextScriptConverter.turn_text_into_bytes: "
                        "invalid line: %s",
                        text_line,
                    )
                    continue

                # Iterating ``bytes`` yields ints in 0..255 directly.
                byte_values = list(sentence.encode("utf-8"))
                if byte_values[-1] not in sentence_end_bytes:
                    byte_values.append(46)

                sequence = [
                    "{{{}$emotion_neutral${}}}".format(value, speaker)
                    for value in byte_values
                ]
                meta_lines.append("{}\t{}\n".format(sentence_id, " ".join(sequence)))

        with open(output_meta_file_path, "w", encoding="utf-8") as out_file:
            out_file.writelines(meta_lines)
def main(args):
    """Run a full conversion from parsed command-line arguments.

    ``args`` must provide the attributes registered by the argument parser
    in this file's ``__main__`` section.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info("TextScriptConvertor.main: start")

    speaker = args.speaker
    if speaker is None:
        # NOTE(review): this yields the grandparent *directory path* of the
        # script, not a bare name — confirm this is the intended speaker tag.
        speaker = os.path.dirname(os.path.dirname(args.text_script_path))

    convertor = TextScriptConvertor(
        phoneset_path=args.phoneset_path,
        posset_path=args.posset_path,
        target_lang=args.language,
        foreign_lang=args.foreignLang,
        f2t_map_path=args.f2t_map_path,
        s2p_map_path=args.s2p_map_path,
        m_emo_tag_path=args.emo_tag_path,
        m_speaker=speaker,
    )
    convertor.process(
        args.text_script_path,
        args.output_xml_path,
        args.output_metafile,
    )

    logging.info("TextScriptConvertor.main: end")
# TODO(jin): add emotional style; add speaker info;
if __name__ == "__main__":
    # Command-line entry point: register the options, parse them, run main().
    parser = argparse.ArgumentParser(description="TextScriptConvertor")

    # (flag, help text) for every mandatory string option, in display order.
    required_options = (
        ("--language", "target language"),
        ("--foreignLang", "foreign language"),
        ("--phoneset_path", "phoneset path"),
        ("--posset_path", "posset path"),
        ("--f2t_map_path", "f2t map path"),
        ("--s2p_map_path", "s2p map path"),
        ("--text_script_path", "input text script path"),
        ("--output_xml_path", "output xml path"),
        ("--output_metafile", "output metafile path"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    # Optional string options; both default to None.
    optional_options = (
        ("--emo_tag_path", "emotion tag path"),
        ("--speaker", "speaker"),
    )
    for flag, help_text in optional_options:
        parser.add_argument(flag, type=str, default=None, help=help_text)

    args = parser.parse_args()
    main(args)
from .XmlObj import XmlObj
from .core_types import PhoneCVType, PhoneIFType, PhoneUVType, PhoneAPType, PhoneAMType
class Phone(XmlObj):
    """A single phone entry read from a phone-set XML file."""

    def __init__(self):
        self.m_id = None
        self.m_name = None
        # All classification fields start at their NULL member.
        self.m_cv_type = PhoneCVType.NULL
        self.m_if_type = PhoneIFType.NULL
        self.m_uv_type = PhoneUVType.NULL
        self.m_ap_type = PhoneAPType.NULL
        self.m_am_type = PhoneAMType.NULL
        self.m_bnd = False

    def __str__(self):
        return self.m_name

    def Save(self):
        """Not implemented."""
        pass

    def Load(self, phone_node):
        """Fill this phone from a ``<phone>`` element.

        Reads the namespaced child elements id, name, cv, if, uv, ap and am,
        in that order; the id is converted with ``int`` and each type field
        with the ``parse`` method of its type.
        """
        ns = "{http://schemas.alibaba-inc.com/tts}"

        def child_text(tag):
            # Text of the namespaced child element ``tag``.
            return phone_node.find(ns + tag).text

        self.m_id = int(child_text("id"))
        self.m_name = child_text("name")
        self.m_cv_type = PhoneCVType.parse(child_text("cv"))
        self.m_if_type = PhoneIFType.parse(child_text("if"))
        self.m_uv_type = PhoneUVType.parse(child_text("uv"))
        self.m_ap_type = PhoneAPType.parse(child_text("ap"))
        self.m_am_type = PhoneAMType.parse(child_text("am"))
import xml.etree.ElementTree as ET
import logging
from .XmlObj import XmlObj
from .Phone import Phone
class PhoneSet(XmlObj):
    """The phones of a phone-set XML file, indexed by id and by name."""

    def __init__(self, phoneset_path):
        """Create the set and immediately load it from ``phoneset_path``."""
        self.m_phone_list = []
        self.m_id_map = {}
        self.m_name_map = {}
        self.Load(phoneset_path)

    def Load(self, file_path):
        """Parse ``file_path`` and register every ``<phone>`` child of the root.

        Duplicate ids or names are logged as errors; the later phone replaces
        the earlier one in the lookup maps, while ``m_phone_list`` keeps both.
        """
        # alibaba tts xml namespace
        ns = "{http://schemas.alibaba-inc.com/tts}"

        phoneset_root = ET.parse(file_path).getroot()
        for phone_node in phoneset_root.findall(ns + "phone"):
            phone = Phone()
            phone.Load(phone_node)
            self.m_phone_list.append(phone)

            if phone.m_id in self.m_id_map:
                logging.error("PhoneSet.Load: duplicate id: %d", phone.m_id)
            self.m_id_map[phone.m_id] = phone

            if phone.m_name in self.m_name_map:
                # Message aligned with the duplicate-id one (the original
                # read "duplicate name name" and lacked the colon).
                logging.error("PhoneSet.Load: duplicate name: %s", phone.m_name)
            self.m_name_map[phone.m_name] = phone

    def Save(self):
        """Not implemented."""
        pass
# if __name__ == "__main__":
# import os
# import sys
#
# phoneset = PhoneSet()
# phoneset.Load(sys.argv[1])
#
# for phone in phoneset.m_phone_list:
# print(phone)
# print(phone.m_id)
# print(phone.m_name)
# print(phone.m_cv_type)
# print(phone.m_if_type)
# print(phone.m_uv_type)
# print(phone.m_ap_type)
# print(phone.m_am_type)
# print(phone.m_bnd)
from .XmlObj import XmlObj
class Pos(XmlObj):
    """A part-of-speech entry, optionally holding nested sub-entries."""

    def __init__(self):
        self.m_id = None
        self.m_name = None
        self.m_desc = None
        self.m_level = 1  # nesting depth; top-level entries are level 1
        self.m_parent = None
        self.m_sub_pos_list = []

    def __str__(self):
        return self.m_name

    def Save(self):
        """Not implemented."""
        pass

    def Load(self, pos_node):
        """Fill this entry, and recursively its sub-entries, from a ``<pos>`` element.

        Reads the namespaced children id (converted with ``int``), name and
        desc.  When a ``<sub>`` child exists, each of its ``<pos>`` children
        is loaded as a sub-entry whose parent is this entry and whose level
        is this entry's level plus one.
        """
        ns = "{http://schemas.alibaba-inc.com/tts}"

        id_node = pos_node.find(ns + "id")
        self.m_id = int(id_node.text)

        name_node = pos_node.find(ns + "name")
        self.m_name = name_node.text

        desc_node = pos_node.find(ns + "desc")
        self.m_desc = desc_node.text

        sub_node = pos_node.find(ns + "sub")
        if sub_node is not None:
            for sub_pos_node in sub_node.findall(ns + "pos"):
                sub_pos = Pos()
                # Parent and level must be set BEFORE the recursive Load:
                # the child's own children derive their level from it.
                # Setting it afterwards left every deeper entry at level 2.
                sub_pos.m_parent = self
                sub_pos.m_level = self.m_level + 1
                sub_pos.Load(sub_pos_node)
                self.m_sub_pos_list.append(sub_pos)
import xml.etree.ElementTree as ET
import logging
from .XmlObj import XmlObj
from .Pos import Pos
class PosSet(XmlObj):
    """The entries of a POS-set XML file, indexed by id and by name."""

    def __init__(self, posset_path):
        """Create the set and immediately load it from ``posset_path``."""
        self.m_pos_list = []
        self.m_id_map = {}
        self.m_name_map = {}
        self.Load(posset_path)

    def _register(self, pos):
        """Append ``pos`` to the list and index it by id and name.

        Duplicates are logged as errors; the later entry replaces the earlier
        one in the lookup maps.
        """
        self.m_pos_list.append(pos)

        if pos.m_id in self.m_id_map:
            logging.error("PosSet.Load: duplicate id: %d", pos.m_id)
        self.m_id_map[pos.m_id] = pos

        if pos.m_name in self.m_name_map:
            # Message aligned with the duplicate-id one (the original read
            # "duplicate name name" and lacked the colon).
            logging.error("PosSet.Load: duplicate name: %s", pos.m_name)
        self.m_name_map[pos.m_name] = pos

    def Load(self, file_path):
        """Parse ``file_path`` and register each top-level ``<pos>`` and its sub-entries.

        Only the direct sub-entries of each top-level entry are registered
        here.
        NOTE(review): deeper levels loaded by ``Pos.Load`` are not indexed —
        confirm whether that is intended.
        """
        # alibaba tts xml namespace
        ns = "{http://schemas.alibaba-inc.com/tts}"

        posset_root = ET.parse(file_path).getroot()
        for pos_node in posset_root.findall(ns + "pos"):
            pos = Pos()
            pos.Load(pos_node)
            self._register(pos)
            for sub_pos in pos.m_sub_pos_list:
                self._register(sub_pos)

    def Save(self):
        """Not implemented."""
        pass
# if __name__ == "__main__":
# import os
# import sys
#
# posset = PosSet()
# posset.Load(sys.argv[1])
#
# for pos in posset.m_pos_list:
# print(pos)
# print(pos.m_id)
# print(pos.m_name)
# print(pos.m_desc)
# print(pos.m_level)
# print(pos.m_parent)
# if pos.m_sub_pos_list:
# print("sub pos list:")
# for sub_pos in pos.m_sub_pos_list:
# print(sub_pos)
# print(sub_pos.m_id)
# print(sub_pos.m_name)
# print(sub_pos.m_desc)
# print(sub_pos.m_level)
# print(sub_pos.m_parent)
# print("sub pos list end")
from .XmlObj import XmlObj
import xml.etree.ElementTree as ET
from xml.dom import minidom
class Script(XmlObj):
    """A collection of script items that can be written as XML or metafile lines."""

    def __init__(self, phoneset, posset):
        self.m_phoneset = phoneset
        self.m_posset = posset
        self.m_items = []

    def Save(self, outputXMLPath):
        """Write all items to ``outputXMLPath`` as pretty-printed UTF-8 XML.

        The root ``<script>`` element carries the item count (``uttcount``)
        and the namespace; each item appends itself through its own ``Save``.
        """
        script_root = ET.Element("script")
        script_root.set("uttcount", str(len(self.m_items)))
        script_root.set("xmlns", "http://schemas.alibaba-inc.com/tts")

        for script_item in self.m_items:
            script_item.Save(script_root)

        # Re-parse with minidom only to obtain indented output.
        raw_xml = ET.tostring(script_root)
        document = minidom.parseString(raw_xml)
        pretty_bytes = document.toprettyxml(indent=" ", encoding="utf-8")

        with open(outputXMLPath, "wb") as xml_file:
            xml_file.write(pretty_bytes)

    def SaveMetafile(self):
        """Return the metafile line of every item, in item order."""
        return [script_item.SaveMetafile() for script_item in self.m_items]
import xml.etree.ElementTree as ET
from .XmlObj import XmlObj
class ScriptItem(XmlObj):
    """One utterance of a script: an id, its text and its sentences."""

    def __init__(self, phoneset, posset):
        """Create an empty item.

        Raises:
            Exception: when ``phoneset`` or ``posset`` is ``None``.
        """
        if phoneset is None or posset is None:
            raise Exception("ScriptItem.__init__: phoneset or posset is None")

        self.m_phoneset = phoneset
        self.m_posset = posset

        self.m_id = None
        self.m_text = ""
        self.m_scriptSentence_list = []
        self.m_status = None

    def Load(self):
        """Not implemented."""
        pass

    def Save(self, parent_node):
        """Append this item to ``parent_node`` as an ``<utterance>`` element.

        The element gets the item id as its ``id`` attribute and a ``<text>``
        child; every sentence then saves itself under the utterance.
        """
        utterance = ET.SubElement(parent_node, "utterance")
        utterance.set("id", self.m_id)

        ET.SubElement(utterance, "text").text = self.m_text

        for script_sentence in self.m_scriptSentence_list:
            script_sentence.Save(utterance)

    def SaveMetafile(self):
        """Return "<id>\\t" followed by the concatenated sentence metafile text."""
        return (
            self.m_id
            + "\t"
            + "".join(
                script_sentence.SaveMetafile()
                for script_sentence in self.m_scriptSentence_list
            )
        )
from .XmlObj import XmlObj
import xml.etree.ElementTree as ET
# TODO(jin): Not referenced, temporarily commented
class WrittenSentence(XmlObj):
    """Written form of a sentence: words ("hosts") interleaved with marks.

    ``m_align_list[i]`` records how many marks had been added when word ``i``
    was added, i.e. the index of the first mark that follows that word.
    """

    def __init__(self, posset):
        self.m_written_word_list = []
        self.m_written_mark_list = []
        self.m_posset = posset
        self.m_align_list = []
        self.m_alignCursor = 0
        self.m_accompanyIndex = 0
        self.m_sequence = ""
        self.m_text = ""

    def AddHost(self, writtenWord):
        """Append a word and remember the current mark count for alignment."""
        self.m_written_word_list.append(writtenWord)
        self.m_align_list.append(self.m_alignCursor)

    def LoadHost(self):
        """Not implemented."""
        pass

    def SaveHost(self):
        """Not implemented."""
        pass

    def AddAccompany(self, writtenMark):
        """Append a mark and advance the alignment cursor."""
        self.m_written_mark_list.append(writtenMark)
        self.m_alignCursor += 1
        self.m_accompanyIndex += 1

    def SaveAccompany(self):
        """Not implemented."""
        pass

    def LoadAccompany(self):
        """Not implemented."""
        pass

    # Get the mark span corresponding to specific written word
    def GetAccompanySpan(self, host_index):
        """Return the ``(begin, end)`` mark indices that follow word ``host_index``.

        ``host_index == -1`` selects the marks placed before the first word;
        when no word has been added that span covers every mark.
        """
        if host_index == -1:
            if not self.m_align_list:
                # No word yet: indexing m_align_list[0] would raise IndexError.
                return (0, len(self.m_written_mark_list))
            return (0, self.m_align_list[0])

        accompany_begin = self.m_align_list[host_index]
        accompany_end = (
            self.m_align_list[host_index + 1]
            if host_index + 1 < len(self.m_written_word_list)
            else len(self.m_written_mark_list)
        )
        return (accompany_begin, accompany_end)

    # TODO: iterable
    def GetElements(self):
        """Return words and marks merged in their original order."""
        begin, end = self.GetAccompanySpan(-1)
        res_lst = self.m_written_mark_list[begin:end]

        for j, word in enumerate(self.m_written_word_list):
            begin, end = self.GetAccompanySpan(j)
            res_lst.append(word)
            res_lst.extend(self.m_written_mark_list[begin:end])

        return res_lst

    def BuildSequence(self):
        """Store the space-separated element sequence in ``m_sequence``."""
        self.m_sequence = " ".join([str(ele) for ele in self.GetElements()])

    def BuildText(self):
        """Store the unseparated element text in ``m_text``."""
        self.m_text = "".join([str(ele) for ele in self.GetElements()])
class SpokenSentence(XmlObj):
    """Spoken form of a sentence: words ("hosts") interleaved with break marks.

    ``m_align_list[i]`` records how many marks had been added when word ``i``
    was added, i.e. the index of the first mark that follows that word.
    """

    def __init__(self, phoneset):
        self.m_spoken_word_list = []
        self.m_spoken_mark_list = []
        self.m_phoneset = phoneset
        self.m_align_list = []
        self.m_alignCursor = 0
        self.m_accompanyIndex = 0
        self.m_sequence = ""
        self.m_text = ""

    def __len__(self):
        """Number of spoken words."""
        return len(self.m_spoken_word_list)

    def AddHost(self, spokenWord):
        """Append a word and remember the current mark count for alignment."""
        self.m_spoken_word_list.append(spokenWord)
        self.m_align_list.append(self.m_alignCursor)

    def SaveHost(self):
        """Not implemented."""
        pass

    def LoadHost(self):
        """Not implemented."""
        pass

    def AddAccompany(self, spokenMark):
        """Append a mark and advance the alignment cursor."""
        self.m_spoken_mark_list.append(spokenMark)
        self.m_alignCursor += 1
        self.m_accompanyIndex += 1

    def SaveAccompany(self):
        """Not implemented."""
        pass

    # Get the mark span corresponding to specific spoken word
    def GetAccompanySpan(self, host_index):
        """Return the ``(begin, end)`` mark indices that follow word ``host_index``.

        ``host_index == -1`` selects the marks placed before the first word;
        when no word has been added that span covers every mark.
        """
        if host_index == -1:
            if not self.m_align_list:
                # No word yet: indexing m_align_list[0] would raise IndexError.
                return (0, len(self.m_spoken_mark_list))
            return (0, self.m_align_list[0])

        accompany_begin = self.m_align_list[host_index]
        accompany_end = (
            self.m_align_list[host_index + 1]
            if host_index + 1 < len(self.m_spoken_word_list)
            else len(self.m_spoken_mark_list)
        )
        return (accompany_begin, accompany_end)

    # TODO: iterable
    def GetElements(self):
        """Return words and marks merged in their original order."""
        begin, end = self.GetAccompanySpan(-1)
        res_lst = self.m_spoken_mark_list[begin:end]

        for j, word in enumerate(self.m_spoken_word_list):
            begin, end = self.GetAccompanySpan(j)
            res_lst.append(word)
            res_lst.extend(self.m_spoken_mark_list[begin:end])

        return res_lst

    def LoadAccompany(self):
        """Not implemented."""
        pass

    def BuildSequence(self):
        """Store the space-separated element sequence in ``m_sequence``."""
        self.m_sequence = " ".join([str(ele) for ele in self.GetElements()])

    def BuildText(self):
        """Store the unseparated element text in ``m_text``."""
        self.m_text = "".join([str(ele) for ele in self.GetElements()])

    def Save(self, parent_node):
        """Append a ``<spoken>`` element with the sequence text and every word."""
        spoken_node = ET.SubElement(parent_node, "spoken")
        spoken_node.set("wordcount", str(len(self.m_spoken_word_list)))

        text_node = ET.SubElement(spoken_node, "text")
        text_node.text = self.m_sequence

        # TODO: spoken mark might be used
        for word in self.m_spoken_word_list:
            word.Save(spoken_node)

    def SaveMetafile(self):
        """Return the words' metafile text joined by single spaces."""
        meta_line_list = [word.SaveMetafile() for word in self.m_spoken_word_list]
        return " ".join(meta_line_list)
class ScriptSentence(XmlObj):
    """Pairs the written and the spoken form of one sentence."""

    def __init__(self, phoneset, posset):
        self.m_phoneset = phoneset
        self.m_posset = posset
        self.m_writtenSentence = WrittenSentence(posset)
        self.m_spokenSentence = SpokenSentence(phoneset)
        self.m_text = ""

    def Save(self, parent_node):
        """Save the spoken sentence under ``parent_node`` unless it has no words."""
        if len(self.m_spokenSentence) == 0:
            return
        self.m_spokenSentence.Save(parent_node)

    def SaveMetafile(self):
        """Return the spoken sentence's metafile text, or "" when it has no words."""
        if len(self.m_spokenSentence) == 0:
            return ""
        return self.m_spokenSentence.SaveMetafile()
import xml.etree.ElementTree as ET
from .XmlObj import XmlObj
from .core_types import Language
from .Syllable import SyllableList
# TODO(Jin): Not referenced, temporarily commented
class WrittenWord(XmlObj):
    """A word of the written sentence; prints as its name."""

    def __init__(self):
        self.m_name = None
        self.m_POS = None

    def __str__(self):
        return self.m_name

    def Load(self):
        """Not implemented."""
        pass

    def Save(self):
        """Not implemented."""
        pass
class WrittenMark(XmlObj):
    """A mark of the written sentence; prints as its ``m_punctuation`` text."""

    def __init__(self):
        self.m_punctuation = None

    def __str__(self):
        return self.m_punctuation

    def Load(self):
        """Not implemented."""
        pass

    def Save(self):
        """Not implemented."""
        pass
class SpokenWord(XmlObj):
    """A word of the spoken sentence with its syllables, break text and POS."""

    def __init__(self):
        self.m_name = None
        self.m_language = None
        self.m_syllable_list = []
        self.m_breakText = "1"
        self.m_POS = "0"

    def __str__(self):
        return self.m_name

    def Load(self):
        """Not implemented."""
        pass

    def Save(self, parent_node):
        """Append this word to ``parent_node`` as a ``<word>`` element.

        Children are written in the order name, lang (only when the first
        syllable's language is not ``Language.Neutral``), syllable, break, POS.
        """
        word_node = ET.SubElement(parent_node, "word")

        ET.SubElement(word_node, "name").text = self.m_name

        has_syllables = len(self.m_syllable_list) > 0
        if has_syllables and self.m_syllable_list[0].m_language != Language.Neutral:
            lang_node = ET.SubElement(word_node, "lang")
            lang_node.text = self.m_syllable_list[0].m_language.name

        SyllableList(self.m_syllable_list).Save(word_node)

        ET.SubElement(word_node, "break").text = self.m_breakText
        ET.SubElement(word_node, "POS").text = self.m_POS

    def SaveMetafile(self):
        """Return the metafile text of this word.

        Each syllable is rendered with its position inside the word
        (``word_both`` when the whole word has exactly one phone, otherwise
        ``word_begin`` / ``word_middle`` / ``word_end`` by index).  A break
        entry is appended unless the break text is "0" or ``None``.
        """
        syllables = self.m_syllable_list
        total_phones = sum(syllable.PhoneCount() for syllable in syllables)
        only_one_syllable = len(syllables) == 1
        last_index = len(syllables) - 1

        pieces = []
        for position, syllable in enumerate(syllables):
            if total_phones == 1:
                word_pos = "word_both"
            elif position == 0:
                word_pos = "word_begin"
            elif position == last_index:
                word_pos = "word_end"
            else:
                word_pos = "word_middle"
            pieces.append(
                syllable.SaveMetafile(word_pos, single_syllable_word=only_one_syllable)
            )

        if self.m_breakText is not None and self.m_breakText != "0":
            pieces.append(
                "{{#{}$tone_none$s_none$word_none}}".format(self.m_breakText)
            )

        return " ".join(pieces)
class SpokenMark(XmlObj):
    """A break mark of the spoken sentence; prints as "#<level value>"."""

    def __init__(self):
        self.m_breakLevel = None

    def BreakLevel2Text(self):
        """Return "#" followed by the break level's ``value``."""
        return "#{}".format(str(self.m_breakLevel.value))

    def __str__(self):
        return self.BreakLevel2Text()

    def Load(self):
        """Not implemented."""
        pass

    def Save(self):
        """Not implemented."""
        pass
import xml.etree.ElementTree as ET
from .XmlObj import XmlObj
class Syllable(XmlObj):
    """A syllable: an ordered list of phones plus tone and language."""

    def __init__(self):
        self.m_phone_list = []
        self.m_tone = None
        self.m_language = None
        self.m_breaklevel = None

    def PronunciationText(self):
        """Return the phones joined by single spaces."""
        return " ".join(str(phone) for phone in self.m_phone_list)

    def PhoneCount(self):
        """Return the number of phones."""
        return len(self.m_phone_list)

    def ToneText(self):
        """Return the tone's ``value`` as a string."""
        return str(self.m_tone.value)

    def Save(self):
        """Not implemented."""
        pass

    def Load(self):
        """Not implemented."""
        pass

    def GetPhoneMeta(
        self, phone_name, word_pos, syll_pos, tone_text, single_syllable_word=False
    ):
        """Render one phone as ``{name$tone<tone>$<syll_pos>$<word_pos>}``.

        ``word_pos`` is first adjusted from the syllable-level value to the
        phone-level value:

        * ``word_begin`` on the last phone of a single-syllable word becomes
          ``word_end``;
        * ``word_begin`` on a phone that is not at the syllable start becomes
          ``word_middle``;
        * ``word_end`` on a phone that is not at the syllable end becomes
          ``word_middle``.
        """
        if word_pos == "word_begin":
            # Special case: word with single syllable, the last phone's
            # word_pos should be "word_end"
            if syll_pos == "s_end" and single_syllable_word:
                word_pos = "word_end"
            elif syll_pos not in ["s_begin", "s_both"]:
                # FIXME: keep accord with Engine logic
                word_pos = "word_middle"
        elif word_pos == "word_end" and syll_pos not in ["s_end", "s_both"]:
            word_pos = "word_middle"

        return "{{{}$tone{}${}${}}}".format(phone_name, tone_text, syll_pos, word_pos)

    def SaveMetafile(self, word_pos, single_syllable_word=False):
        """Return the metafile entries of every phone, joined by single spaces.

        Each phone gets its position inside the syllable: ``s_both`` when the
        syllable has exactly one phone, otherwise ``s_begin`` / ``s_middle`` /
        ``s_end`` by index.
        """
        phones = self.m_phone_list
        phone_total = len(phones)
        final_index = phone_total - 1

        entries = []
        for position, phone in enumerate(phones):
            if phone_total == 1:
                syll_pos = "s_both"
            elif position == 0:
                syll_pos = "s_begin"
            elif position == final_index:
                syll_pos = "s_end"
            else:
                syll_pos = "s_middle"

            # ToneText() stays inside the loop so it is never evaluated for a
            # syllable without phones.
            entries.append(
                self.GetPhoneMeta(
                    phone,
                    word_pos,
                    syll_pos,
                    self.ToneText(),
                    single_syllable_word=single_syllable_word,
                )
            )

        return " ".join(entries)
class SyllableList(XmlObj):
    """A thin wrapper around a list of syllables that can write itself as XML."""

    def __init__(self, syllables):
        self.m_syllable_list = syllables

    def __len__(self):
        return len(self.m_syllable_list)

    def __getitem__(self, index):
        """Return the syllable (or slice of syllables) at ``index``."""
        return self.m_syllable_list[index]

    def __index__(self, index):
        # Kept only for callers that invoke this method explicitly.
        # ``__index__`` is not the subscription hook — ``obj[i]`` is served
        # by ``__getitem__`` above.
        return self[index]

    def PronunciationText(self):
        """Return the syllables' pronunciation texts joined by " - "."""
        return " - ".join(
            [syllable.PronunciationText() for syllable in self.m_syllable_list]
        )

    def ToneText(self):
        """Return the syllables' tone texts concatenated without separator."""
        return "".join([syllable.ToneText() for syllable in self.m_syllable_list])

    def Save(self, parent_node):
        """Append a ``<syllable>`` element with ``<phone>`` and ``<tone>`` children.

        The element's ``syllcount`` attribute is the number of syllables.
        """
        syllable_node = ET.SubElement(parent_node, "syllable")
        syllable_node.set("syllcount", str(len(self.m_syllable_list)))

        phone_node = ET.SubElement(syllable_node, "phone")
        phone_node.text = self.PronunciationText()

        tone_node = ET.SubElement(syllable_node, "tone")
        tone_node.text = self.ToneText()

    def Load(self):
        """Not implemented."""
        pass
import re
import logging
from .utils import NgBreakPattern
from .Syllable import Syllable
from .core_types import Language, Tone, PhoneCVType
class DefaultSyllableFormatter:
    """Placeholder formatter: logs a warning and produces no syllables."""

    def __init__(self):
        pass

    def Format(self, phoneset, pronText, syllable_list):
        """Log a dry-run warning for ``pronText`` and report success.

        ``syllable_list`` is left untouched.
        """
        logging.warning("Using DefaultSyllableFormatter dry run: %s", pronText)
        return True
# Compiled from NgBreakPattern (defined in .utils); the formatters below read
# its "break" group when rewriting a matched pronunciation to "en" + break.
RegexNg2en = re.compile(NgBreakPattern)
# A digit 1-5 immediately followed by "5"; NormalizePron collapses it to "5".
RegexQingSheng = re.compile(r"([1-5]5)")
# Lower-case letters ("Pron") followed by one tone digit 1-6 ("Tone").
RegexPron = re.compile(r"(?P<Pron>[a-z]+)(?P<Tone>[1-6])")
class ZhCNSyllableFormatter:
    """Formats ZhCN pronunciations using a syllable-to-phone map."""

    def __init__(self, sy2ph_map):
        self.m_sy2ph_map = sy2ph_map

    def NormalizePron(self, pronText):
        """Return the normalized form of ``pronText``.

        "6" is replaced by "2", then every ``RegexQingSheng`` match is
        collapsed to "5".  When ``RegexNg2en`` matches, the whole result is
        replaced by "en" plus the match's "break" group.
        """
        # Replace Qing Sheng
        normalized = re.sub(RegexQingSheng, "5", pronText.replace("6", "2"))

        # FIXME(Jin): ng case overrides newPron
        ng_match = RegexNg2en.search(normalized)
        if ng_match:
            normalized = "en" + ng_match.group("break")

        return normalized

    def Format(self, phoneset, pronText, syllable_list):
        """Append the syllable for ``pronText`` to ``syllable_list``.

        Returns:
            True on success.  False (after logging an error) when an argument
            is ``None``, the normalized pronunciation is missing from the map,
            or its map entry does not consist of exactly three phones.
        """
        if phoneset is None or syllable_list is None or pronText is None:
            logging.error("ZhCNSyllableFormatter.Format: invalid input")
            return False

        pronText = self.NormalizePron(pronText)

        if pronText not in self.m_sy2ph_map:
            logging.error(
                "ZhCNSyllableFormatter.Format: syllable to phone map missing key: %s",
                pronText,
            )
            return False

        phone_list = self.m_sy2ph_map[pronText].split(" ")
        if len(phone_list) != 3:
            logging.error(
                "ZhCNSyllableFormatter.Format: invalid pronText: %s", pronText
            )
            return False

        syll = Syllable()
        syll.m_phone_list.extend(phone_list)
        # FIXME(Jin): assume tone is the last char
        syll.m_tone = Tone.parse(pronText[-1])
        syll.m_language = Language.ZhCN
        syllable_list.append(syll)
        return True
class PinYinSyllableFormatter:
    """Build ``Language.PinYin`` syllables from toned pronunciation strings.

    ``sy2ph_map`` maps the toneless pronunciation to a space-separated phone
    string that must contain one or two phones.
    """

    def __init__(self, sy2ph_map):
        self.m_sy2ph_map = sy2ph_map

    def NormalizePron(self, pronText):
        """Return *pronText* with tone digits and the "ng" case normalized.

        Steps, in order: every "6" becomes "2"; every digit 1-5 followed by
        "5" collapses to a single "5"; finally, if RegexNg2en matches, the
        whole result is replaced by "en" + its "break" group.
        """
        normalized = RegexQingSheng.sub("5", pronText.replace("6", "2"))
        # FIXME(Jin): ng case overrides newPron
        ng_match = RegexNg2en.search(normalized)
        if ng_match is None:
            return normalized
        return "en" + ng_match.group("break")

    def Format(self, phoneset, pronText, syllable_list):
        """Append one syllable for *pronText* to *syllable_list*.

        The normalized text is split by RegexPron into a pronunciation and a
        tone digit; the pronunciation is looked up in ``m_sy2ph_map``.
        *phoneset* is only checked for ``None``. On any failure an error is
        logged, nothing is appended and ``False`` is returned.

        Returns:
            ``True`` when a syllable was appended, ``False`` otherwise.
        """
        if any(arg is None for arg in (phoneset, syllable_list, pronText)):
            logging.error("PinYinSyllableFormatter.Format: invalid input")
            return False

        pronText = self.NormalizePron(pronText)
        parsed = RegexPron.search(pronText)
        if parsed is None:
            logging.error(
                "PinYinSyllableFormatter.Format: pronunciation is not valid: %s",
                pronText,
            )
            return False
        pron, tone = parsed.group("Pron", "Tone")

        if pron not in self.m_sy2ph_map:
            logging.error(
                "PinYinSyllableFormatter.Format: syllable to phone map missing key: %s",
                pron,
            )
            return False

        phones = self.m_sy2ph_map[pron].split(" ")
        if len(phones) not in (1, 2):
            logging.error("PinYinSyllableFormatter.Format: invalid phone: %s", pron)
            return False

        syll = Syllable()
        for phone in phones:
            syll.m_phone_list.append(phone)
        syll.m_tone = Tone.parse(tone)
        syll.m_language = Language.PinYin
        syllable_list.append(syll)
        return True
class ZhHKSyllableFormatter:
    """Build ``Language.ZhHK`` syllables from toned pronunciation strings.

    ``sy2ph_map`` maps the toneless pronunciation to a space-separated phone
    string that must contain one or two phones.
    """

    def __init__(self, sy2ph_map):
        self.m_sy2ph_map = sy2ph_map

    def Format(self, phoneset, pronText, syllable_list):
        """Append one syllable for *pronText* to *syllable_list*.

        *pronText* is split by RegexPron into a pronunciation and a tone
        digit; the pronunciation is looked up in ``m_sy2ph_map``. *phoneset*
        is only checked for ``None``. On any failure an error is logged,
        nothing is appended and ``False`` is returned.

        Returns:
            ``True`` when a syllable was appended, ``False`` otherwise.
        """
        if any(arg is None for arg in (phoneset, syllable_list, pronText)):
            logging.error("ZhHKSyllableFormatter.Format: invalid input")
            return False

        parsed = RegexPron.search(pronText)
        if parsed is None:
            logging.error(
                "ZhHKSyllableFormatter.Format: pronunciation is not valid: %s", pronText
            )
            return False
        pron, tone = parsed.group("Pron", "Tone")

        if pron not in self.m_sy2ph_map:
            logging.error(
                "ZhHKSyllableFormatter.Format: syllable to phone map missing key: %s",
                pron,
            )
            return False

        phones = self.m_sy2ph_map[pron].split(" ")
        if len(phones) not in (1, 2):
            logging.error("ZhHKSyllableFormatter.Format: invalid phone: %s", pron)
            return False

        syll = Syllable()
        for phone in phones:
            syll.m_phone_list.append(phone)
        syll.m_tone = Tone.parse(tone)
        syll.m_language = Language.ZhHK
        syllable_list.append(syll)
        return True
class WuuShanghaiSyllableFormatter:
    """Build ``Language.WuuShanghai`` syllables from toned pronunciation strings.

    ``sy2ph_map`` maps the toneless pronunciation to a space-separated phone
    string that must contain one or two phones.
    """

    def __init__(self, sy2ph_map):
        self.m_sy2ph_map = sy2ph_map

    def Format(self, phoneset, pronText, syllable_list):
        """Append one syllable for *pronText* to *syllable_list*.

        *pronText* is split by RegexPron into a pronunciation and a tone
        digit; the pronunciation is looked up in ``m_sy2ph_map``. *phoneset*
        is only checked for ``None``. On any failure an error is logged,
        nothing is appended and ``False`` is returned.

        Returns:
            ``True`` when a syllable was appended, ``False`` otherwise.
        """
        if any(arg is None for arg in (phoneset, syllable_list, pronText)):
            logging.error("WuuShanghaiSyllableFormatter.Format: invalid input")
            return False

        parsed = RegexPron.search(pronText)
        if parsed is None:
            logging.error(
                "WuuShanghaiSyllableFormatter.Format: pronunciation is not valid: %s",
                pronText,
            )
            return False
        pron, tone = parsed.group("Pron", "Tone")

        if pron not in self.m_sy2ph_map:
            logging.error(
                "WuuShanghaiSyllableFormatter.Format: syllable to phone map missing key: %s",
                pron,
            )
            return False

        phones = self.m_sy2ph_map[pron].split(" ")
        if len(phones) not in (1, 2):
            logging.error(
                "WuuShanghaiSyllableFormatter.Format: invalid phone: %s", pron
            )
            return False

        syll = Syllable()
        for phone in phones:
            syll.m_phone_list.append(phone)
        syll.m_tone = Tone.parse(tone)
        syll.m_language = Language.WuuShanghai
        syllable_list.append(syll)
        return True
class SichuanSyllableFormatter:
    """Build ``Language.Sichuan`` syllables from toned pronunciation strings.

    ``sy2ph_map`` maps the toneless pronunciation to a space-separated phone
    string that must contain one or two phones.
    """

    def __init__(self, sy2ph_map):
        self.m_sy2ph_map = sy2ph_map

    def Format(self, phoneset, pronText, syllable_list):
        """Append one syllable for *pronText* to *syllable_list*.

        *pronText* is split by RegexPron into a pronunciation and a tone
        digit; the pronunciation is looked up in ``m_sy2ph_map``. *phoneset*
        is only checked for ``None``. On any failure an error is logged,
        nothing is appended and ``False`` is returned.

        Returns:
            ``True`` when a syllable was appended, ``False`` otherwise.
        """
        if any(arg is None for arg in (phoneset, syllable_list, pronText)):
            logging.error("SichuanSyllableFormatter.Format: invalid input")
            return False

        parsed = RegexPron.search(pronText)
        if parsed is None:
            logging.error(
                "SichuanSyllableFormatter.Format: pronunciation is not valid: %s",
                pronText,
            )
            return False
        pron, tone = parsed.group("Pron", "Tone")

        if pron not in self.m_sy2ph_map:
            logging.error(
                "SichuanSyllableFormatter.Format: syllable to phone map missing key: %s",
                pron,
            )
            return False

        phones = self.m_sy2ph_map[pron].split(" ")
        if len(phones) not in (1, 2):
            logging.error(
                "SichuanSyllableFormatter.Format: invalid phone: %s", pron
            )
            return False

        syll = Syllable()
        for phone in phones:
            syll.m_phone_list.append(phone)
        syll.m_tone = Tone.parse(tone)
        syll.m_language = Language.Sichuan
        syllable_list.append(syll)
        return True
class EnXXSyllableFormatter:
    """Build syllables for *language* from "."/"#"-separated phone strings.

    ``m_f2t_map`` starts as ``None``; when set, it is consulted with ``.get``
    to replace one phone name by a list of phone names.
    """

    def __init__(self, language):
        self.m_f2t_map = None
        self.m_language = language

    def NormalizePron(self, pronText):
        """Return *pronText* after a fixed sequence of literal replacements.

        Applied strictly in this order: "#" -> ".", "03" -> "0", "13" -> "1",
        "23" -> "2", "3" -> "" and finally "2" -> "0".
        """
        replacements = (
            ("#", "."),
            ("03", "0"),
            ("13", "1"),
            ("23", "2"),
            ("3", ""),
            ("2", "0"),
        )
        normalized = pronText
        for old, new in replacements:
            normalized = normalized.replace(old, new)
        return normalized

    def Format(self, phoneset, pronText, syllable_list):
        """Append one syllable per "."-separated chunk of *pronText*.

        Each chunk is split on whitespace into phone tokens. A token that
        contains "0", "1" or "2" has its last character taken as the tone and
        dropped from the name. Names are optionally expanded through
        ``m_f2t_map`` and then resolved in ``phoneset.m_name_map``.

        Returns:
            ``True`` when every phone was resolved; ``False`` (after logging)
            on ``None`` input or an unknown phone. Syllables appended before
            an unknown phone is met stay in *syllable_list*.
        """
        if any(arg is None for arg in (phoneset, syllable_list, pronText)):
            logging.error("EnXXSyllableFormatter.Format: invalid input")
            return False

        for chunk in self.NormalizePron(pronText).split("."):
            syll = Syllable()
            syll.m_language = self.m_language
            syll.m_tone = Tone.parse("0")

            tokens = re.split(r"[\s]+", chunk.strip())
            last_index = len(tokens) - 1
            for index, token in enumerate(tokens):
                name = token.lower()
                tone_name = "0"
                if any(digit in name for digit in "012"):
                    tone_name = name[-1]
                    name = name[:-1]

                mapped = None
                if self.m_f2t_map is not None:
                    mapped = self.m_f2t_map.get(name, None)
                targets = [name] if mapped is None else mapped

                for target in targets:
                    phone_obj = phoneset.m_name_map.get(target, None)
                    if phone_obj is None:
                        logging.error(
                            "EnXXSyllableFormatter.Format: phone %s not found",
                            target,
                        )
                        return False
                    # NOTE(review): phone_obj is the object stored in
                    # phoneset.m_name_map, so the two attribute writes below
                    # are visible to every other user of that phoneset entry.
                    phone_obj.m_name = target
                    syll.m_phone_list.append(phone_obj)
                    if phone_obj.m_cv_type == PhoneCVType.Vowel:
                        syll.m_tone = Tone.parse(tone_name)
                    # NOTE(review): nesting inferred from de-indented source;
                    # confirm this belongs inside the innermost loop.
                    if index == last_index:
                        phone_obj.m_bnd = True
            syllable_list.append(syll)
        return True
class XmlObj:
    """Interface stub: every method is a no-op that returns ``None``."""

    def __init__(self):
        """Create the object; no state is set up."""

    def Load(self):
        """No-op placeholder."""

    def Save(self):
        """No-op placeholder."""

    def LoadData(self):
        """No-op placeholder."""

    def SaveData(self):
        """No-op placeholder."""
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment