Commit b309ea1b authored by chenzk's avatar chenzk
Browse files

v1.0

parents
import os
import re
from dataclasses import dataclass
import concurrent.futures
from tqdm.auto import tqdm
import openpyxl # use to open excel. run ! pip install openpyxl
# download_link: https://www.bilibili.com/read/cv23965717
@dataclass
class DataConfig:
    """Paths used to build the English Genshin filelist.

    The attributes are annotated so that ``@dataclass`` registers them as real
    fields: they show up in ``repr`` and can be overridden through the
    constructor, e.g. ``DataConfig(output_filelist_path='x.txt')``.
    """
    # Directory that holds one sub-directory of audio files per NPC sheet.
    dataset_path: str = './raw_datasets/Genshin_chinese4.5/原神语音包4.5(英)'
    # Workbook mapping audio file names to their transcripts.
    excel_path: str = './raw_datasets/Genshin_chinese4.5/原神4.5语音包对应文本(英).xlsx'
    # Destination of the generated "audio_path|text" filelist.
    output_filelist_path: str = './filelists/genshin_en.txt'


# Transcripts containing any of these strings almost never match their audio.
FORBIDDEN_TEXTS = ["……", "{NICKNAME}", "#", "(", ")", "♪", "test", "{0}", "█", "*", "█", "+", "Gohus"]
# Substrings rewritten (old -> new) in transcripts that pass the filter.
REPLACEMENTS = {"$UNRELEASED": ""}
escaped_forbidden_texts = [re.escape(text) for text in FORBIDDEN_TEXTS]
# Single alternation that matches any forbidden string literally.
pattern = re.compile("|".join(escaped_forbidden_texts))
data_config = DataConfig()


def clean_text(text):
    """Filter and normalise one transcript.

    Args:
        text: Raw transcript string.

    Returns:
        ``None`` when the transcript contains a forbidden string, otherwise the
        transcript with every ``REPLACEMENTS`` substitution applied.
    """
    cleaned_text = text
    if pattern.search(cleaned_text):
        return None
    for old, new in REPLACEMENTS.items():
        cleaned_text = cleaned_text.replace(old, new)
    # Return the substituted text; returning ``text`` here would silently
    # discard every replacement made above.
    return cleaned_text
def read_excel(excel):
    """Open the transcript workbook and read the index kept on its first sheet.

    Args:
        excel: Path of the ``.xlsx`` workbook.

    Returns:
        Tuple ``(workbook, npc_names, npc_audio_number)`` where the two lists
        are the non-empty values of columns B and C of the first sheet with the
        first non-empty value (the header) dropped.
    """
    workbook = openpyxl.load_workbook(excel)
    index_sheet = workbook[workbook.sheetnames[0]]

    def column_values(letter):
        # Keep truthy cell values only, then skip the leading header entry.
        values = []
        for cell in index_sheet[letter]:
            if cell.value:
                values.append(cell.value)
        return values[1:]

    names = column_values('B')
    counts = column_values('C')
    return workbook, names, counts
def process_filelist(data):
    """Build one filelist line for an ``(audio_name, text, npc_dir)`` triple.

    Returns:
        ``'<absolute audio path>|<cleaned text>\\n'``, or ``None`` when the
        audio file does not exist or the text is rejected by ``clean_text``.
    """
    audio_name, raw_text, npc_dir = data
    full_audio_path = os.path.abspath(os.path.join(npc_dir, audio_name))
    # Guard clauses: bail out early instead of nesting the happy path.
    if not os.path.exists(full_audio_path):
        return None
    cleaned = clean_text(raw_text)
    if cleaned is None:
        return None
    return f'{full_audio_path}|{cleaned}\n'
if __name__ == '__main__':
    # Entry point: collect (audio, text) pairs from every NPC sheet, validate
    # them in worker processes and write the surviving lines to the filelist.
    wb, npc_names, npc_audio_number = read_excel(data_config.excel_path)
    datas_list = []
    results = []
    for npc_name in tqdm(npc_names):
        sheet = wb[npc_name]
        npc_path = os.path.join(data_config.dataset_path, npc_name)
        # Pair column C (audio name) with column D (text) row by row BEFORE
        # filtering. Filtering the two columns independently shifts every
        # later pair as soon as a single cell is blank.
        paired_rows = [
            (audio_cell.value, text_cell.value)
            for audio_cell, text_cell in zip(sheet['C'], sheet['D'])
            if audio_cell.value and text_cell.value
        ]
        # The first complete row is the header.
        datas_list.extend(
            (audio_name, text, npc_path) for audio_name, text in paired_rows[1:]
        )
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(process_filelist, data) for data in datas_list]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(datas_list)):
            result = future.result()
            if result is not None:
                results.append(result)
    # Make sure the parent directory exists; failing at the very last step
    # would waste the whole run. A bare file name has no directory part and
    # os.makedirs('') would raise, so only create it when there is one.
    output_dir = os.path.dirname(data_config.output_filelist_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
        f.writelines(results)
\ No newline at end of file
import os
import re
from dataclasses import dataclass
import concurrent.futures
from tqdm.auto import tqdm
import openpyxl # use to open excel. run ! pip install openpyxl
# download_link: https://www.bilibili.com/read/cv23965717
@dataclass
class DataConfig:
    """Paths used to build the Chinese Genshin filelist.

    The attributes are annotated so that ``@dataclass`` registers them as real
    fields: they show up in ``repr`` and can be overridden through the
    constructor, e.g. ``DataConfig(output_filelist_path='x.txt')``.
    """
    # Directory that holds one sub-directory of audio files per NPC sheet.
    dataset_path: str = './raw_datasets/Genshin_chinese4.5/原神语音包4.5(中)'
    # Workbook mapping audio file names to their transcripts.
    excel_path: str = './raw_datasets/Genshin_chinese4.5/原神4.5语音包对应文本(中).xlsx'
    # Destination of the generated "audio_path|text" filelist.
    output_filelist_path: str = './filelists/genshin_zh.txt'


# Transcripts containing any of these strings almost never match their audio.
FORBIDDEN_TEXTS = ["……", "{NICKNAME}", "#", "(", ")", "♪", "test", "{0}", "█", "*", "█", "+", "Gohus"]
# Substrings rewritten (old -> new) in transcripts that pass the filters.
REPLACEMENTS = {"$UNRELEASED": ""}
escaped_forbidden_texts = [re.escape(text) for text in FORBIDDEN_TEXTS]
# Single alternation that matches any forbidden string literally.
pattern = re.compile("|".join(escaped_forbidden_texts))
data_config = DataConfig()


def clean_text(text):
    """Filter and normalise one transcript.

    Args:
        text: Raw transcript string.

    Returns:
        ``None`` when the transcript contains an ASCII letter or digit, or a
        forbidden string; otherwise the transcript with every ``REPLACEMENTS``
        substitution applied.
    """
    cleaned_text = text
    # Drop every line that contains Latin letters or digits.
    if re.search(r'[A-Za-z0-9]', cleaned_text):
        return None
    if pattern.search(cleaned_text):
        return None
    for old, new in REPLACEMENTS.items():
        cleaned_text = cleaned_text.replace(old, new)
    # Return the substituted text; returning ``text`` here would silently
    # discard every replacement made above.
    return cleaned_text
def read_excel(excel):
    """Open the transcript workbook and read the index kept on its first sheet.

    Args:
        excel: Path of the ``.xlsx`` workbook.

    Returns:
        Tuple ``(workbook, npc_names, npc_audio_number)`` where the two lists
        are the non-empty values of columns B and C of the first sheet with the
        first non-empty value (the header) dropped.
    """
    workbook = openpyxl.load_workbook(excel)
    index_sheet = workbook[workbook.sheetnames[0]]

    def column_values(letter):
        # Keep truthy cell values only, then skip the leading header entry.
        values = []
        for cell in index_sheet[letter]:
            if cell.value:
                values.append(cell.value)
        return values[1:]

    names = column_values('B')
    counts = column_values('C')
    return workbook, names, counts
def process_filelist(data):
    """Build one filelist line for an ``(audio_name, text, npc_dir)`` triple.

    Returns:
        ``'<absolute audio path>|<cleaned text>\\n'``, or ``None`` when the
        audio file does not exist or the text is rejected by ``clean_text``.
    """
    audio_name, raw_text, npc_dir = data
    full_audio_path = os.path.abspath(os.path.join(npc_dir, audio_name))
    # Guard clauses: bail out early instead of nesting the happy path.
    if not os.path.exists(full_audio_path):
        return None
    cleaned = clean_text(raw_text)
    if cleaned is None:
        return None
    return f'{full_audio_path}|{cleaned}\n'
if __name__ == '__main__':
    # Entry point: collect (audio, text) pairs from every NPC sheet, validate
    # them in worker processes and write the surviving lines to the filelist.
    wb, npc_names, npc_audio_number = read_excel(data_config.excel_path)
    datas_list = []
    results = []
    for npc_name in tqdm(npc_names):
        sheet = wb[npc_name]
        npc_path = os.path.join(data_config.dataset_path, npc_name)
        # Pair column C (audio name) with column D (text) row by row BEFORE
        # filtering. Filtering the two columns independently shifts every
        # later pair as soon as a single cell is blank.
        paired_rows = [
            (audio_cell.value, text_cell.value)
            for audio_cell, text_cell in zip(sheet['C'], sheet['D'])
            if audio_cell.value and text_cell.value
        ]
        # The first complete row is the header.
        datas_list.extend(
            (audio_name, text, npc_path) for audio_name, text in paired_rows[1:]
        )
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(process_filelist, data) for data in datas_list]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(datas_list)):
            result = future.result()
            if result is not None:
                results.append(result)
    # Make sure the parent directory exists; failing at the very last step
    # would waste the whole run. A bare file name has no directory part and
    # os.makedirs('') would raise, so only create it when there is one.
    output_dir = os.path.dirname(data_config.output_filelist_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
        f.writelines(results)
\ No newline at end of file
import os
import json
from pathlib import Path
from dataclasses import dataclass
import concurrent.futures
from tqdm.auto import tqdm
# download_link: https://www.openslr.org/109/
@dataclass
class DataConfig:
    """Paths used to build the Hi-Fi TTS filelist.

    The attributes are annotated so that ``@dataclass`` registers them as real
    fields that can be overridden through the constructor.
    """
    # Dataset root; manifest ``audio_filepath`` entries are joined onto it.
    dataset_path: str = './raw_datasets/hi_fi_tts_v0'
    # Destination of the generated "audio_path|text" filelist.
    output_filelist_path: str = './filelists/hifi_tts.txt'


data_config = DataConfig()


def process_filelist(speaker):
    """Convert one JSON-lines manifest into filelist lines.

    Args:
        speaker: Path of a manifest whose lines are JSON objects with
            ``audio_filepath`` and ``text_normalized`` keys.

    Returns:
        List of ``'<absolute audio path>|<text>\\n'`` strings, one per manifest
        entry whose audio file exists on disk.
    """
    filelist = []
    with open(speaker, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank (e.g. trailing) line is not valid JSON; skip it instead
            # of letting json.loads abort the whole manifest.
            if not line:
                continue
            entry = json.loads(line)
            audio_path = os.path.abspath(os.path.join(data_config.dataset_path, entry['audio_filepath']))
            text = entry['text_normalized']
            if os.path.exists(audio_path):
                filelist.append(f'{audio_path}|{text}\n')
    return filelist
if __name__ == '__main__':
    # Entry point: turn every manifest found under the dataset root into
    # filelist lines and write them to the output file.
    results = []
    dataset_path = Path(data_config.dataset_path)
    speakers = list(dataset_path.rglob('*.json'))
    with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
        futures = [executor.submit(process_filelist, speaker) for speaker in speakers]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(speakers)):
            result = future.result()
            if result is not None:
                results.extend(result)
    # Make sure the parent directory exists; failing at the very last step
    # would waste the whole run. A bare file name has no directory part and
    # os.makedirs('') would raise, so only create it when there is one.
    output_dir = os.path.dirname(data_config.output_filelist_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
        f.writelines(results)
\ No newline at end of file
import os
from pathlib import Path
from dataclasses import dataclass
import concurrent.futures
from tqdm.auto import tqdm
# download_link: https://openslr.org/60/
@dataclass
class DataConfig:
    """Paths used to build the LibriTTS filelist.

    The attributes are annotated so that ``@dataclass`` registers them as real
    fields that can be overridden through the constructor.
    """
    # Directory searched recursively for ``*.wav`` files.
    dataset_path: str = './raw_datasets/LibriTTS/train-other-500'
    # Destination of the generated "audio_path|text" filelist.
    output_filelist_path: str = './filelists/libri_tts.txt'


data_config = DataConfig()
def process_filelist(wav_path: Path):
    """Build the filelist line for one wav file.

    The transcript is read from the sibling file whose suffix is
    ``.normalized.txt`` instead of ``.wav``.

    Returns:
        ``'<posix wav path>|<stripped transcript>\\n'``, or ``None`` when the
        transcript file does not exist.
    """
    transcript_path = wav_path.with_suffix('.normalized.txt')
    if not transcript_path.exists():
        return None
    with open(transcript_path, 'r', encoding='utf-8') as handle:
        transcript = handle.read().strip()
    return f'{wav_path.as_posix()}|{transcript}\n'
if __name__ == '__main__':
    # Entry point: pair every wav under the dataset root with its transcript
    # in worker processes and write the resulting lines to the output file.
    results = []
    dataset_path = Path(data_config.dataset_path)
    waves = list(dataset_path.rglob('*.wav'))
    with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(process_filelist, wav_path) for wav_path in waves]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(waves)):
            result = future.result()
            if result is not None:
                results.append(result)
    # Make sure the parent directory exists; failing at the very last step
    # would waste the whole run. A bare file name has no directory part and
    # os.makedirs('') would raise, so only create it when there is one.
    output_dir = os.path.dirname(data_config.output_filelist_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
        f.writelines(results)
\ No newline at end of file
Copyright (c) 2017 Keith Ito
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
""" from https://github.com/keithito/tacotron """
from text import cleaners
from text.symbols import symbols
# Lookup tables between symbols and their integer IDs (position in `symbols`).
_symbol_to_id = dict((symbol, index) for index, symbol in enumerate(symbols))
_id_to_symbol = dict(enumerate(symbols))
def text_to_sequence(text, symbols, cleaner_names):
    '''Clean a text and map each of its characters to a symbol ID.

    The cleaned text and the lengths before/after the mapping are printed.
    Args:
      text: string to convert to a sequence
      symbols: iterable of symbols; a symbol's ID is its position in it
      cleaner_names: names of the cleaner functions to run the text through
    Returns:
      List of integers, one per cleaned character that is a known symbol
      (unknown characters are skipped)
    '''
    lookup = {}
    for index, known_symbol in enumerate(symbols):
        lookup[known_symbol] = index
    cleaned = _clean_text(text, cleaner_names)
    print(cleaned)
    print(f" length:{len(cleaned)}")
    sequence = [lookup[char] for char in cleaned if char in lookup]
    print(f" length:{len(sequence)}")
    return sequence
def cleaned_text_to_sequence(cleaned_text):
    '''Map every character of an already-cleaned text to its symbol ID.

    Args:
      cleaned_text: string whose characters are looked up one by one
    Returns:
      List of integer IDs; characters that are not known symbols are skipped
    '''
    sequence = []
    for char in cleaned_text:
        if char in _symbol_to_id:
            sequence.append(_symbol_to_id[char])
    return sequence
def cleaned_text_to_sequence_chinese(cleaned_text):
    '''Map every space-separated token of a cleaned text to its symbol ID.

    Args:
      cleaned_text: string of tokens separated by single spaces
    Returns:
      List of integer IDs; tokens that are not known symbols are skipped
    '''
    sequence = []
    for token in cleaned_text.split(' '):
        if token in _symbol_to_id:
            sequence.append(_symbol_to_id[token])
    return sequence
def sequence_to_text(sequence):
    '''Concatenate the symbols that a sequence of IDs stands for.'''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
def _clean_text(text, cleaner_names):
    '''Run the text through the named cleaner functions, in order.

    Args:
      text: string to clean
      cleaner_names: names of functions defined in the `cleaners` module
    Returns:
      The text after every cleaner has been applied
    Raises:
      Exception: if a name does not refer to a cleaner
    '''
    for name in cleaner_names:
        # Pass a default so that a missing name reaches the explicit check
        # below; a bare getattr raises AttributeError first and makes the
        # "Unknown cleaner" branch unreachable.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
import re
import string
import numpy as np
from .langdetect import detect, LangDetectException
from text.english import english_to_ipa2
from text.mandarin import chinese_to_cnm3
from text.japanese import japanese_to_ipa2
# Integer id assigned to each language tag; "PAD" is 0.
language_module_map = {"PAD":0, "ZH": 1, "EN": 2, "JA": 3}
# Precompiled regular expressions
# One character from the CJK ideograph blocks or the CJK punctuation block (U+3000-U+303F).
ZH_PATTERN = re.compile(r'[\u3400-\u4DBF\u4e00-\u9FFF\uF900-\uFAFF\u3000-\u303F]')
# A run of ASCII letters and punctuation.
# NOTE(review): inside the character class `*-_` is parsed as a range from '*' to '_',
# which also covers the digits 0-9 — confirm whether that is intended.
EN_PATTERN = re.compile(r'[a-zA-Z.,!?\'"(){}[\]<>:;@#$%^&*-_+=/\\|~`]+')
# One character from the kana blocks, U+4E00-U+9FAF, U+FF00-U+FFEF or the CJK punctuation block.
JP_PATTERN = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF\u31F0-\u31FF\uFF00-\uFFEF\u3000-\u303F]')
# Literal language tags "[ZH]", "[EN]" and "[JA]".
CLEANER_PATTERN = re.compile(r'\[(ZH|EN|JA)\]')
def detect_language(text: str, prev_lang=None):
    """
    Detect the language of the given text.

    The patterns are tried in the order Chinese, English, Japanese, so a
    character matched by several of them gets the first matching tag.
    :param text: input text
    :param prev_lang: previously detected language
    :return: 'ZH' for Chinese, 'EN' for English, 'JA' for Japanese, prev_lang
             for whitespace-only text, otherwise None
    """
    ordered_checks = (
        ('ZH', ZH_PATTERN),
        ('EN', EN_PATTERN),
        ('JA', JP_PATTERN),
    )
    for tag, matcher in ordered_checks:
        if matcher.search(text):
            return tag
    # Whitespace inherits the language of what came before it.
    return prev_lang if text.isspace() else None
def replace_substring(s, start_index, end_index, replacement):
    """Return `s` with the slice [start_index:end_index] replaced by `replacement`."""
    head = s[:start_index]
    tail = s[end_index:]
    return head + replacement + tail
def replace_sublist(lst, start_index, end_index, replacement_list):
    """Replace lst[start_index:end_index] with `replacement_list`, in place."""
    lst[slice(start_index, end_index)] = replacement_list
# Convert a matched text span and record one language tag per output character.
def append_tags_and_convert(match, conversion_func, tag_value, tags):
    """Convert group 1 of `match` and extend `tags` to cover the result.

    :param match: regex match whose first group is the text to convert
    :param conversion_func: callable applied to that text
    :param tag_value: value appended to `tags` once per converted character
    :param tags: list extended in place
    :return: the converted text followed by a single space
    """
    converted = conversion_func(match.group(1))
    for _ in range(len(converted)):
        tags.append(tag_value)
    return f'{converted} '
# Auto-detect the language of each run of characters using the regex patterns.
def cjke_cleaners4(text: str):
    """
    Split the text into single-language runs and convert each run.

    Language tags ("[ZH]", "[EN]", "[JA]") are removed first. Runs are then
    converted with chinese_to_cnm3, japanese_to_ipa2 or english_to_ipa2;
    runs whose language cannot be detected are dropped.
    :param text: input text
    :return: the converted text with trailing whitespace removed and a final
             '.' appended when it does not already end in punctuation;
             '' for empty input
    """
    text = CLEANER_PATTERN.sub('', text)
    # Nothing left to convert; indexing text[0] below would raise IndexError.
    if not text:
        return ''
    converters = {
        'ZH': chinese_to_cnm3,
        'JA': japanese_to_ipa2,
        'EN': english_to_ipa2,
    }
    text_length = len(text)
    pointer = 0
    pieces = []
    current_language = detect_language(text[pointer])
    while pointer < text_length:
        run_start = pointer
        # Extend the run while characters keep the current language
        # (whitespace inherits it through the prev_lang argument).
        while pointer < text_length and detect_language(text[pointer], current_language) == current_language:
            pointer += 1
        convert = converters.get(current_language)
        if convert is not None:
            pieces.append(convert(text[run_start:pointer]))
        if pointer < text_length:
            current_language = detect_language(text[pointer])
    output = ''.join(pieces)
    output = re.sub(r'\s+$', '', output)
    # Terminate the utterance with '.' unless it already ends in punctuation.
    output = re.sub(r'([^\.,!\?\-…~])$', r'\1.', output)
    return output
__version__ = "0.5.20"
from .cn2an import Cn2An
from .an2cn import An2Cn
from .transform import Transform

# Package-level convenience callables, each bound to one shared converter
# instance created at import time.
cn2an = Cn2An().cn2an
an2cn = An2Cn().an2cn
transform = Transform().transform

# Names exported by `from <package> import *`.
__all__ = ["__version__", "cn2an", "an2cn", "transform"]
from typing import Union
#from proces import preprocess
from .conf import NUMBER_LOW_AN2CN, NUMBER_UP_AN2CN, UNIT_LOW_ORDER_AN2CN, UNIT_UP_ORDER_AN2CN
class An2Cn(object):
    """Convert Arabic numerals (str, int or float) to Chinese numerals."""

    def __init__(self) -> None:
        self.all_num = "0123456789"
        self.number_low = NUMBER_LOW_AN2CN
        self.number_up = NUMBER_UP_AN2CN
        self.mode_list = ["low", "up", "rmb", "direct"]

    def an2cn(self, inputs: Union[str, int, float] = None, mode: str = "low") -> str:
        """Convert an Arabic number to Chinese numerals.

        :param inputs: the Arabic number, as str, int or float
        :param mode: "low" lower-case numerals, "up" upper-case (financial)
                     numerals, "rmb" upper-case RMB amount, "direct"
                     digit-by-digit conversion
        :return: the Chinese numeral string
        :raises ValueError: for empty input, an unsupported mode, characters
                            other than digits, "." and "-", or more than one "."
        """
        if inputs is not None and inputs != "":
            if mode not in self.mode_list:
                raise ValueError(f"mode 仅支持 {str(self.mode_list)} !")
            # Numbers are turned into strings; note that Python normalises
            # them on the way: 1. -> 1.0, 1.00 -> 1.0, -0 -> 0
            if not isinstance(inputs, str):
                inputs = self.__number_to_string(inputs)
            # Validate the characters of the input.
            self.__check_inputs_is_valid(inputs)
            # Split off the sign.
            if inputs[0] == "-":
                sign = "负"
                inputs = inputs[1:]
            else:
                sign = ""
            if mode == "direct":
                output = self.__direct_convert(inputs)
            else:
                # Separate the integer part from the decimal part.
                split_result = inputs.split(".")
                len_split_result = len(split_result)
                if len_split_result == 1:
                    # Input without a decimal part.
                    integer_data = split_result[0]
                    if mode == "rmb":
                        output = self.__integer_convert(integer_data, "up") + "元整"
                    else:
                        output = self.__integer_convert(integer_data, mode)
                elif len_split_result == 2:
                    # Input with a decimal part.
                    integer_data, decimal_data = split_result
                    if mode == "rmb":
                        int_data = self.__integer_convert(integer_data, "up")
                        # dec_data is "点" followed by one numeral per digit,
                        # so index 1 is the jiao digit and index 2 the fen digit.
                        dec_data = self.__decimal_convert(decimal_data, "up")
                        len_dec_data = len(dec_data)
                        if len_dec_data == 0:
                            output = int_data + "元整"
                        elif len_dec_data == 1:
                            raise ValueError(f"异常输出:{dec_data}")
                        elif len_dec_data == 2:
                            if dec_data[1] != "零":
                                if int_data == "零":
                                    output = dec_data[1] + "角"
                                else:
                                    output = int_data + "元" + dec_data[1] + "角"
                            else:
                                output = int_data + "元整"
                        else:
                            if dec_data[1] != "零":
                                if dec_data[2] != "零":
                                    if int_data == "零":
                                        output = dec_data[1] + "角" + dec_data[2] + "分"
                                    else:
                                        output = int_data + "元" + dec_data[1] + "角" + dec_data[2] + "分"
                                else:
                                    if int_data == "零":
                                        output = dec_data[1] + "角"
                                    else:
                                        output = int_data + "元" + dec_data[1] + "角"
                            else:
                                if dec_data[2] != "零":
                                    if int_data == "零":
                                        output = dec_data[2] + "分"
                                    else:
                                        output = int_data + "元" + "零" + dec_data[2] + "分"
                                else:
                                    output = int_data + "元整"
                    else:
                        output = self.__integer_convert(integer_data, mode) + self.__decimal_convert(decimal_data, mode)
                else:
                    raise ValueError(f"输入格式错误:{inputs}!")
        else:
            raise ValueError("输入数据为空!")
        return sign + output

    def __direct_convert(self, inputs: str) -> str:
        """Convert digit by digit, writing "." as "点"."""
        _output = ""
        for d in inputs:
            if d == ".":
                _output += "点"
            else:
                _output += self.number_low[int(d)]
        return _output

    @staticmethod
    def __number_to_string(number_data: Union[int, float]) -> str:
        """Return the plain positional decimal string of a number.

        ``str`` switches to scientific notation for small or large floats
        (``str(0.00005) == "5e-05"``). Such values are expanded through
        ``Decimal`` so that a multi-digit mantissa (``1.5e-05``) or a negative
        value (``-5e-05``) is expanded correctly as well.
        """
        string_data = str(number_data)
        if "e" in string_data:
            from decimal import Decimal
            string_data = format(Decimal(string_data), "f")
        return string_data

    def __check_inputs_is_valid(self, check_data: str) -> None:
        """Raise ValueError unless every character is a digit, "." or "-"."""
        all_check_keys = self.all_num + ".-"
        for data in check_data:
            if data not in all_check_keys:
                raise ValueError(f"输入的数据不在转化范围内:{data}!")

    def __integer_convert(self, integer_data: str, mode: str) -> str:
        """Convert the integer part using the "low" or "up" numeral set."""
        if mode == "low":
            numeral_list = NUMBER_LOW_AN2CN
            unit_list = UNIT_LOW_ORDER_AN2CN
        elif mode == "up":
            numeral_list = NUMBER_UP_AN2CN
            unit_list = UNIT_UP_ORDER_AN2CN
        else:
            raise ValueError(f"error mode: {mode}")
        # Strip leading zeros, e.g. 007 => 7
        integer_data = str(int(integer_data))
        len_integer_data = len(integer_data)
        if len_integer_data > len(unit_list):
            raise ValueError(f"超出数据范围,最长支持 {len(unit_list)} 位")
        output_an = ""
        for i, d in enumerate(integer_data):
            if int(d):
                output_an += numeral_list[int(d)] + unit_list[len_integer_data - i - 1]
            else:
                # A zero at a 4-digit group boundary still carries the group unit.
                if not (len_integer_data - i - 1) % 4:
                    output_an += numeral_list[int(d)] + unit_list[len_integer_data - i - 1]
                if i > 0 and not output_an[-1] == "零":
                    output_an += numeral_list[int(d)]
        # Collapse the redundant zeros and units produced above.
        output_an = output_an.replace("零零", "零").replace("零万", "万").replace("零亿", "亿").replace("亿万", "亿") \
            .strip("零")
        # Drop the leading "一" of "一十..." (10-19).
        if output_an[:2] in ["一十"]:
            output_an = output_an[1:]
        # Integer part of a number between 0 and 1.
        if not output_an:
            output_an = "零"
        return output_an

    def __decimal_convert(self, decimal_data: str, o_mode: str) -> str:
        """Convert the decimal digits; returns "点" plus one numeral per digit.

        Returns "" for an empty decimal part. More than 16 digits are
        truncated to 16 and a notice is printed.
        """
        len_decimal_data = len(decimal_data)
        if len_decimal_data > 16:
            print(f"注意:小数部分长度为 {len_decimal_data} ,将自动截取前 16 位有效精度!")
            decimal_data = decimal_data[:16]
        if len_decimal_data:
            output_an = "点"
        else:
            output_an = ""
        if o_mode == "low":
            numeral_list = NUMBER_LOW_AN2CN
        elif o_mode == "up":
            numeral_list = NUMBER_UP_AN2CN
        else:
            raise ValueError(f"error mode: {o_mode}")
        for data in decimal_data:
            output_an += numeral_list[int(data)]
        return output_an
import re
from typing import Union
#from proces import preprocess
from .an2cn import An2Cn
from .conf import NUMBER_CN2AN, UNIT_CN2AN, STRICT_CN_NUMBER, NORMAL_CN_NUMBER, NUMBER_LOW_AN2CN, UNIT_LOW_AN2CN
class Cn2An(object):
    """Convert Chinese numerals (optionally mixed with Arabic digits) to numbers."""

    def __init__(self) -> None:
        self.all_num = "".join(list(NUMBER_CN2AN.keys()))
        self.all_unit = "".join(list(UNIT_CN2AN.keys()))
        self.strict_cn_number = STRICT_CN_NUMBER
        self.normal_cn_number = NORMAL_CN_NUMBER
        # Characters accepted by each mode.
        self.check_key_dict = {
            "strict": "".join(self.strict_cn_number.values()) + "点负",
            "normal": "".join(self.normal_cn_number.values()) + "点负",
            "smart": "".join(self.normal_cn_number.values()) + "点负" + "01234567890.-"
        }
        self.pattern_dict = self.__get_pattern()
        self.ac = An2Cn()
        self.mode_list = ["strict", "normal", "smart"]
        # yuan / jiao / fen amounts
        self.yjf_pattern = re.compile(fr"^.*?[元圆][{self.all_num}]角([{self.all_num}]分)?$")
        # Arabic number optionally followed by one Chinese unit
        self.pattern1 = re.compile(fr"^-?\d+(\.\d+)?[{self.all_unit}]?$")
        self.ptn_all_num = re.compile(f"^[{self.all_num}]+$")
        # "十?" is for special case "十一万三"
        self.ptn_speaking_mode = re.compile(f"^([{self.all_num}]{{0,2}}[{self.all_unit}])+[{self.all_num}]$")

    def cn2an(self, inputs: Union[str, int, float] = None, mode: str = "strict") -> Union[float, int]:
        """Convert Chinese numerals to an Arabic number.

        :param inputs: Chinese numerals, Arabic numerals, or a mix of both
        :param mode: "strict", "normal" or "smart"
        :return: the number as int or float
        :raises ValueError: for empty input (None or ""), an unsupported mode
                            or data that does not match the expected format
        """
        # Reject both None and "" up front; an empty string would otherwise
        # fail later with an IndexError instead of this explicit error.
        if inputs is None or inputs == "":
            raise ValueError("输入数据为空!")
        if mode not in self.mode_list:
            raise ValueError(f"mode 仅支持 {str(self.mode_list)} !")
        # Turn numbers into strings.
        if not isinstance(inputs, str):
            inputs = str(inputs)
        # Special case: 廿 means twenty.
        inputs = inputs.replace("廿", "二十")
        # Validate the input and split it into its parts.
        sign, integer_data, decimal_data, is_all_num = self.__check_input_data_is_valid(inputs, mode)
        # Smart-mode shortcut: sign 0 means integer_data already holds the result.
        if sign == 0:
            return integer_data
        if not is_all_num:
            if decimal_data is None:
                output = self.__integer_convert(integer_data)
            else:
                output = self.__integer_convert(integer_data) + self.__decimal_convert(decimal_data)
                # fix 1 + 0.57 = 1.5699999999999998
                output = round(output, len(decimal_data))
        else:
            if decimal_data is None:
                output = self.__direct_convert(integer_data)
            else:
                output = self.__direct_convert(integer_data) + self.__decimal_convert(decimal_data)
                # fix 1 + 0.57 = 1.5699999999999998
                output = round(output, len(decimal_data))
        return sign * output

    def __get_pattern(self) -> dict:
        """Build the compiled integer/decimal validation patterns per mode."""
        # Strict integer grammar, composed from small to large ranges.
        _0 = "[零]"
        _1_9 = "[一二三四五六七八九]"
        _10_99 = f"{_1_9}?[十]{_1_9}?"
        _1_99 = f"({_10_99}|{_1_9})"
        _100_999 = f"({_1_9}[百]([零]{_1_9})?|{_1_9}[百]{_10_99})"
        _1_999 = f"({_100_999}|{_1_99})"
        _1000_9999 = f"({_1_9}[千]([零]{_1_99})?|{_1_9}[千]{_100_999})"
        _1_9999 = f"({_1000_9999}|{_1_999})"
        _10000_99999999 = f"({_1_9999}[万]([零]{_1_999})?|{_1_9999}[万]{_1000_9999})"
        _1_99999999 = f"({_10000_99999999}|{_1_9999})"
        _100000000_9999999999999999 = f"({_1_99999999}[亿]([零]{_1_99999999})?|{_1_99999999}[亿]{_10000_99999999})"
        _1_9999999999999999 = f"({_100000000_9999999999999999}|{_1_99999999})"
        str_int_pattern = f"^({_0}|{_1_9999999999999999})$"
        nor_int_pattern = f"^({_0}|{_1_9999999999999999})$"
        str_dec_pattern = "^[零一二三四五六七八九]{0,15}[一二三四五六七八九]$"
        nor_dec_pattern = "^[零一二三四五六七八九]{0,16}$"
        # Widen every canonical character to the variants each mode accepts.
        for str_num in self.strict_cn_number.keys():
            str_int_pattern = str_int_pattern.replace(str_num, self.strict_cn_number[str_num])
            str_dec_pattern = str_dec_pattern.replace(str_num, self.strict_cn_number[str_num])
        for nor_num in self.normal_cn_number.keys():
            nor_int_pattern = nor_int_pattern.replace(nor_num, self.normal_cn_number[nor_num])
            nor_dec_pattern = nor_dec_pattern.replace(nor_num, self.normal_cn_number[nor_num])
        pattern_dict = {
            "strict": {
                "int": re.compile(str_int_pattern),
                "dec": re.compile(str_dec_pattern)
            },
            "normal": {
                "int": re.compile(nor_int_pattern),
                "dec": re.compile(nor_dec_pattern)
            }
        }
        return pattern_dict

    def __copy_num(self, num):
        """Map each Arabic digit to its lower-case Chinese numeral."""
        cn_num = ""
        for n in num:
            cn_num += NUMBER_LOW_AN2CN[int(n)]
        return cn_num

    def __check_input_data_is_valid(self, check_data: str, mode: str) -> (int, str, str, bool):
        """Validate the input and split it into its components.

        :return: tuple (sign, integer_data, decimal_data, is_all_num). A sign
                 of 0 marks the smart-mode shortcut in which integer_data is
                 already the numeric result.
        :raises ValueError: when the data does not match the mode's format
        """
        # Strip a trailing 元整 / 圆整 / 元正 / 圆正.
        stop_words = ["元整", "圆整", "元正", "圆正"]
        for word in stop_words:
            if check_data[-2:] == word:
                check_data = check_data[:-2]
        # Strip a trailing 元 / 圆.
        if mode != "strict":
            normal_stop_words = ["圆", "元"]
            for word in normal_stop_words:
                if check_data[-1] == word:
                    check_data = check_data[:-1]
        # Handle yuan / jiao / fen amounts.
        result = self.yjf_pattern.search(check_data)
        if result:
            check_data = check_data.replace("元", "点").replace("角", "").replace("分", "")
        # Handle special phrasings such as 一千零十一 and 一万零百一十一.
        if "零十" in check_data:
            check_data = check_data.replace("零十", "零一十")
        if "零百" in check_data:
            check_data = check_data.replace("零百", "零一百")
        for data in check_data:
            if data not in self.check_key_dict[mode]:
                raise ValueError(f"当前为{mode}模式,输入的数据不在转化范围内:{data}!")
        # Determine the sign.
        if check_data[0] == "负":
            check_data = check_data[1:]
            sign = -1
        else:
            sign = 1
        if "点" in check_data:
            split_data = check_data.split("点")
            if len(split_data) == 2:
                integer_data, decimal_data = split_data
                # In smart mode, turn Arabic digits into Chinese numerals.
                if mode == "smart":
                    integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
                    decimal_data = re.sub(r"\d+", lambda x: self.__copy_num(x.group()), decimal_data)
                    mode = "normal"
            else:
                raise ValueError("数据中包含不止一个点!")
        else:
            integer_data = check_data
            decimal_data = None
            # In smart mode, turn Arabic digits into Chinese numerals.
            if mode == "smart":
                # e.g. 10.1万 or 10.1
                result1 = self.pattern1.search(integer_data)
                if result1:
                    if result1.group() == integer_data:
                        if integer_data[-1] in UNIT_CN2AN.keys():
                            output = int(float(integer_data[:-1]) * UNIT_CN2AN[integer_data[-1]])
                        else:
                            output = float(integer_data)
                        return 0, output, None, None
                integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
                mode = "normal"
        result_int = self.pattern_dict[mode]["int"].search(integer_data)
        if result_int:
            if result_int.group() == integer_data:
                if decimal_data is not None:
                    result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
                    if result_dec:
                        if result_dec.group() == decimal_data:
                            return sign, integer_data, decimal_data, False
                else:
                    return sign, integer_data, decimal_data, False
        else:
            if mode == "strict":
                raise ValueError(f"不符合格式的数据:{integer_data}")
            elif mode == "normal":
                # Pure digit mode, e.g. 一二三
                result_all_num = self.ptn_all_num.search(integer_data)
                if result_all_num:
                    if result_all_num.group() == integer_data:
                        if decimal_data is not None:
                            result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
                            if result_dec:
                                if result_dec.group() == decimal_data:
                                    return sign, integer_data, decimal_data, True
                        else:
                            return sign, integer_data, decimal_data, True
                # Colloquial mode, e.g. 一万二, 两千三, 三百四, 十三万六, 一百二十五万三
                result_speaking_mode = self.ptn_speaking_mode.search(integer_data)
                if len(integer_data) >= 3 and result_speaking_mode and result_speaking_mode.group() == integer_data:
                    # len(integer_data)>=3: because the minimum length of integer_data that can be matched is 3
                    # to find the last unit
                    last_unit = result_speaking_mode.groups()[-1][-1]
                    # Append the unit one order below the last spoken unit.
                    _unit = UNIT_LOW_AN2CN[UNIT_CN2AN[last_unit] // 10]
                    integer_data = integer_data + _unit
                    if decimal_data is not None:
                        result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
                        if result_dec:
                            if result_dec.group() == decimal_data:
                                return sign, integer_data, decimal_data, False
                    else:
                        return sign, integer_data, decimal_data, False
        raise ValueError(f"不符合格式的数据:{check_data}")

    def __integer_convert(self, integer_data: str) -> int:
        """Core conversion: accumulate the value while scanning right to left."""
        output_integer = 0
        unit = 1
        ten_thousand_unit = 1
        for index, cn_num in enumerate(reversed(integer_data)):
            # Numeral
            if cn_num in NUMBER_CN2AN:
                num = NUMBER_CN2AN[cn_num]
                output_integer += num * unit
            # Unit
            elif cn_num in UNIT_CN2AN:
                unit = UNIT_CN2AN[cn_num]
                # Detect 万, 亿 and 万亿.
                if unit % 10000 == 0:
                    # 万 or 亿
                    if unit > ten_thousand_unit:
                        ten_thousand_unit = unit
                    # 万亿
                    else:
                        ten_thousand_unit = unit * ten_thousand_unit
                        unit = ten_thousand_unit
                if unit < ten_thousand_unit:
                    unit = unit * ten_thousand_unit
                # A leading unit with no numeral in front counts once (e.g. 十 = 10).
                if index == len(integer_data) - 1:
                    output_integer += unit
            else:
                raise ValueError(f"{cn_num} 不在转化范围内")
        return int(output_integer)

    def __decimal_convert(self, decimal_data: str) -> float:
        """Convert the decimal numerals; more than 16 are truncated to 16."""
        len_decimal_data = len(decimal_data)
        if len_decimal_data > 16:
            print(f"注意:小数部分长度为 {len_decimal_data} ,将自动截取前 16 位有效精度!")
            decimal_data = decimal_data[:16]
            len_decimal_data = 16
        output_decimal = 0
        for index in range(len(decimal_data) - 1, -1, -1):
            unit_key = NUMBER_CN2AN[decimal_data[index]]
            output_decimal += unit_key * 10 ** -(index + 1)
        # Round away accumulated floating-point error.
        output_decimal = round(output_decimal, len_decimal_data)
        return output_decimal

    def __direct_convert(self, data: str) -> int:
        """Read the numerals positionally, e.g. 一二三 -> 123."""
        output_data = 0
        for index in range(len(data) - 1, -1, -1):
            unit_key = NUMBER_CN2AN[data[index]]
            output_data += unit_key * 10 ** (len(data) - index - 1)
        return output_data
# Value of every Chinese numeral character, including upper-case (financial)
# and colloquial variants.
NUMBER_CN2AN = {
    "零": 0,
    "〇": 0,
    "一": 1,
    "壹": 1,
    "幺": 1,
    "二": 2,
    "贰": 2,
    "两": 2,
    "三": 3,
    "叁": 3,
    # "仨" is listed as a variant of three in NORMAL_CN_NUMBER, so it needs a
    # value here; without it the character passes validation but cannot be
    # converted.
    "仨": 3,
    "四": 4,
    "肆": 4,
    "五": 5,
    "伍": 5,
    "六": 6,
    "陆": 6,
    "七": 7,
    "柒": 7,
    "八": 8,
    "捌": 8,
    "九": 9,
    "玖": 9,
}
# Value of every Chinese unit character (lower- and upper-case variants).
UNIT_CN2AN = {
    char: value
    for chars, value in (
        ("十拾", 10),
        ("百佰", 100),
        ("千仟", 1000),
        ("万", 10 ** 4),
        ("亿", 10 ** 8),
    )
    for char in chars
}
# Lower-case unit character for each unit value.
UNIT_LOW_AN2CN = dict(zip((10, 100, 1000, 10 ** 4, 10 ** 8), "十百千万亿"))
# Digit -> lower-case Chinese numeral.
NUMBER_LOW_AN2CN = dict(enumerate("零一二三四五六七八九"))
# Digit -> upper-case (financial) Chinese numeral.
NUMBER_UP_AN2CN = dict(enumerate("零壹贰叁肆伍陆柒捌玖"))
# Unit character for each decimal position, counted from the ones place
# (index 0) upwards; lower-case variant.
UNIT_LOW_ORDER_AN2CN = [
    "", *"十百千",
    "万", *"十百千",
    "亿", *"十百千",
    "万", *"十百千",
]
# Same table with the upper-case (financial) unit characters.
UNIT_UP_ORDER_AN2CN = [
    "", *"拾佰仟",
    "万", *"拾佰仟",
    "亿", *"拾佰仟",
    "万", *"拾佰仟",
]
# Canonical character -> string of the variants accepted for it in strict mode.
STRICT_CN_NUMBER = dict(zip(
    "零一二三四五六七八九十百千万亿",
    ("零", "一壹", "二贰", "三叁", "四肆", "五伍", "六陆", "七柒", "八捌", "九玖",
     "十拾", "百佰", "千仟", "万", "亿"),
))
# Canonical character -> string of the variants accepted for it in normal mode.
NORMAL_CN_NUMBER = dict(zip(
    "零一二三四五六七八九十百千万亿",
    ("零〇", "一壹幺", "二贰两", "三叁仨", "四肆", "五伍", "六陆", "七柒", "八捌", "九玖",
     "十拾", "百佰", "千仟", "万", "亿"),
))
import re
from .cn2an import Cn2An
from .an2cn import An2Cn
from .conf import UNIT_CN2AN
class Transform(object):
    """Rewrite the numbers found inside a sentence between Chinese and Arabic form."""

    def __init__(self) -> None:
        self.all_num = "零一二三四五六七八九"
        self.all_unit = "".join(list(UNIT_CN2AN.keys()))
        self.cn2an = Cn2An().cn2an
        self.an2cn = An2Cn().an2cn
        # Chinese number, optionally negative and with a "点" decimal part.
        self.cn_pattern = f"负?([{self.all_num}{self.all_unit}]+点)?[{self.all_num}{self.all_unit}]+"
        # Arabic number followed by Chinese units.
        self.smart_cn_pattern = f"-?([0-9]+.)?[0-9]+[{self.all_unit}]+"

    def transform(self, inputs: str, method: str = "cn2an") -> str:
        """Convert every number in a sentence.

        Dates, fractions, percentages and Celsius temperatures are rewritten
        first, then the remaining plain numbers.

        :param inputs: the sentence
        :param method: "cn2an" (Chinese -> Arabic) or "an2cn" (Arabic -> Chinese)
        :return: the rewritten sentence
        :raises ValueError: for any other method
        """
        if method == "cn2an":
            inputs = inputs.replace("廿", "二十").replace("半", "0.5").replace("两", "2")
            # date
            inputs = re.sub(
                fr"((({self.smart_cn_pattern})|({self.cn_pattern}))年)?([{self.all_num}十]+月)?([{self.all_num}十]+日)?",
                lambda x: self.__sub_util(x.group(), "cn2an", "date"), inputs)
            # fraction
            inputs = re.sub(fr"{self.cn_pattern}分之{self.cn_pattern}",
                            lambda x: self.__sub_util(x.group(), "cn2an", "fraction"), inputs)
            # percent
            inputs = re.sub(fr"百分之{self.cn_pattern}",
                            lambda x: self.__sub_util(x.group(), "cn2an", "percent"), inputs)
            # celsius
            inputs = re.sub(fr"{self.cn_pattern}摄氏度",
                            lambda x: self.__sub_util(x.group(), "cn2an", "celsius"), inputs)
            # number
            output = re.sub(self.cn_pattern,
                            lambda x: self.__sub_util(x.group(), "cn2an", "number"), inputs)
        elif method == "an2cn":
            # date
            inputs = re.sub(r"(\d{2,4}年)?(\d{1,2}月)?(\d{1,2}日)?",
                            lambda x: self.__sub_util(x.group(), "an2cn", "date"), inputs)
            # fraction
            inputs = re.sub(r"\d+/\d+",
                            lambda x: self.__sub_util(x.group(), "an2cn", "fraction"), inputs)
            # percent
            inputs = re.sub(r"-?(\d+\.)?\d+%",
                            lambda x: self.__sub_util(x.group(), "an2cn", "percent"), inputs)
            # celsius
            inputs = re.sub(r"\d+℃",
                            lambda x: self.__sub_util(x.group(), "an2cn", "celsius"), inputs)
            # number
            output = re.sub(r"-?(\d+\.)?\d+",
                            lambda x: self.__sub_util(x.group(), "an2cn", "number"), inputs)
        else:
            raise ValueError(f"error method: {method}, only support 'cn2an' and 'an2cn'!")
        return output

    def __sub_util(self, inputs, method: str = "cn2an", sub_mode: str = "number") -> str:
        """Convert one matched span; used as the ``re.sub`` replacement callback.

        Conversion is best effort: on any error a warning is printed and the
        span is returned as it stands. An empty span (the all-optional date
        patterns match the empty string) is returned unchanged as well.
        """
        try:
            if inputs:
                if method == "cn2an":
                    if sub_mode == "date":
                        return re.sub(fr"(({self.smart_cn_pattern})|({self.cn_pattern}))",
                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs)
                    elif sub_mode == "fraction":
                        if inputs[0] != "百":
                            frac_result = re.sub(self.cn_pattern,
                                                 lambda x: str(self.cn2an(x.group(), "smart")), inputs)
                            # Chinese puts the denominator first ("三分之一" is
                            # 1/3), so the two halves are swapped on output.
                            numerator, denominator = frac_result.split("分之")
                            return f"{denominator}/{numerator}"
                        else:
                            return inputs
                    elif sub_mode == "percent":
                        return re.sub(f"(?<=百分之){self.cn_pattern}",
                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs).replace("百分之", "") + "%"
                    elif sub_mode == "celsius":
                        return re.sub(f"{self.cn_pattern}(?=摄氏度)",
                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs).replace("摄氏度", "℃")
                    elif sub_mode == "number":
                        return str(self.cn2an(inputs, "smart"))
                    else:
                        raise Exception(f"error sub_mode: {sub_mode} !")
                else:
                    if sub_mode == "date":
                        # Years are read digit by digit, months and days as numbers.
                        inputs = re.sub(r"\d+(?=年)",
                                        lambda x: self.an2cn(x.group(), "direct"), inputs)
                        return re.sub(r"\d+",
                                      lambda x: self.an2cn(x.group(), "low"), inputs)
                    elif sub_mode == "fraction":
                        frac_result = re.sub(r"\d+", lambda x: self.an2cn(x.group(), "low"), inputs)
                        numerator, denominator = frac_result.split("/")
                        return f"{denominator}分之{numerator}"
                    elif sub_mode == "celsius":
                        return self.an2cn(inputs[:-1], "low") + "摄氏度"
                    elif sub_mode == "percent":
                        return "百分之" + self.an2cn(inputs[:-1], "low")
                    elif sub_mode == "number":
                        return self.an2cn(inputs, "low")
                    else:
                        raise Exception(f"error sub_mode: {sub_mode} !")
        except Exception as e:
            print(f"WARN: {e}")
        # Reached for an empty span or after a failed conversion. Returning
        # the span explicitly avoids handing None back to re.sub.
        return inputs
a,a
ai,ai
ai0,a0 I0
an,an
an0,a0 N0
ang,ang
ang0,A0 ng0
ao,ao
ao0,A0 O0
ba,b a
bai,b a0 I0
ban,b a0 N0
bang,b A0 ng0
bao,b A0 O0
be,b e
bei,b E0 I0
ben,b e0 N0
beng,b e0 ng0
ber,b er
bi,b i
bia,b ia
bian,b iE0 N0
biang,b iA0 ng0
biao,b iA0 O0
bie,b ie
bin,b i N0
bing,b i ng0
biong,b iO0 ng0
biu,b io0 U0
bo,b o
bong,b oo0 ng0
bou,b o0 U0
bu,b u
bua,b ua
buai,b ua0 I0
buan,b ua0 N0
buang,b uA0 ng0
bui,b uE0 I0
bun,b ue0 N0
bv,b v
bve,b ve
ca,c a
cai,c a0 I0
can,c a0 N0
cang,c A0 ng0
cao,c A0 O0
ce,c e
cei,c E0 I0
cen,c e0 N0
ceng,c e0 ng0
cer,c er
cha,ch a
chai,ch a0 I0
chan,ch a0 N0
chang,ch A0 ng0
chao,ch A0 O0
che,ch e
chei,ch E0 I0
chen,ch e0 N0
cheng,ch e0 ng0
cher,ch er
chi,ch ir
chong,ch oo0 ng0
chou,ch o0 U0
chu,ch u
chua,ch ua
chuai,ch ua0 I0
chuan,ch ua0 N0
chuang,ch uA0 ng0
chui,ch uE0 I0
chun,ch ue0 N0
chuo,ch uo
chv,ch v
chyi,ch i
ci,c i0
cong,c oo0 ng0
cou,c o0 U0
cu,c u
cua,c ua
cuai,c ua0 I0
cuan,c ua0 N0
cuang,c uA0 ng0
cui,c uE0 I0
cun,c ue0 N0
cuo,c uo
cv,c v
cyi,c i
da,d a
dai,d a0 I0
dan,d a0 N0
dang,d A0 ng0
dao,d A0 O0
de,d e
dei,d E0 I0
den,d e0 N0
deng,d e0 ng0
der,d er
di,d i
dia,d ia
dian,d iE0 N0
diang,d iA0 ng0
diao,d iA0 O0
die,d ie
din,d i N0
ding,d i ng0
diong,d iO0 ng0
diu,d io0 U0
dong,d oo0 ng0
dou,d o0 U0
du,d u
dua,d ua
duai,d ua0 I0
duan,d ua0 N0
duang,d uA0 ng0
dui,d uE0 I0
dun,d ue0 N0
duo,d uo
dv,d v
dve,d ve
e,e
ei,E0 I0
en,e0 N0
eng,e0 ng0
er,er
fa,f a
fai,f a0 I0
fan,f a0 N0
fang,f A0 ng0
fao,f A0 O0
fe,f e
fei,f E0 I0
fen,f e0 N0
feng,f e0 ng0
fer,f er
fi,f i
fia,f ia
fian,f iE0 N0
fiang,f iA0 ng0
fiao,f iA0 O0
fie,f ie
fin,f i N0
fing,f i ng0
fiong,f iO0 ng0
fiu,f io0 U0
fo,f o
fong,f oo0 ng0
fou,f o0 U0
fu,f u
fua,f ua
fuai,f ua0 I0
fuan,f ua0 N0
fuang,f uA0 ng0
fui,f uE0 I0
fun,f ue0 N0
fv,f v
fve,f ve
ga,g a
gai,g a0 I0
gan,g a0 N0
gang,g A0 ng0
gao,g A0 O0
ge,g e
gei,g E0 I0
gen,g e0 N0
geng,g e0 ng0
ger,g er
gi,g i
gia,g ia
gian,g iE0 N0
giang,g iA0 ng0
giao,g iA0 O0
gie,g ie
gin,g i N0
ging,g i ng0
giong,g iO0 ng0
giu,g io0 U0
gong,g oo0 ng0
gou,g o0 U0
gu,g u
gua,g ua
guai,g ua0 I0
guan,g ua0 N0
guang,g uA0 ng0
gui,g uE0 I0
gun,g ue0 N0
guo,g uo
gv,g v
gve,g ve
ha,h a
hai,h a0 I0
han,h a0 N0
hang,h A0 ng0
hao,h A0 O0
he,h e
hei,h E0 I0
hen,h e0 N0
heng,h e0 ng0
her,h er
hi,h i
hia,h ia
hian,h iE0 N0
hiang,h iA0 ng0
hiao,h iA0 O0
hie,h ie
hin,h i N0
hing,h i ng0
hiong,h iO0 ng0
hiu,h io0 U0
hong,h oo0 ng0
hou,h o0 U0
hu,h u
hua,h ua
huai,h ua0 I0
huan,h ua0 N0
huang,h uA0 ng0
hui,h uE0 I0
hun,h ue0 N0
huo,h uo
hv,h v
hve,h ve
ji,j i
jia,j ia
jian,j iE0 N0
jiang,j iA0 ng0
jiao,j iA0 O0
jie,j ie
jin,j i N0
jing,j i ng0
jiong,j iO0 ng0
jiu,j io0 U0
ju,j v
juan,j vE0 N0
jue,j ve
jun,j v0 N0
ka,k a
kai,k a0 I0
kan,k a0 N0
kang,k A0 ng0
kao,k A0 O0
ke,k e
kei,k E0 I0
ken,k e0 N0
keng,k e0 ng0
ker,k er
ki,k i
kia,k ia
kian,k iE0 N0
kiang,k iA0 ng0
kiao,k iA0 O0
kie,k ie
kin,k i N0
king,k i ng0
kiong,k iO0 ng0
kiu,k io0 U0
kong,k oo0 ng0
kou,k o0 U0
ku,k u
kua,k ua
kuai,k ua0 I0
kuan,k ua0 N0
kuang,k uA0 ng0
kui,k uE0 I0
kun,k ue0 N0
kuo,k uo
kv,k v
kve,k ve
la,l a
lai,l a0 I0
lan,l a0 N0
lang,l A0 ng0
lao,l A0 O0
le,l e
lei,l E0 I0
len,l e0 N0
leng,l e0 ng0
ler,l er
li,l i
lia,l ia
lian,l iE0 N0
liang,l iA0 ng0
liao,l iA0 O0
lie,l ie
lin,l i N0
ling,l i ng0
liong,l iO0 ng0
liu,l io0 U0
lo,l o
long,l oo0 ng0
lou,l o0 U0
lu,l u
lua,l ua
luai,l ua0 I0
luan,l ua0 N0
luang,l uA0 ng0
lui,l uE0 I0
lun,l ue0 N0
luo,l uo
lv,l v
lve,l ve
ma,m a
mai,m a0 I0
man,m a0 N0
mang,m A0 ng0
mao,m A0 O0
me,m e
mei,m E0 I0
men,m e0 N0
meng,m e0 ng0
mer,m er
mi,m i
mia,m ia
mian,m iE0 N0
miang,m iA0 ng0
miao,m iA0 O0
mie,m ie
min,m i N0
ming,m i ng0
miong,m iO0 ng0
miu,m io0 U0
mo,m o
mong,m oo0 ng0
mou,m o0 U0
mu,m u
mua,m ua
muai,m ua0 I0
muan,m ua0 N0
muang,m uA0 ng0
mui,m uE0 I0
mun,m ue0 N0
mv,m v
mve,m ve
n,ng
na,n a
nai,n a0 I0
nan,n a0 N0
nang,n A0 ng0
nao,n A0 O0
ne,n e
nei,n E0 I0
nen,n e0 N0
neng,n e0 ng0
ner,n er
ni,n i
nia,n ia
nian,n iE0 N0
niang,n iA0 ng0
niao,n iA0 O0
nie,n ie
nin,n i N0
ning,n i ng0
niong,n iO0 ng0
niu,n io0 U0
nong,n oo0 ng0
nou,n o0 U0
nu,n u
nua,n ua
nuai,n ua0 I0
nuan,n ua0 N0
nuang,n uA0 ng0
nui,n uE0 I0
nun,n ue0 N0
nuo,n uo
nv,n v
nve,n ve
o,o
ong,ong
ou,ou
pa,p a
pai,p a0 I0
pan,p a0 N0
pang,p A0 ng0
pao,p A0 O0
pe,p e
pei,p E0 I0
pen,p e0 N0
peng,p e0 ng0
per,p er
pi,p i
pia,p ia
pian,p iE0 N0
piang,p iA0 ng0
piao,p iA0 O0
pie,p ie
pin,p i N0
ping,p i ng0
piong,p iO0 ng0
piu,p io0 U0
po,p o
pong,p oo0 ng0
pou,p o0 U0
pu,p u
pua,p ua
puai,p ua0 I0
puan,p ua0 N0
puang,p uA0 ng0
pui,p uE0 I0
pun,p ue0 N0
pv,p v
pve,p ve
qi,q i
qia,q ia
qian,q iE0 N0
qiang,q iA0 ng0
qiao,q iA0 O0
qie,q ie
qin,q i N0
qing,q i ng0
qiong,q iO0 ng0
qiu,q io0 U0
qu,q v
quan,q vE0 N0
que,q ve
qun,q v0 N0
ra,r a
rai,r a0 I0
ran,r a0 N0
rang,r A0 ng0
rao,r A0 O0
re,r e
rei,r E0 I0
ren,r e0 N0
reng,r e0 ng0
rer,r er
ri,r ir
rong,r oo0 ng0
rou,r o0 U0
ru,r u
rua,r ua
ruai,r ua0 I0
ruan,r ua0 N0
ruang,r uA0 ng0
rui,r uE0 I0
run,r ue0 N0
ruo,r uo
rv,r v
ryi,r i
sa,s a
sai,s a0 I0
san,s a0 N0
sang,s A0 ng0
sao,s A0 O0
se,s e
sei,s E0 I0
sen,s e0 N0
seng,s e0 ng0
ser,s er
sha,sh a
shai,sh a0 I0
shan,sh a0 N0
shang,sh A0 ng0
shao,sh A0 O0
she,sh e
shei,sh E0 I0
shen,sh e0 N0
sheng,sh e0 ng0
sher,sh er
shi,sh ir
shong,sh oo0 ng0
shou,sh o0 U0
shu,sh u
shua,sh ua
shuai,sh ua0 I0
shuan,sh ua0 N0
shuang,sh uA0 ng0
shui,sh uE0 I0
shun,sh ue0 N0
shuo,sh uo
shv,sh v
shyi,sh i
si,s i0
song,s oo0 ng0
sou,s o0 U0
su,s u
sua,s ua
suai,s ua0 I0
suan,s ua0 N0
suang,s uA0 ng0
sui,s uE0 I0
sun,s ue0 N0
suo,s uo
sv,s v
syi,s i
ta,t a
tai,t a0 I0
tan,t a0 N0
tang,t A0 ng0
tao,t A0 O0
te,t e
tei,t E0 I0
ten,t e0 N0
teng,t e0 ng0
ter,t er
ti,t i
tia,t ia
tian,t iE0 N0
tiang,t iA0 ng0
tiao,t iA0 O0
tie,t ie
tin,t i N0
ting,t i ng0
tiong,t iO0 ng0
tong,t oo0 ng0
tou,t o0 U0
tu,t u
tua,t ua
tuai,t ua0 I0
tuan,t ua0 N0
tuang,t uA0 ng0
tui,t uE0 I0
tun,t ue0 N0
tuo,t uo
tv,t v
tve,t ve
wa,w a
wai,w a0 I0
wan,w a0 N0
wang,w A0 ng0
wao,w A0 O0
we,w e
wei,w E0 I0
wen,w e0 N0
weng,w e0 ng0
wer,w er
wi,w i
wo,w o
wong,w oo0 ng0
wou,w o0 U0
wu,w u
xi,x i
xia,x ia
xian,x iE0 N0
xiang,x iA0 ng0
xiao,x iA0 O0
xie,x ie
xin,x i N0
xing,x i ng0
xiong,x iO0 ng0
xiu,x io0 U0
xu,x v
xuan,x vE0 N0
xue,x ve
xun,x v0 N0
ya,y a
yai,y a0 I0
yan,y iE0 N0
yang,y A0 ng0
yao,y A0 O0
ye,y E
yei,y E0 I0
yi,y i
yin,y i N0
ying,y i ng0
yo,y o
yong,y oo0 ng0
you,y o0 U0
yu,y v
yuan,y vE0 N0
yue,y ve
yun,y v0 N0
ywu,y u
za,z a
zai,z a0 I0
zan,z a0 N0
zang,z A0 ng0
zao,z A0 O0
ze,z e
zei,z E0 I0
zen,z e0 N0
zeng,z e0 ng0
zer,z er
zha,zh a
zhai,zh a0 I0
zhan,zh a0 N0
zhang,zh A0 ng0
zhao,zh A0 O0
zhe,zh e
zhei,zh E0 I0
zhen,zh e0 N0
zheng,zh e0 ng0
zher,zh er
zhi,zh ir
zhong,zh oo0 ng0
zhou,zh o0 U0
zhu,zh u
zhua,zh ua
zhuai,zh ua0 I0
zhuan,zh ua0 N0
zhuang,zh uA0 ng0
zhui,zh uE0 I0
zhun,zh ue0 N0
zhuo,zh uo
zhv,zh v
zhyi,zh i
zi,z i0
zong,z oo0 ng0
zou,z o0 U0
zu,z u
zua,z ua
zuai,z ua0 I0
zuan,z ua0 N0
zuang,z uA0 ng0
zui,z uE0 I0
zun,z ue0 N0
zuo,z uo
zv,z v
zyi,z i
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
# Warning: Auto-generated file, don't edit.
# Phrase -> pronunciation table handed to pypinyin by load().
# Each value holds one inner list per character of the phrase, and each
# inner list holds the pinyin (with tone marks) for that character.
phrases_dict = {
'𰻝𰻝面': [['biáng'], ['biáng'], ['miàn']],
}
from pypinyin import load_phrases_dict
def load():
    """Register the entries of ``phrases_dict`` via ``load_phrases_dict``."""
    load_phrases_dict(phrases_dict)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment