v1.0

b309ea1b · chenzk · b309ea1b · b309ea1b · b309ea1b · b309ea1b
Commit b309ea1b authored May 07, 2024 by chenzk
20 changed files
--- a/recipes/genshin_en_小虫哥ver.py
+++ b/recipes/genshin_en_小虫哥ver.py
+import os
+import re
+from dataclasses import dataclass
+import concurrent.futures
+from tqdm.auto import tqdm
+import openpyxl # use to open excel. run ! pip install openpyxl 
+# download_link: https://www.bilibili.com/read/cv23965717
+@dataclass
+class DataConfig:
+    """Paths used to build the English Genshin filelist.
+
+    The attributes are annotated so that ``@dataclass`` treats them as real
+    fields: the defaults below still apply to ``DataConfig()``, and any of
+    them can be overridden per instance, e.g.
+    ``DataConfig(output_filelist_path='other.txt')``.
+    """
+    # Root directory of the audio files.
+    dataset_path: str = './raw_datasets/Genshin_chinese4.5/原神语音包4.5（英）'
+    # Workbook holding the transcripts.
+    excel_path: str = './raw_datasets/Genshin_chinese4.5/原神4.5语音包对应文本（英）.xlsx'
+    # Destination of the generated filelist.
+    output_filelist_path: str = './filelists/genshin_en.txt'
+# If a transcript contains any of the following strings, it generally does not match the audio.
+FORBIDDEN_TEXTS = ["……", "{NICKNAME}", "#", "(", ")", "♪", "test", "{0}", "█", "*", "█", "+", "Gohus"]
+# Literal substring -> replacement pairs.
+REPLACEMENTS = {"$UNRELEASED": ""}
+# Escape every forbidden string so it is matched literally, then join them into one alternation.
+escaped_forbidden_texts = [re.escape(text) for text in FORBIDDEN_TEXTS]
+pattern = re.compile("|".join(escaped_forbidden_texts))
+# Module-level configuration instance read by the script below.
+data_config = DataConfig()
+def clean_text(text):
+    """Validate a transcript and apply the configured replacements.
+
+    Args:
+        text: transcript string.
+
+    Returns:
+        The transcript with every key of ``REPLACEMENTS`` substituted by its
+        value, or ``None`` when the transcript contains one of
+        ``FORBIDDEN_TEXTS``.
+    """
+    if pattern.search(text):
+        return None
+    cleaned_text = text
+    for old, new in REPLACEMENTS.items():
+        cleaned_text = cleaned_text.replace(old, new)
+    # Return the substituted string; returning ``text`` would silently discard the replacements.
+    return cleaned_text
+def read_excel(excel):
+    """Open the workbook and read two columns of its first sheet.
+
+    Args:
+        excel: path of the ``.xlsx`` workbook.
+
+    Returns:
+        A ``(workbook, npc_names, npc_audio_number)`` tuple. The two lists
+        hold the non-empty cells of columns B and C of the first sheet, each
+        without its first non-empty cell.
+    """
+    workbook = openpyxl.load_workbook(excel)
+    index_sheet = workbook[workbook.sheetnames[0]]
+
+    def column_values(letter):
+        # Keep truthy cell values only, then drop the first one.
+        values = [cell.value for cell in index_sheet[letter] if cell.value]
+        return values[1:]
+
+    return workbook, column_values('B'), column_values('C')
+def process_filelist(data):
+    """Build one filelist line for a single audio/transcript pair.
+
+    Args:
+        data: ``(audio_path, text, npc_path)`` tuple.
+
+    Returns:
+        ``'<absolute audio path>|<text>'`` followed by a newline, or ``None``
+        when the audio file does not exist or ``clean_text`` rejects the text.
+    """
+    audio_path, text, npc_path = data
+    wav_file = os.path.abspath(os.path.join(npc_path, audio_path))
+    if not os.path.exists(wav_file):
+        return None
+    cleaned = clean_text(text)
+    if cleaned is None:
+        return None
+    return f'{wav_file}|{cleaned}\n'
+if __name__ == '__main__':
+    wb, npc_names, npc_audio_number = read_excel(data_config.excel_path)
+    # Gather one (audio file name, transcript, NPC directory) task per pair read from the sheets.
+    datas_list = []
+    for npc_name in tqdm(npc_names):
+        npc_sheet = wb[npc_name]
+        audio_names = [cell.value for cell in npc_sheet['C'] if cell.value][1:]
+        texts = [cell.value for cell in npc_sheet['D'] if cell.value][1:]
+        npc_path = os.path.join(data_config.dataset_path, npc_name)
+        for audio_name, text in zip(audio_names, texts):
+            datas_list.append((audio_name, text, npc_path))
+    results = []
+    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
+        futures = [executor.submit(process_filelist, data) for data in datas_list]
+        finished = concurrent.futures.as_completed(futures)
+        for future in tqdm(finished, total=len(datas_list)):
+            line = future.result()
+            if line is None:
+                continue
+            results.append(line)
+    # Create the parent directory first so the final write cannot fail because it is missing.
+    output_dir = os.path.dirname(data_config.output_filelist_path)
+    os.makedirs(output_dir, exist_ok=True)
+    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
+        f.writelines(results)
\ No newline at end of file
--- a/recipes/genshin_zh_小虫哥ver.py
+++ b/recipes/genshin_zh_小虫哥ver.py
+import os
+import re
+from dataclasses import dataclass
+import concurrent.futures
+from tqdm.auto import tqdm
+import openpyxl # use to open excel. run ! pip install openpyxl 
+# download_link: https://www.bilibili.com/read/cv23965717
+@dataclass
+class DataConfig:
+    """Paths used to build the Chinese Genshin filelist.
+
+    The attributes are annotated so that ``@dataclass`` treats them as real
+    fields: the defaults below still apply to ``DataConfig()``, and any of
+    them can be overridden per instance, e.g.
+    ``DataConfig(output_filelist_path='other.txt')``.
+    """
+    # Root directory of the audio files.
+    dataset_path: str = './raw_datasets/Genshin_chinese4.5/原神语音包4.5（中）'
+    # Workbook holding the transcripts.
+    excel_path: str = './raw_datasets/Genshin_chinese4.5/原神4.5语音包对应文本（中）.xlsx'
+    # Destination of the generated filelist.
+    output_filelist_path: str = './filelists/genshin_zh.txt'
+# If a transcript contains any of the following strings, it generally does not match the audio.
+FORBIDDEN_TEXTS = ["……", "{NICKNAME}", "#", "(", ")", "♪", "test", "{0}", "█", "*", "█", "+", "Gohus"]
+# Literal substring -> replacement pairs.
+REPLACEMENTS = {"$UNRELEASED": ""}
+# Escape every forbidden string so it is matched literally, then join them into one alternation.
+escaped_forbidden_texts = [re.escape(text) for text in FORBIDDEN_TEXTS]
+pattern = re.compile("|".join(escaped_forbidden_texts))
+# Module-level configuration instance read by the script below.
+data_config = DataConfig()
+def clean_text(text):
+    """Validate a transcript and apply the configured replacements.
+
+    Args:
+        text: transcript string.
+
+    Returns:
+        The transcript with every key of ``REPLACEMENTS`` substituted by its
+        value, or ``None`` when the transcript contains an ASCII letter or
+        digit, or one of ``FORBIDDEN_TEXTS``.
+    """
+    # Drop every line that contains ASCII letters or digits.
+    if re.search(r'[A-Za-z0-9]', text):
+        return None
+    if pattern.search(text):
+        return None
+    cleaned_text = text
+    for old, new in REPLACEMENTS.items():
+        cleaned_text = cleaned_text.replace(old, new)
+    # Return the substituted string; returning ``text`` would silently discard the replacements.
+    return cleaned_text
+def read_excel(excel):
+    """Open the workbook and read two columns of its first sheet.
+
+    Args:
+        excel: path of the ``.xlsx`` workbook.
+
+    Returns:
+        A ``(workbook, npc_names, npc_audio_number)`` tuple. The two lists
+        hold the non-empty cells of columns B and C of the first sheet, each
+        without its first non-empty cell.
+    """
+    workbook = openpyxl.load_workbook(excel)
+    first_sheet_name = workbook.sheetnames[0]
+    overview = workbook[first_sheet_name]
+    names_column = [cell.value for cell in overview['B'] if cell.value]
+    counts_column = [cell.value for cell in overview['C'] if cell.value]
+    # Slice off the first kept value of each column.
+    return workbook, names_column[1:], counts_column[1:]
+def process_filelist(data):
+    """Build one filelist line for a single audio/transcript pair.
+
+    Args:
+        data: ``(audio_path, text, npc_path)`` tuple.
+
+    Returns:
+        ``'<absolute audio path>|<text>'`` followed by a newline, or ``None``
+        when the audio file does not exist or ``clean_text`` rejects the text.
+    """
+    audio_path, text, npc_path = data
+    wav_file = os.path.abspath(os.path.join(npc_path, audio_path))
+    if not os.path.exists(wav_file):
+        return None
+    cleaned = clean_text(text)
+    return None if cleaned is None else f'{wav_file}|{cleaned}\n'
+if __name__ == '__main__':
+    wb, npc_names, npc_audio_number = read_excel(data_config.excel_path)
+    # Gather one (audio file name, transcript, NPC directory) task per pair read from the sheets.
+    datas_list = []
+    for npc_name in tqdm(npc_names):
+        npc_sheet = wb[npc_name]
+        audio_names = [cell.value for cell in npc_sheet['C'] if cell.value][1:]
+        texts = [cell.value for cell in npc_sheet['D'] if cell.value][1:]
+        npc_path = os.path.join(data_config.dataset_path, npc_name)
+        for audio_name, text in zip(audio_names, texts):
+            datas_list.append((audio_name, text, npc_path))
+    results = []
+    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
+        futures = [executor.submit(process_filelist, data) for data in datas_list]
+        finished = concurrent.futures.as_completed(futures)
+        for future in tqdm(finished, total=len(datas_list)):
+            line = future.result()
+            if line is None:
+                continue
+            results.append(line)
+    # Create the parent directory first so the final write cannot fail because it is missing.
+    output_dir = os.path.dirname(data_config.output_filelist_path)
+    os.makedirs(output_dir, exist_ok=True)
+    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
+        f.writelines(results)
\ No newline at end of file
--- a/recipes/hifi_tts.py
+++ b/recipes/hifi_tts.py
+import os
+import json
+from pathlib import Path
+from dataclasses import dataclass
+import concurrent.futures
+from tqdm.auto import tqdm
+# download_link: https://www.openslr.org/109/
+@dataclass
+class DataConfig:
+    """Paths used to build the Hi-Fi TTS filelist.
+
+    The attributes are annotated so that ``@dataclass`` treats them as real
+    fields: the defaults below still apply to ``DataConfig()``, and either
+    of them can be overridden per instance.
+    """
+    # Root directory that is searched for JSON manifests.
+    dataset_path: str = './raw_datasets/hi_fi_tts_v0'
+    # Destination of the generated filelist.
+    output_filelist_path: str = './filelists/hifi_tts.txt'
+# Module-level configuration instance read by the code below.
+data_config = DataConfig()
+def process_filelist(speaker):
+    """Convert one JSON-lines manifest into filelist entries.
+
+    Args:
+        speaker: path of a manifest file; every non-blank line is a JSON
+            object with ``audio_filepath`` and ``text_normalized`` keys.
+
+    Returns:
+        A list of ``<absolute audio path>|<text>`` lines (each ending with a
+        newline), one per manifest entry whose audio file exists.
+    """
+    filelist = []
+    with open(speaker, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                # A blank line is not valid JSON and would make json.loads raise; skip it.
+                continue
+            entry = json.loads(line)
+            audio_path = os.path.abspath(os.path.join(data_config.dataset_path, entry['audio_filepath']))
+            text = entry['text_normalized']
+            if os.path.exists(audio_path):
+                filelist.append(f'{audio_path}|{text}\n')
+    return filelist
+if __name__ == '__main__':
+    results = []
+    dataset_path = Path(data_config.dataset_path)
+    # Every JSON file found below the dataset root becomes one task.
+    speakers = list(dataset_path.rglob('*.json'))
+    with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
+        futures = [executor.submit(process_filelist, speaker) for speaker in speakers]
+        finished = concurrent.futures.as_completed(futures)
+        for future in tqdm(finished, total=len(speakers)):
+            entries = future.result()
+            if entries is None:
+                continue
+            results.extend(entries)
+    # Create the parent directory first so the final write cannot fail because it is missing.
+    output_dir = os.path.dirname(data_config.output_filelist_path)
+    os.makedirs(output_dir, exist_ok=True)
+    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
+        f.writelines(results)
\ No newline at end of file
--- a/recipes/libriTTS.py
+++ b/recipes/libriTTS.py
+import os
+from pathlib import Path
+from dataclasses import dataclass
+import concurrent.futures
+from tqdm.auto import tqdm
+# download_link: https://openslr.org/60/
+@dataclass
+class DataConfig:
+    """Paths used to build the LibriTTS filelist.
+
+    The attributes are annotated so that ``@dataclass`` treats them as real
+    fields: the defaults below still apply to ``DataConfig()``, and either
+    of them can be overridden per instance.
+    """
+    # Root directory that is searched for ``.wav`` files.
+    dataset_path: str = './raw_datasets/LibriTTS/train-other-500'
+    # Destination of the generated filelist.
+    output_filelist_path: str = './filelists/libri_tts.txt'
+# Module-level configuration instance read by the code below.
+data_config = DataConfig()
+def process_filelist(wav_path: Path):
+    """Pair a wav file with the ``.normalized.txt`` transcript next to it.
+
+    Args:
+        wav_path: path of the audio file.
+
+    Returns:
+        ``'<wav path in POSIX form>|<stripped transcript>'`` followed by a
+        newline, or ``None`` when the transcript file does not exist.
+    """
+    transcript_path = wav_path.with_suffix('.normalized.txt')
+    if not transcript_path.exists():
+        return None
+    transcript = transcript_path.read_text(encoding='utf-8').strip()
+    return f'{wav_path.as_posix()}|{transcript}\n'
+if __name__ == '__main__':
+    results = []
+    dataset_path = Path(data_config.dataset_path)
+    # Every ``.wav`` file found below the dataset root becomes one task.
+    waves = list(dataset_path.rglob('*.wav'))
+    with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
+        futures = [executor.submit(process_filelist, wav_path) for wav_path in waves]
+        finished = concurrent.futures.as_completed(futures)
+        for future in tqdm(finished, total=len(waves)):
+            line = future.result()
+            if line is None:
+                continue
+            results.append(line)
+    # Create the parent directory first so the final write cannot fail because it is missing.
+    output_dir = os.path.dirname(data_config.output_filelist_path)
+    os.makedirs(output_dir, exist_ok=True)
+    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
+        f.writelines(results)
\ No newline at end of file
--- a/recipes/raw_datasets/BZNSYP.zip
+++ b/recipes/raw_datasets/BZNSYP.zip
--- a/requirements.txt
+++ b/requirements.txt
+# torch
+# torchaudio
+matplotlib
+numpy
+tensorboard
+pypinyin
+jieba
+eng_to_ipa
+unidecode
+inflect
+pyopenjtalk-prebuilt
+numba
+tqdm
+IPython
+gradio
+soundfile
+scipy
--- a/text/LICENSE
+++ b/text/LICENSE
+Copyright (c) 2017 Keith Ito
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/text/__init__.py
+++ b/text/__init__.py
+""" from https://github.com/keithito/tacotron """
+from text import cleaners
+from text.symbols import symbols
+# Mappings from symbol to numeric ID and vice versa, built once at import time
+# from the imported ``symbols`` sequence (the ID is the position in that sequence):
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+def text_to_sequence(text, symbols, cleaner_names):
+  '''Cleans a string of text and converts it to a sequence of symbol IDs.
+    Args:
+      text: string to convert to a sequence
+      symbols: sequence of symbols; the ID of a symbol is its position in it
+      cleaner_names: names of the cleaner functions to run the text through
+    Returns:
+      List of integers corresponding to the symbols in the cleaned text;
+      characters that are not in ``symbols`` are skipped
+  '''
+  symbol_to_id = {s: i for i, s in enumerate(symbols)}
+  clean_text = _clean_text(text, cleaner_names)
+  print(clean_text)
+  print(f" length:{len(clean_text)}")
+  # Keep only the characters that have an ID.
+  sequence = [symbol_to_id[ch] for ch in clean_text if ch in symbol_to_id]
+  print(f" length:{len(sequence)}")
+  return sequence
+def cleaned_text_to_sequence(cleaned_text):
+  '''Converts already-cleaned text to a sequence of symbol IDs, character by character.
+    Args:
+      cleaned_text: string to convert to a sequence
+    Returns:
+      List of integers corresponding to the characters of the text;
+      characters without an entry in ``_symbol_to_id`` are skipped
+  '''
+  sequence = []
+  for symbol in cleaned_text:
+    if symbol in _symbol_to_id:
+      sequence.append(_symbol_to_id[symbol])
+  return sequence
+def cleaned_text_to_sequence_chinese(cleaned_text):
+  '''Converts already-cleaned text to a sequence of symbol IDs, token by token.
+    Args:
+      cleaned_text: string whose tokens are separated by single spaces
+    Returns:
+      List of integers corresponding to the tokens of the text;
+      tokens without an entry in ``_symbol_to_id`` are skipped
+  '''
+  sequence = []
+  for token in cleaned_text.split(' '):
+    if token in _symbol_to_id:
+      sequence.append(_symbol_to_id[token])
+  return sequence
+def sequence_to_text(sequence):
+  '''Converts a sequence of IDs back to a string by concatenating their symbols'''
+  return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
+def _clean_text(text, cleaner_names):
+  '''Runs the text through every named cleaner, in order.
+    Args:
+      text: string to clean
+      cleaner_names: names of functions defined in the ``cleaners`` module
+    Returns:
+      The text after all cleaners have been applied
+    Raises:
+      Exception: if a name does not refer to a cleaner
+  '''
+  for name in cleaner_names:
+    # Pass a default: a bare getattr raises AttributeError for an unknown name,
+    # so the "Unknown cleaner" error below could never be reached.
+    cleaner = getattr(cleaners, name, None)
+    if not cleaner:
+      raise Exception('Unknown cleaner: %s' % name)
+    text = cleaner(text)
+  return text
--- a/text/cleaners.py
+++ b/text/cleaners.py
+import re
+import string
+import numpy as np
+from .langdetect import detect, LangDetectException
+from text.english import english_to_ipa2
+from text.mandarin import chinese_to_cnm3
+from text.japanese import japanese_to_ipa2
+# Numeric ID of each language tag; 0 is used for "PAD".
+language_module_map = {"PAD":0, "ZH": 1, "EN": 2, "JA": 3}
+# Pre-compiled regular expressions
+ZH_PATTERN = re.compile(r'[\u3400-\u4DBF\u4e00-\u9FFF\uF900-\uFAFF\u3000-\u303F]')
+# NOTE(review): inside this character class ``*-_`` is a range from ``*`` to ``_``,
+# so digits and several other ASCII characters also match — confirm this is intended.
+EN_PATTERN = re.compile(r'[a-zA-Z.,!?\'"(){}[\]<>:;@#$%^&*-_+=/\\|~`]+')
+JP_PATTERN = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF\u31F0-\u31FF\uFF00-\uFFEF\u3000-\u303F]')
+# Matches the literal tags ``[ZH]``, ``[EN]`` and ``[JA]``.
+CLEANER_PATTERN = re.compile(r'\[(ZH|EN|JA)\]')
+def detect_language(text: str, prev_lang=None):
+    """
+    Detect the language of the given text.
+    The patterns are tried in the order ZH, EN, JA and the first match wins.
+    :param text: input text
+    :param prev_lang: the previously detected language
+    :return: 'ZH' for Chinese, 'EN' for English, 'JA' for Japanese, prev_lang for whitespace, otherwise None
+    """
+    candidates = (('ZH', ZH_PATTERN), ('EN', EN_PATTERN), ('JA', JP_PATTERN))
+    for tag, matcher in candidates:
+        if matcher.search(text):
+            return tag
+    # Whitespace inherits the previous language.
+    return prev_lang if text.isspace() else None
+def replace_substring(s, start_index, end_index, replacement):
+    """Return ``s`` with ``s[start_index:end_index]`` replaced by ``replacement``."""
+    head = s[:start_index]
+    tail = s[end_index:]
+    return head + replacement + tail
+def replace_sublist(lst, start_index, end_index, replacement_list):
+    """Replace ``lst[start_index:end_index]`` with ``replacement_list`` in place."""
+    target = slice(start_index, end_index)
+    lst[target] = replacement_list
+# convert text to ipa and prepare for language embedding
+def append_tags_and_convert(match, conversion_func, tag_value, tags):
+    """Convert group 1 of ``match`` and record one tag per converted character.
+
+    ``tags`` is extended in place with ``tag_value`` repeated once for every
+    element of the converted text; the converted text plus a trailing space
+    is returned.
+    """
+    source_text = match.group(1)
+    converted = conversion_func(source_text)
+    tags.extend(tag_value for _ in range(len(converted)))
+    return converted + ' '
+# auto detect language using re
+def cjke_cleaners4(text: str):
+    """
+    Detect the language of each run of characters and convert it to phonetic symbols.
+    ``[ZH]``/``[EN]``/``[JA]`` tags are removed first. Runs whose language is
+    not detected are dropped.
+    :param text: input text
+    :return: the converted text with trailing whitespace removed and a "."
+        appended unless it already ends with one of ``. , ! ? - … ~``;
+        an empty string when nothing is left after removing the tags
+    """
+    text = CLEANER_PATTERN.sub('', text)
+    if not text:
+        # Indexing ``text[0]`` below would raise IndexError on empty input.
+        return ''
+    pointer = 0
+    output = ''
+    current_language = detect_language(text[pointer])
+    while pointer < len(text):
+        # Collect the run of consecutive characters that share the current language.
+        temp_text = ''
+        while pointer < len(text) and detect_language(text[pointer], current_language) == current_language:
+            temp_text += text[pointer]
+            pointer += 1
+        if current_language == 'ZH':
+            output += chinese_to_cnm3(temp_text)
+        elif current_language == 'JA':
+            output += japanese_to_ipa2(temp_text)
+        elif current_language == 'EN':
+            output += english_to_ipa2(temp_text)
+        if pointer < len(text):
+            # Start the next run with the language of its first character.
+            current_language = detect_language(text[pointer])
+    output = re.sub(r'\s+$', '', output)
+    output = re.sub(r'([^\.,!\?\-…~])$', r'\1.', output)
+    return output
--- a/text/cn2an/__init__.py
+++ b/text/cn2an/__init__.py
+__version__ = "0.5.20"
+from .cn2an import Cn2An
+from .an2cn import An2Cn
+from .transform import Transform
+# Module-level shortcuts: each name is a bound method of a converter
+# instance that is created once when the package is imported.
+cn2an = Cn2An().cn2an
+an2cn = An2Cn().an2cn
+transform = Transform().transform
+# Names exported by ``from <package> import *``.
+__all__ = [
+    "__version__",
+    "cn2an",
+    "an2cn",
+    "transform"
+]
--- a/text/cn2an/an2cn.py
+++ b/text/cn2an/an2cn.py
+from typing import Union
+#from proces import preprocess
+from .conf import NUMBER_LOW_AN2CN, NUMBER_UP_AN2CN, UNIT_LOW_ORDER_AN2CN, UNIT_UP_ORDER_AN2CN
+class An2Cn(object):
+    """Converter from Arabic numerals to Chinese numerals."""
+    def __init__(self) -> None:
+        self.all_num = "0123456789"
+        self.number_low = NUMBER_LOW_AN2CN
+        self.number_up = NUMBER_UP_AN2CN
+        self.mode_list = ["low", "up", "rmb", "direct"]
+    def an2cn(self, inputs: Union[str, int, float] = None, mode: str = "low") -> str:
+        """Convert an Arabic number to Chinese numerals.
+
+        :param inputs: the Arabic number, as a string or as a number
+        :param mode: ``low`` lowercase numerals, ``up`` uppercase numerals,
+            ``rmb`` uppercase RMB amount, ``direct`` digit-by-digit conversion
+        :return: the Chinese numeral string
+        :raises ValueError: if ``inputs`` is ``None`` or empty, ``mode`` is not
+            supported, ``inputs`` contains a character other than digits,
+            ``.`` and ``-``, or ``inputs`` contains more than one ``.``
+        """
+        if inputs is not None and inputs != "":
+            if mode not in self.mode_list:
+                raise ValueError(f"mode 仅支持 {str(self.mode_list)} ！")
+            # Convert numbers to strings; Python normalizes them on the way:
+            # 1. -> 1.0 1.00 -> 1.0 -0 -> 0
+            if not isinstance(inputs, str):
+                inputs = self.__number_to_string(inputs)
+            # Pre-processing (disabled):
+            # 1. traditional to simplified Chinese
+            # 2. full-width to half-width characters
+            # inputs = preprocess(inputs, pipelines=[
+            #     "traditional_to_simplified",
+            #     "full_angle_to_half_angle"
+            # ])
+            # Check that the data is valid
+            self.__check_inputs_is_valid(inputs)
+            # Determine the sign
+            if inputs[0] == "-":
+                sign = "负"
+                inputs = inputs[1:]
+            else:
+                sign = ""
+            if mode == "direct":
+                output = self.__direct_convert(inputs)
+            else:
+                # Split the integer part from the decimal part
+                split_result = inputs.split(".")
+                len_split_result = len(split_result)
+                if len_split_result == 1:
+                    # Input without a decimal part
+                    integer_data = split_result[0]
+                    if mode == "rmb":
+                        output = self.__integer_convert(integer_data, "up") + "元整"
+                    else:
+                        output = self.__integer_convert(integer_data, mode)
+                elif len_split_result == 2:
+                    # Input with a decimal part
+                    integer_data, decimal_data = split_result
+                    if mode == "rmb":
+                        int_data = self.__integer_convert(integer_data, "up")
+                        # dec_data starts with "点", so dec_data[1] and dec_data[2]
+                        # are the first and second decimal digits.
+                        dec_data = self.__decimal_convert(decimal_data, "up")
+                        len_dec_data = len(dec_data)
+                        if len_dec_data == 0:
+                            output = int_data + "元整"
+                        elif len_dec_data == 1:
+                            raise ValueError(f"异常输出：{dec_data}")
+                        elif len_dec_data == 2:
+                            if dec_data[1] != "零":
+                                if int_data == "零":
+                                    output = dec_data[1] + "角"
+                                else:
+                                    output = int_data + "元" + dec_data[1] + "角"
+                            else:
+                                output = int_data + "元整"
+                        else:
+                            if dec_data[1] != "零":
+                                if dec_data[2] != "零":
+                                    if int_data == "零":
+                                        output = dec_data[1] + "角" + dec_data[2] + "分"
+                                    else:
+                                        output = int_data + "元" + dec_data[1] + "角" + dec_data[2] + "分"
+                                else:
+                                    if int_data == "零":
+                                        output = dec_data[1] + "角"
+                                    else:
+                                        output = int_data + "元" + dec_data[1] + "角"
+                            else:
+                                if dec_data[2] != "零":
+                                    if int_data == "零":
+                                        output = dec_data[2] + "分"
+                                    else:
+                                        output = int_data + "元" + "零" + dec_data[2] + "分"
+                                else:
+                                    output = int_data + "元整"
+                    else:
+                        output = self.__integer_convert(integer_data, mode) + self.__decimal_convert(decimal_data, mode)
+                else:
+                    raise ValueError(f"输入格式错误：{inputs}！")
+        else:
+            raise ValueError("输入数据为空！")
+        return sign + output
+    def __direct_convert(self, inputs: str) -> str:
+        # Map every digit to its lowercase numeral and "." to "点".
+        _output = ""
+        for d in inputs:
+            if d == ".":
+                _output += "点"
+            else:
+                _output += self.number_low[int(d)]
+        return _output
+    @staticmethod
+    def __number_to_string(number_data: Union[int, float]) -> str:
+        # Python renders some floats in scientific notation, e.g.
+        # str(0.00005) == "5e-05", so expand those into positional notation.
+        string_data = str(number_data)
+        if "e" in string_data:
+            # Function-scope import keeps the block self-contained.
+            from decimal import Decimal
+            # Decimal expands the exponent correctly even when the mantissa has
+            # a fractional part or a sign ("1.5e-07" -> "0.00000015"); padding
+            # the string by hand produced "0.0000001.5" for such values.
+            string_data = format(Decimal(string_data), "f")
+        return string_data
+    def __check_inputs_is_valid(self, check_data: str) -> None:
+        # Every character must be a digit, "." or "-".
+        all_check_keys = self.all_num + ".-"
+        for data in check_data:
+            if data not in all_check_keys:
+                raise ValueError(f"输入的数据不在转化范围内：{data}！")
+    def __integer_convert(self, integer_data: str, mode: str) -> str:
+        if mode == "low":
+            numeral_list = NUMBER_LOW_AN2CN
+            unit_list = UNIT_LOW_ORDER_AN2CN
+        elif mode == "up":
+            numeral_list = NUMBER_UP_AN2CN
+            unit_list = UNIT_UP_ORDER_AN2CN
+        else:
+            raise ValueError(f"error mode: {mode}")
+        # Strip leading zeros, e.g. 007 => 7
+        integer_data = str(int(integer_data))
+        len_integer_data = len(integer_data)
+        if len_integer_data > len(unit_list):
+            raise ValueError(f"超出数据范围，最长支持 {len(unit_list)} 位")
+        output_an = ""
+        for i, d in enumerate(integer_data):
+            if int(d):
+                output_an += numeral_list[int(d)] + unit_list[len_integer_data - i - 1]
+            else:
+                # A zero at a position that is a multiple of four still emits its unit.
+                if not (len_integer_data - i - 1) % 4:
+                    output_an += numeral_list[int(d)] + unit_list[len_integer_data - i - 1]
+                if i > 0 and not output_an[-1] == "零":
+                    output_an += numeral_list[int(d)]
+        output_an = output_an.replace("零零", "零").replace("零万", "万").replace("零亿", "亿").replace("亿万", "亿") \
+            .strip("零")
+        # Handle the "一十几" case: drop the leading "一"
+        if output_an[:2] in ["一十"]:
+            output_an = output_an[1:]
+        # Decimals between 0 and 1
+        if not output_an:
+            output_an = "零"
+        return output_an
+    def __decimal_convert(self, decimal_data: str, o_mode: str) -> str:
+        len_decimal_data = len(decimal_data)
+        # Only the first 16 decimal digits are converted.
+        if len_decimal_data > 16:
+            print(f"注意：小数部分长度为 {len_decimal_data} ，将自动截取前 16 位有效精度！")
+            decimal_data = decimal_data[:16]
+        if len_decimal_data:
+            output_an = "点"
+        else:
+            output_an = ""
+        if o_mode == "low":
+            numeral_list = NUMBER_LOW_AN2CN
+        elif o_mode == "up":
+            numeral_list = NUMBER_UP_AN2CN
+        else:
+            raise ValueError(f"error mode: {o_mode}")
+        for data in decimal_data:
+            output_an += numeral_list[int(data)]
+        return output_an
--- a/text/cn2an/cn2an.py
+++ b/text/cn2an/cn2an.py
+import re
+from typing import Union
+#from proces import preprocess
+from .an2cn import An2Cn
+from .conf import NUMBER_CN2AN, UNIT_CN2AN, STRICT_CN_NUMBER, NORMAL_CN_NUMBER, NUMBER_LOW_AN2CN, UNIT_LOW_AN2CN
+class Cn2An(object):
+    def __init__(self) -> None:
+        """Set up the character tables and pre-compiled patterns used by the converter."""
+        # All numeral and unit characters, concatenated so they can be used inside regex character classes.
+        self.all_num = "".join(list(NUMBER_CN2AN.keys()))
+        self.all_unit = "".join(list(UNIT_CN2AN.keys()))
+        self.strict_cn_number = STRICT_CN_NUMBER
+        self.normal_cn_number = NORMAL_CN_NUMBER
+        # Characters accepted in the input for each mode.
+        self.check_key_dict = {
+            "strict": "".join(self.strict_cn_number.values()) + "点负",
+            "normal": "".join(self.normal_cn_number.values()) + "点负",
+            "smart": "".join(self.normal_cn_number.values()) + "点负" + "01234567890.-"
+        }
+        self.pattern_dict = self.__get_pattern()
+        self.ac = An2Cn()
+        self.mode_list = ["strict", "normal", "smart"]
+        # Amount ending in "<numeral>角", optionally followed by "<numeral>分", after "元"/"圆".
+        self.yjf_pattern = re.compile(fr"^.*?[元圆][{self.all_num}]角([{self.all_num}]分)?$")
+        # Arabic number, optionally with a decimal part, optionally followed by one unit character.
+        self.pattern1 = re.compile(fr"^-?\d+(\.\d+)?[{self.all_unit}]?$")
+        # Input made of numeral characters only.
+        self.ptn_all_num = re.compile(f"^[{self.all_num}]+$")
+        # "十?" is for special case "十一万三"
+        # NOTE(review): the pattern below uses ``{0,2}`` numerals before each unit rather than "十?" — confirm the comment above is still accurate.
+        self.ptn_speaking_mode = re.compile(f"^([{self.all_num}]{{0,2}}[{self.all_unit}])+[{self.all_num}]$")
+    def cn2an(self, inputs: Union[str, int, float] = None, mode: str = "strict") -> Union[float, int]:
+        """Convert Chinese numerals to an Arabic number.
+
+        :param inputs: Chinese numerals, Arabic numerals, or a mix of both
+        :param mode: ``strict``, ``normal`` or ``smart``
+        :return: the Arabic number
+        :raises ValueError: if ``inputs`` is ``None`` or an empty string, or
+            ``mode`` is not supported
+        """
+        # Reject both None and "": the previous test ``is not None or == ""``
+        # let the empty string through to the validation code.
+        if inputs is None or inputs == "":
+            raise ValueError("输入数据为空！")
+        if mode not in self.mode_list:
+            raise ValueError(f"mode 仅支持 {str(self.mode_list)} ！")
+        # Convert numbers to strings
+        if not isinstance(inputs, str):
+            inputs = str(inputs)
+        # Pre-processing (disabled):
+        # 1. traditional to simplified Chinese
+        # 2. full-width to half-width characters
+        # inputs = preprocess(inputs, pipelines=[
+        #     "traditional_to_simplified",
+        #     "full_angle_to_half_angle"
+        # ])
+        # Special conversion: 廿
+        inputs = inputs.replace("廿", "二十")
+        # Check that the input data is valid
+        sign, integer_data, decimal_data, is_all_num = self.__check_input_data_is_valid(inputs, mode)
+        # Special case in smart mode: sign == 0 means integer_data already holds the result
+        if sign == 0:
+            return integer_data
+        if not is_all_num:
+            if decimal_data is None:
+                output = self.__integer_convert(integer_data)
+            else:
+                output = self.__integer_convert(integer_data) + self.__decimal_convert(decimal_data)
+                # fix 1 + 0.57 = 1.5699999999999998
+                output = round(output, len(decimal_data))
+        else:
+            if decimal_data is None:
+                output = self.__direct_convert(integer_data)
+            else:
+                output = self.__direct_convert(integer_data) + self.__decimal_convert(decimal_data)
+                # fix 1 + 0.57 = 1.5699999999999998
+                output = round(output, len(decimal_data))
+        return sign * output
+    def __get_pattern(self) -> dict:
+        """Build the compiled validation patterns for the strict and normal modes.
+
+        :return: ``{"strict": {"int": ..., "dec": ...}, "normal": {"int": ..., "dec": ...}}``
+            where every leaf is a compiled regular expression
+        """
+        # Strict integer check: each pattern is composed from the smaller ranges defined before it.
+        _0 = "[零]"
+        _1_9 = "[一二三四五六七八九]"
+        _10_99 = f"{_1_9}?[十]{_1_9}?"
+        _1_99 = f"({_10_99}|{_1_9})"
+        _100_999 = f"({_1_9}[百]([零]{_1_9})?|{_1_9}[百]{_10_99})"
+        _1_999 = f"({_100_999}|{_1_99})"
+        _1000_9999 = f"({_1_9}[千]([零]{_1_99})?|{_1_9}[千]{_100_999})"
+        _1_9999 = f"({_1000_9999}|{_1_999})"
+        _10000_99999999 = f"({_1_9999}[万]([零]{_1_999})?|{_1_9999}[万]{_1000_9999})"
+        _1_99999999 = f"({_10000_99999999}|{_1_9999})"
+        _100000000_9999999999999999 = f"({_1_99999999}[亿]([零]{_1_99999999})?|{_1_99999999}[亿]{_10000_99999999})"
+        _1_9999999999999999 = f"({_100000000_9999999999999999}|{_1_99999999})"
+        str_int_pattern = f"^({_0}|{_1_9999999999999999})$"
+        nor_int_pattern = f"^({_0}|{_1_9999999999999999})$"
+        str_dec_pattern = "^[零一二三四五六七八九]{0,15}[一二三四五六七八九]$"
+        nor_dec_pattern = "^[零一二三四五六七八九]{0,16}$"
+        # Substitute every key of the strict/normal tables with its mapped string inside the pattern text.
+        for str_num in self.strict_cn_number.keys():
+            str_int_pattern = str_int_pattern.replace(str_num, self.strict_cn_number[str_num])
+            str_dec_pattern = str_dec_pattern.replace(str_num, self.strict_cn_number[str_num])
+        for nor_num in self.normal_cn_number.keys():
+            nor_int_pattern = nor_int_pattern.replace(nor_num, self.normal_cn_number[nor_num])
+            nor_dec_pattern = nor_dec_pattern.replace(nor_num, self.normal_cn_number[nor_num])
+        pattern_dict = {
+            "strict": {
+                "int": re.compile(str_int_pattern),
+                "dec": re.compile(str_dec_pattern)
+            },
+            "normal": {
+                "int": re.compile(nor_int_pattern),
+                "dec": re.compile(nor_dec_pattern)
+            }
+        }
+        return pattern_dict
+    def __copy_num(self, num):
+        """Map every digit of ``num`` to its entry in ``NUMBER_LOW_AN2CN`` and concatenate the results."""
+        return "".join(NUMBER_LOW_AN2CN[int(digit)] for digit in num)
+    def __check_input_data_is_valid(self, check_data: str, mode: str) -> (int, str, str, bool):
+        # 去除 元整、圆整、元正、圆正
+        stop_words = ["元整", "圆整", "元正", "圆正"]
+        for word in stop_words:
+            if check_data[-2:] == word:
+                check_data = check_data[:-2]
+        # 去除 元、圆
+        if mode != "strict":
+            normal_stop_words = ["圆", "元"]
+            for word in normal_stop_words:
+                if check_data[-1] == word:
+                    check_data = check_data[:-1]
+        # 处理元角分
+        result = self.yjf_pattern.search(check_data)
+        if result:
+            check_data = check_data.replace("元", "点").replace("角", "").replace("分", "")
+        # 处理特殊问法：一千零十一 一万零百一十一
+        if "零十" in check_data:
+            check_data = check_data.replace("零十", "零一十")
+        if "零百" in check_data:
+            check_data = check_data.replace("零百", "零一百")
+        for data in check_data:
+            if data not in self.check_key_dict[mode]:
+                raise ValueError(f"当前为{mode}模式，输入的数据不在转化范围内：{data}！")
+        # 确定正负号
+        if check_data[0] == "负":
+            check_data = check_data[1:]
+            sign = -1
+        else:
+            sign = 1
+        if "点" in check_data:
+            split_data = check_data.split("点")
+            if len(split_data) == 2:
+                integer_data, decimal_data = split_data
+                # 将 smart 模式中的阿拉伯数字转化成中文数字
+                if mode == "smart":
+                    integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
+                    decimal_data = re.sub(r"\d+", lambda x: self.__copy_num(x.group()), decimal_data)
+                    mode = "normal"
+            else:
+                raise ValueError("数据中包含不止一个点！")
+        else:
+            integer_data = check_data
+            decimal_data = None
+            # 将 smart 模式中的阿拉伯数字转化成中文数字
+            if mode == "smart":
+                # 10.1万 10.1
+                result1 = self.pattern1.search(integer_data)
+                if result1:
+                    if result1.group() == integer_data:
+                        if integer_data[-1] in UNIT_CN2AN.keys():
+                            output = int(float(integer_data[:-1]) * UNIT_CN2AN[integer_data[-1]])
+                        else:
+                            output = float(integer_data)
+                        return 0, output, None, None
+                integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
+                mode = "normal"
+        result_int = self.pattern_dict[mode]["int"].search(integer_data)
+        if result_int:
+            if result_int.group() == integer_data:
+                if decimal_data is not None:
+                    result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
+                    if result_dec:
+                        if result_dec.group() == decimal_data:
+                            return sign, integer_data, decimal_data, False
+                else:
+                    return sign, integer_data, decimal_data, False
+        else:
+            if mode == "strict":
+                raise ValueError(f"不符合格式的数据：{integer_data}")
+            elif mode == "normal":
+                # 纯数模式：一二三
+                result_all_num = self.ptn_all_num.search(integer_data)
+                if result_all_num:
+                    if result_all_num.group() == integer_data:
+                        if decimal_data is not None:
+                            result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
+                            if result_dec:
+                                if result_dec.group() == decimal_data:
+                                    return sign, integer_data, decimal_data, True
+                        else:
+                            return sign, integer_data, decimal_data, True
+                # 口语模式：一万二，两千三，三百四，十三万六，一百二十五万三
+                result_speaking_mode = self.ptn_speaking_mode.search(integer_data)
+                if len(integer_data) >= 3 and result_speaking_mode and result_speaking_mode.group() == integer_data:
+                    # len(integer_data)>=3: because the minimum length of integer_data that can be matched is 3
+                    # to find the last unit
+                    last_unit = result_speaking_mode.groups()[-1][-1]
+                    _unit = UNIT_LOW_AN2CN[UNIT_CN2AN[last_unit] // 10]
+                    integer_data = integer_data + _unit
+                    if decimal_data is not None:
+                        result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
+                        if result_dec:
+                            if result_dec.group() == decimal_data:
+                                return sign, integer_data, decimal_data, False
+                    else:
+                        return sign, integer_data, decimal_data, False
+        raise ValueError(f"不符合格式的数据：{check_data}")
+    def __integer_convert(self, integer_data: str) -> int:
+        # 核心
+        output_integer = 0
+        unit = 1
+        ten_thousand_unit = 1
+        for index, cn_num in enumerate(reversed(integer_data)):
+            # 数值
+            if cn_num in NUMBER_CN2AN:
+                num = NUMBER_CN2AN[cn_num]
+                output_integer += num * unit
+            # 单位
+            elif cn_num in UNIT_CN2AN:
+                unit = UNIT_CN2AN[cn_num]
+                # 判断出万、亿、万亿
+                if unit % 10000 == 0:
+                    # 万 亿
+                    if unit > ten_thousand_unit:
+                        ten_thousand_unit = unit
+                    # 万亿
+                    else:
+                        ten_thousand_unit = unit * ten_thousand_unit
+                        unit = ten_thousand_unit
+                if unit < ten_thousand_unit:
+                    unit = unit * ten_thousand_unit
+                if index == len(integer_data) - 1:
+                    output_integer += unit
+            else:
+                raise ValueError(f"{cn_num} 不在转化范围内")
+        return int(output_integer)
+    def __decimal_convert(self, decimal_data: str) -> float:
+        len_decimal_data = len(decimal_data)
+        if len_decimal_data > 16:
+            print(f"注意：小数部分长度为 {len_decimal_data} ，将自动截取前 16 位有效精度！")
+            decimal_data = decimal_data[:16]
+            len_decimal_data = 16
+        output_decimal = 0
+        for index in range(len(decimal_data) - 1, -1, -1):
+            unit_key = NUMBER_CN2AN[decimal_data[index]]
+            output_decimal += unit_key * 10 ** -(index + 1)
+        # 处理精度溢出问题
+        output_decimal = round(output_decimal, len_decimal_data)
+        return output_decimal
+    def __direct_convert(self, data: str) -> int:
+        output_data = 0
+        for index in range(len(data) - 1, -1, -1):
+            unit_key = NUMBER_CN2AN[data[index]]
+            output_data += unit_key * 10 ** (len(data) - index - 1)
+        return output_data
--- a/text/cn2an/conf.py
+++ b/text/cn2an/conf.py
+NUMBER_CN2AN = {
+    "零": 0,
+    "〇": 0,
+    "一": 1,
+    "壹": 1,
+    "幺": 1,
+    "二": 2,
+    "贰": 2,
+    "两": 2,
+    "三": 3,
+    "叁": 3,
+    "四": 4,
+    "肆": 4,
+    "五": 5,
+    "伍": 5,
+    "六": 6,
+    "陆": 6,
+    "七": 7,
+    "柒": 7,
+    "八": 8,
+    "捌": 8,
+    "九": 9,
+    "玖": 9,
+}
+# Chinese unit character -> multiplier. 拾/佰/仟 are alternative spellings that
+# map to the same values as 十/百/千.
+UNIT_CN2AN = {
+    "十": 10,
+    "拾": 10,
+    "百": 100,
+    "佰": 100,
+    "千": 1000,
+    "仟": 1000,
+    "万": 10000,
+    "亿": 100000000,
+}
+# Multiplier -> unit character (the 十/百/千 spellings); the reverse direction
+# of UNIT_CN2AN for one spelling per value.
+UNIT_LOW_AN2CN = {
+    10: "十",
+    100: "百",
+    1000: "千",
+    10000: "万",
+    100000000: "亿",
+}
+# Digit 0-9 -> Chinese digit character, "low" spelling (一二三…).
+NUMBER_LOW_AN2CN = {
+    0: "零",
+    1: "一",
+    2: "二",
+    3: "三",
+    4: "四",
+    5: "五",
+    6: "六",
+    7: "七",
+    8: "八",
+    9: "九",
+}
+# Digit 0-9 -> Chinese digit character, "up" spelling (壹贰叁…).
+NUMBER_UP_AN2CN = {
+    0: "零",
+    1: "壹",
+    2: "贰",
+    3: "叁",
+    4: "肆",
+    5: "伍",
+    6: "陆",
+    7: "柒",
+    8: "捌",
+    9: "玖",
+}
+# Unit characters ("low" spelling) in a fixed order of 16 entries; index 0 is
+# the empty string.
+# NOTE(review): presumably indexed by digit position counted from the
+# least-significant end (0 = ones, 4 = 万, 8 = 亿) -- confirm against the
+# an2cn converter that consumes this list.
+UNIT_LOW_ORDER_AN2CN = [
+    "",
+    "十",
+    "百",
+    "千",
+    "万",
+    "十",
+    "百",
+    "千",
+    "亿",
+    "十",
+    "百",
+    "千",
+    "万",
+    "十",
+    "百",
+    "千",
+]
+# Same layout as UNIT_LOW_ORDER_AN2CN, using the 拾/佰/仟 spellings; index 0 is
+# the empty string.
+# NOTE(review): presumably indexed by digit position counted from the
+# least-significant end -- confirm against the an2cn converter.
+UNIT_UP_ORDER_AN2CN = [
+    "",
+    "拾",
+    "佰",
+    "仟",
+    "万",
+    "拾",
+    "佰",
+    "仟",
+    "亿",
+    "拾",
+    "佰",
+    "仟",
+    "万",
+    "拾",
+    "佰",
+    "仟",
+]
+# Canonical character -> string of every spelling accepted for it in the
+# strict alphabet (no 〇, 幺, 两 or 仨 here, unlike NORMAL_CN_NUMBER).
+# NOTE(review): presumably the source of the per-mode character whitelist and
+# patterns used by the cn2an validator -- confirm in cn2an.py.
+STRICT_CN_NUMBER = {
+    "零": "零",
+    "一": "一壹",
+    "二": "二贰",
+    "三": "三叁",
+    "四": "四肆",
+    "五": "五伍",
+    "六": "六陆",
+    "七": "七柒",
+    "八": "八捌",
+    "九": "九玖",
+    "十": "十拾",
+    "百": "百佰",
+    "千": "千仟",
+    "万": "万",
+    "亿": "亿",
+}
+# Canonical character -> string of every spelling accepted for it in the
+# normal alphabet; a superset of STRICT_CN_NUMBER that also allows 〇, 幺, 两
+# and 仨.
+# NOTE(review): presumably the source of the per-mode character whitelist and
+# patterns used by the cn2an validator -- confirm in cn2an.py.
+NORMAL_CN_NUMBER = {
+    "零": "零〇",
+    "一": "一壹幺",
+    "二": "二贰两",
+    "三": "三叁仨",
+    "四": "四肆",
+    "五": "五伍",
+    "六": "六陆",
+    "七": "七柒",
+    "八": "八捌",
+    "九": "九玖",
+    "十": "十拾",
+    "百": "百佰",
+    "千": "千仟",
+    "万": "万",
+    "亿": "亿",
+}
--- a/text/cn2an/transform.py
+++ b/text/cn2an/transform.py
+import re
+from .cn2an import Cn2An
+from .an2cn import An2Cn
+from .conf import UNIT_CN2AN
+class Transform(object):
+    def __init__(self) -> None:
+        self.all_num = "零一二三四五六七八九"
+        self.all_unit = "".join(list(UNIT_CN2AN.keys()))
+        self.cn2an = Cn2An().cn2an
+        self.an2cn = An2Cn().an2cn
+        self.cn_pattern = f"负?([{self.all_num}{self.all_unit}]+点)?[{self.all_num}{self.all_unit}]+"
+        self.smart_cn_pattern = f"-?([0-9]+.)?[0-9]+[{self.all_unit}]+"
+    def transform(self, inputs: str, method: str = "cn2an") -> str:
+        if method == "cn2an":
+            inputs = inputs.replace("廿", "二十").replace("半", "0.5").replace("两", "2")
+            # date
+            inputs = re.sub(
+                fr"((({self.smart_cn_pattern})|({self.cn_pattern}))年)?([{self.all_num}十]+月)?([{self.all_num}十]+日)?",
+                lambda x: self.__sub_util(x.group(), "cn2an", "date"), inputs)
+            # fraction
+            inputs = re.sub(fr"{self.cn_pattern}分之{self.cn_pattern}",
+                            lambda x: self.__sub_util(x.group(), "cn2an", "fraction"), inputs)
+            # percent
+            inputs = re.sub(fr"百分之{self.cn_pattern}",
+                            lambda x: self.__sub_util(x.group(), "cn2an", "percent"), inputs)
+            # celsius
+            inputs = re.sub(fr"{self.cn_pattern}摄氏度",
+                            lambda x: self.__sub_util(x.group(), "cn2an", "celsius"), inputs)
+            # number
+            output = re.sub(self.cn_pattern,
+                            lambda x: self.__sub_util(x.group(), "cn2an", "number"), inputs)
+        elif method == "an2cn":
+            # date
+            inputs = re.sub(r"(\d{2,4}年)?(\d{1,2}月)?(\d{1,2}日)?",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "date"), inputs)
+            # fraction
+            inputs = re.sub(r"\d+/\d+",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "fraction"), inputs)
+            # percent
+            inputs = re.sub(r"-?(\d+\.)?\d+%",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "percent"), inputs)
+            # celsius
+            inputs = re.sub(r"\d+℃",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "celsius"), inputs)
+            # number
+            output = re.sub(r"-?(\d+\.)?\d+",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "number"), inputs)
+        else:
+            raise ValueError(f"error method: {method}, only support 'cn2an' and 'an2cn'!")
+        return output
+    def __sub_util(self, inputs, method: str = "cn2an", sub_mode: str = "number") -> str:
+        try:
+            if inputs:
+                if method == "cn2an":
+                    if sub_mode == "date":
+                        return re.sub(fr"(({self.smart_cn_pattern})|({self.cn_pattern}))",
+                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs)
+                    elif sub_mode == "fraction":
+                        if inputs[0] != "百":
+                            frac_result = re.sub(self.cn_pattern,
+                                                 lambda x: str(self.cn2an(x.group(), "smart")), inputs)
+                            numerator, denominator = frac_result.split("分之")
+                            return f"{denominator}/{numerator}"
+                        else:
+                            return inputs
+                    elif sub_mode == "percent":
+                        return re.sub(f"(?<=百分之){self.cn_pattern}",
+                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs).replace("百分之", "") + "%"
+                    elif sub_mode == "celsius":
+                        return re.sub(f"{self.cn_pattern}(?=摄氏度)",
+                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs).replace("摄氏度", "℃")
+                    elif sub_mode == "number":
+                        return str(self.cn2an(inputs, "smart"))
+                    else:
+                        raise Exception(f"error sub_mode: {sub_mode} !")
+                else:
+                    if sub_mode == "date":
+                        inputs = re.sub(r"\d+(?=年)",
+                                        lambda x: self.an2cn(x.group(), "direct"), inputs)
+                        return re.sub(r"\d+",
+                                      lambda x: self.an2cn(x.group(), "low"), inputs)
+                    elif sub_mode == "fraction":
+                        frac_result = re.sub(r"\d+", lambda x: self.an2cn(x.group(), "low"), inputs)
+                        numerator, denominator = frac_result.split("/")
+                        return f"{denominator}分之{numerator}"
+                    elif sub_mode == "celsius":
+                        return self.an2cn(inputs[:-1], "low") + "摄氏度"
+                    elif sub_mode == "percent":
+                        return "百分之" + self.an2cn(inputs[:-1], "low")
+                    elif sub_mode == "number":
+                        return self.an2cn(inputs, "low")
+                    else:
+                        raise Exception(f"error sub_mode: {sub_mode} !")
+        except Exception as e:
+            print(f"WARN: {e}")
+            return inputs
--- a/text/cnm3/ds_CNM3.txt
+++ b/text/cnm3/ds_CNM3.txt
+a,a
+ai,ai
+ai0,a0 I0
+an,an
+an0,a0 N0
+ang,ang
+ang0,A0 ng0
+ao,ao
+ao0,A0 O0
+ba,b a
+bai,b a0 I0
+ban,b a0 N0
+bang,b A0 ng0
+bao,b A0 O0
+be,b e
+bei,b E0 I0
+ben,b e0 N0
+beng,b e0 ng0
+ber,b er
+bi,b i
+bia,b ia
+bian,b iE0 N0
+biang,b iA0 ng0
+biao,b iA0 O0
+bie,b ie
+bin,b i N0
+bing,b i ng0
+biong,b iO0 ng0
+biu,b io0 U0
+bo,b o
+bong,b oo0 ng0
+bou,b o0 U0
+bu,b u
+bua,b ua
+buai,b ua0 I0
+buan,b ua0 N0
+buang,b uA0 ng0
+bui,b uE0 I0
+bun,b ue0 N0
+bv,b v
+bve,b ve
+ca,c a
+cai,c a0 I0
+can,c a0 N0
+cang,c A0 ng0
+cao,c A0 O0
+ce,c e
+cei,c E0 I0
+cen,c e0 N0
+ceng,c e0 ng0
+cer,c er
+cha,ch a
+chai,ch a0 I0
+chan,ch a0 N0
+chang,ch A0 ng0
+chao,ch A0 O0
+che,ch e
+chei,ch E0 I0
+chen,ch e0 N0
+cheng,ch e0 ng0
+cher,ch er
+chi,ch ir
+chong,ch oo0 ng0
+chou,ch o0 U0
+chu,ch u
+chua,ch ua
+chuai,ch ua0 I0
+chuan,ch ua0 N0
+chuang,ch uA0 ng0
+chui,ch uE0 I0
+chun,ch ue0 N0
+chuo,ch uo
+chv,ch v
+chyi,ch i
+ci,c i0
+cong,c oo0 ng0
+cou,c o0 U0
+cu,c u
+cua,c ua
+cuai,c ua0 I0
+cuan,c ua0 N0
+cuang,c uA0 ng0
+cui,c uE0 I0
+cun,c ue0 N0
+cuo,c uo
+cv,c v
+cyi,c i
+da,d a
+dai,d a0 I0
+dan,d a0 N0
+dang,d A0 ng0
+dao,d A0 O0
+de,d e
+dei,d E0 I0
+den,d e0 N0
+deng,d e0 ng0
+der,d er
+di,d i
+dia,d ia
+dian,d iE0 N0
+diang,d iA0 ng0
+diao,d iA0 O0
+die,d ie
+din,d i N0
+ding,d i ng0
+diong,d iO0 ng0
+diu,d io0 U0
+dong,d oo0 ng0
+dou,d o0 U0
+du,d u
+dua,d ua
+duai,d ua0 I0
+duan,d ua0 N0
+duang,d uA0 ng0
+dui,d uE0 I0
+dun,d ue0 N0
+duo,d uo
+dv,d v
+dve,d ve
+e,e
+ei,E0 I0
+en,e0 N0
+eng,e0 ng0
+er,er
+fa,f a
+fai,f a0 I0
+fan,f a0 N0
+fang,f A0 ng0
+fao,f A0 O0
+fe,f e
+fei,f E0 I0
+fen,f e0 N0
+feng,f e0 ng0
+fer,f er
+fi,f i
+fia,f ia
+fian,f iE0 N0
+fiang,f iA0 ng0
+fiao,f iA0 O0
+fie,f ie
+fin,f i N0
+fing,f i ng0
+fiong,f iO0 ng0
+fiu,f io0 U0
+fo,f o
+fong,f oo0 ng0
+fou,f o0 U0
+fu,f u
+fua,f ua
+fuai,f ua0 I0
+fuan,f ua0 N0
+fuang,f uA0 ng0
+fui,f uE0 I0
+fun,f ue0 N0
+fv,f v
+fve,f ve
+ga,g a
+gai,g a0 I0
+gan,g a0 N0
+gang,g A0 ng0
+gao,g A0 O0
+ge,g e
+gei,g E0 I0
+gen,g e0 N0
+geng,g e0 ng0
+ger,g er
+gi,g i
+gia,g ia
+gian,g iE0 N0
+giang,g iA0 ng0
+giao,g iA0 O0
+gie,g ie
+gin,g i N0
+ging,g i ng0
+giong,g iO0 ng0
+giu,g io0 U0
+gong,g oo0 ng0
+gou,g o0 U0
+gu,g u
+gua,g ua
+guai,g ua0 I0
+guan,g ua0 N0
+guang,g uA0 ng0
+gui,g uE0 I0
+gun,g ue0 N0
+guo,g uo
+gv,g v
+gve,g ve
+ha,h a
+hai,h a0 I0
+han,h a0 N0
+hang,h A0 ng0
+hao,h A0 O0
+he,h e
+hei,h E0 I0
+hen,h e0 N0
+heng,h e0 ng0
+her,h er
+hi,h i
+hia,h ia
+hian,h iE0 N0
+hiang,h iA0 ng0
+hiao,h iA0 O0
+hie,h ie
+hin,h i N0
+hing,h i ng0
+hiong,h iO0 ng0
+hiu,h io0 U0
+hong,h oo0 ng0
+hou,h o0 U0
+hu,h u
+hua,h ua
+huai,h ua0 I0
+huan,h ua0 N0
+huang,h uA0 ng0
+hui,h uE0 I0
+hun,h ue0 N0
+huo,h uo
+hv,h v
+hve,h ve
+ji,j i
+jia,j ia
+jian,j iE0 N0
+jiang,j iA0 ng0
+jiao,j iA0 O0
+jie,j ie
+jin,j i N0
+jing,j i ng0
+jiong,j iO0 ng0
+jiu,j io0 U0
+ju,j v
+juan,j vE0 N0
+jue,j ve
+jun,j v0 N0
+ka,k a
+kai,k a0 I0
+kan,k a0 N0
+kang,k A0 ng0
+kao,k A0 O0
+ke,k e
+kei,k E0 I0
+ken,k e0 N0
+keng,k e0 ng0
+ker,k er
+ki,k i
+kia,k ia
+kian,k iE0 N0
+kiang,k iA0 ng0
+kiao,k iA0 O0
+kie,k ie
+kin,k i N0
+king,k i ng0
+kiong,k iO0 ng0
+kiu,k io0 U0
+kong,k oo0 ng0
+kou,k o0 U0
+ku,k u
+kua,k ua
+kuai,k ua0 I0
+kuan,k ua0 N0
+kuang,k uA0 ng0
+kui,k uE0 I0
+kun,k ue0 N0
+kuo,k uo
+kv,k v
+kve,k ve
+la,l a
+lai,l a0 I0
+lan,l a0 N0
+lang,l A0 ng0
+lao,l A0 O0
+le,l e
+lei,l E0 I0
+len,l e0 N0
+leng,l e0 ng0
+ler,l er
+li,l i
+lia,l ia
+lian,l iE0 N0
+liang,l iA0 ng0
+liao,l iA0 O0
+lie,l ie
+lin,l i N0
+ling,l i ng0
+liong,l iO0 ng0
+liu,l io0 U0
+lo,l o
+long,l oo0 ng0
+lou,l o0 U0
+lu,l u
+lua,l ua
+luai,l ua0 I0
+luan,l ua0 N0
+luang,l uA0 ng0
+lui,l uE0 I0
+lun,l ue0 N0
+luo,l uo
+lv,l v
+lve,l ve
+ma,m a
+mai,m a0 I0
+man,m a0 N0
+mang,m A0 ng0
+mao,m A0 O0
+me,m e
+mei,m E0 I0
+men,m e0 N0
+meng,m e0 ng0
+mer,m er
+mi,m i
+mia,m ia
+mian,m iE0 N0
+miang,m iA0 ng0
+miao,m iA0 O0
+mie,m ie
+min,m i N0
+ming,m i ng0
+miong,m iO0 ng0
+miu,m io0 U0
+mo,m o
+mong,m oo0 ng0
+mou,m o0 U0
+mu,m u
+mua,m ua
+muai,m ua0 I0
+muan,m ua0 N0
+muang,m uA0 ng0
+mui,m uE0 I0
+mun,m ue0 N0
+mv,m v
+mve,m ve
+n,ng
+na,n a
+nai,n a0 I0
+nan,n a0 N0
+nang,n A0 ng0
+nao,n A0 O0
+ne,n e
+nei,n E0 I0
+nen,n e0 N0
+neng,n e0 ng0
+ner,n er
+ni,n i
+nia,n ia
+nian,n iE0 N0
+niang,n iA0 ng0
+niao,n iA0 O0
+nie,n ie
+nin,n i N0
+ning,n i ng0
+niong,n iO0 ng0
+niu,n io0 U0
+nong,n oo0 ng0
+nou,n o0 U0
+nu,n u
+nua,n ua
+nuai,n ua0 I0
+nuan,n ua0 N0
+nuang,n uA0 ng0
+nui,n uE0 I0
+nun,n ue0 N0
+nuo,n uo
+nv,n v
+nve,n ve
+o,o
+ong,ong
+ou,ou
+pa,p a
+pai,p a0 I0
+pan,p a0 N0
+pang,p A0 ng0
+pao,p A0 O0
+pe,p e
+pei,p E0 I0
+pen,p e0 N0
+peng,p e0 ng0
+per,p er
+pi,p i
+pia,p ia
+pian,p iE0 N0
+piang,p iA0 ng0
+piao,p iA0 O0
+pie,p ie
+pin,p i N0
+ping,p i ng0
+piong,p iO0 ng0
+piu,p io0 U0
+po,p o
+pong,p oo0 ng0
+pou,p o0 U0
+pu,p u
+pua,p ua
+puai,p ua0 I0
+puan,p ua0 N0
+puang,p uA0 ng0
+pui,p uE0 I0
+pun,p ue0 N0
+pv,p v
+pve,p ve
+qi,q i
+qia,q ia
+qian,q iE0 N0
+qiang,q iA0 ng0
+qiao,q iA0 O0
+qie,q ie
+qin,q i N0
+qing,q i ng0
+qiong,q iO0 ng0
+qiu,q io0 U0
+qu,q v
+quan,q vE0 N0
+que,q ve
+qun,q v0 N0
+ra,r a
+rai,r a0 I0
+ran,r a0 N0
+rang,r A0 ng0
+rao,r A0 O0
+re,r e
+rei,r E0 I0
+ren,r e0 N0
+reng,r e0 ng0
+rer,r er
+ri,r ir
+rong,r oo0 ng0
+rou,r o0 U0
+ru,r u
+rua,r ua
+ruai,r ua0 I0
+ruan,r ua0 N0
+ruang,r uA0 ng0
+rui,r uE0 I0
+run,r ue0 N0
+ruo,r uo
+rv,r v
+ryi,r i
+sa,s a
+sai,s a0 I0
+san,s a0 N0
+sang,s A0 ng0
+sao,s A0 O0
+se,s e
+sei,s E0 I0
+sen,s e0 N0
+seng,s e0 ng0
+ser,s er
+sha,sh a
+shai,sh a0 I0
+shan,sh a0 N0
+shang,sh A0 ng0
+shao,sh A0 O0
+she,sh e
+shei,sh E0 I0
+shen,sh e0 N0
+sheng,sh e0 ng0
+sher,sh er
+shi,sh ir
+shong,sh oo0 ng0
+shou,sh o0 U0
+shu,sh u
+shua,sh ua
+shuai,sh ua0 I0
+shuan,sh ua0 N0
+shuang,sh uA0 ng0
+shui,sh uE0 I0
+shun,sh ue0 N0
+shuo,sh uo
+shv,sh v
+shyi,sh i
+si,s i0
+song,s oo0 ng0
+sou,s o0 U0
+su,s u
+sua,s ua
+suai,s ua0 I0
+suan,s ua0 N0
+suang,s uA0 ng0
+sui,s uE0 I0
+sun,s ue0 N0
+suo,s uo
+sv,s v
+syi,s i
+ta,t a
+tai,t a0 I0
+tan,t a0 N0
+tang,t A0 ng0
+tao,t A0 O0
+te,t e
+tei,t E0 I0
+ten,t e0 N0
+teng,t e0 ng0
+ter,t er
+ti,t i
+tia,t ia
+tian,t iE0 N0
+tiang,t iA0 ng0
+tiao,t iA0 O0
+tie,t ie
+tin,t i N0
+ting,t i ng0
+tiong,t iO0 ng0
+tong,t oo0 ng0
+tou,t o0 U0
+tu,t u
+tua,t ua
+tuai,t ua0 I0
+tuan,t ua0 N0
+tuang,t uA0 ng0
+tui,t uE0 I0
+tun,t ue0 N0
+tuo,t uo
+tv,t v
+tve,t ve
+wa,w a
+wai,w a0 I0
+wan,w a0 N0
+wang,w A0 ng0
+wao,w A0 O0
+we,w e
+wei,w E0 I0
+wen,w e0 N0
+weng,w e0 ng0
+wer,w er
+wi,w i
+wo,w o
+wong,w oo0 ng0
+wou,w o0 U0
+wu,w u
+xi,x i
+xia,x ia
+xian,x iE0 N0
+xiang,x iA0 ng0
+xiao,x iA0 O0
+xie,x ie
+xin,x i N0
+xing,x i ng0
+xiong,x iO0 ng0
+xiu,x io0 U0
+xu,x v
+xuan,x vE0 N0
+xue,x ve
+xun,x v0 N0
+ya,y a
+yai,y a0 I0
+yan,y iE0 N0
+yang,y A0 ng0
+yao,y A0 O0
+ye,y E
+yei,y E0 I0
+yi,y i
+yin,y i N0
+ying,y i ng0
+yo,y o
+yong,y oo0 ng0
+you,y o0 U0
+yu,y v
+yuan,y vE0 N0
+yue,y ve
+yun,y v0 N0
+ywu,y u
+za,z a
+zai,z a0 I0
+zan,z a0 N0
+zang,z A0 ng0
+zao,z A0 O0
+ze,z e
+zei,z E0 I0
+zen,z e0 N0
+zeng,z e0 ng0
+zer,z er
+zha,zh a
+zhai,zh a0 I0
+zhan,zh a0 N0
+zhang,zh A0 ng0
+zhao,zh A0 O0
+zhe,zh e
+zhei,zh E0 I0
+zhen,zh e0 N0
+zheng,zh e0 ng0
+zher,zh er
+zhi,zh ir
+zhong,zh oo0 ng0
+zhou,zh o0 U0
+zhu,zh u
+zhua,zh ua
+zhuai,zh ua0 I0
+zhuan,zh ua0 N0
+zhuang,zh uA0 ng0
+zhui,zh uE0 I0
+zhun,zh ue0 N0
+zhuo,zh uo
+zhv,zh v
+zhyi,zh i
+zi,z i0
+zong,z oo0 ng0
+zou,z o0 U0
+zu,z u
+zua,z ua
+zuai,z ua0 I0
+zuan,z ua0 N0
+zuang,z uA0 ng0
+zui,z uE0 I0
+zun,z ue0 N0
+zuo,z uo
+zv,z v
+zyi,z i
\ No newline at end of file
--- a/text/custom_pypinyin_dict/__init__.py
+++ b/text/custom_pypinyin_dict/__init__.py
+# -*- coding: utf-8 -*-
--- a/text/custom_pypinyin_dict/cc_cedict_0.py
+++ b/text/custom_pypinyin_dict/cc_cedict_0.py
--- a/text/custom_pypinyin_dict/cc_cedict_1.py
+++ b/text/custom_pypinyin_dict/cc_cedict_1.py
--- a/text/custom_pypinyin_dict/cc_cedict_2.py
+++ b/text/custom_pypinyin_dict/cc_cedict_2.py
--- a/text/custom_pypinyin_dict/cc_cedict_3.py
+++ b/text/custom_pypinyin_dict/cc_cedict_3.py
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+# Warning: Auto-generated file, don't edit.
+# Phrase -> pinyin readings, one inner list per character of the phrase.
+phrases_dict = {
+    '𰻝𰻝面': [['biáng'], ['biáng'], ['miàn']],
+}
+from pypinyin import load_phrases_dict
+def load():
+    """Pass this module's ``phrases_dict`` to pypinyin's ``load_phrases_dict``."""
+    load_phrases_dict(phrases_dict)