# genshin_zh_小虫哥ver.py

import os
import re
from dataclasses import dataclass
import concurrent.futures

from tqdm.auto import tqdm
import openpyxl # use to open excel. run ! pip install openpyxl 

# download_link: https://www.bilibili.com/read/cv23965717
@dataclass
class DataConfig:
    """Filesystem locations used when building the filelist.

    The attributes are annotated so that ``@dataclass`` registers them as
    real fields: the defaults are unchanged, but each one can now be
    overridden per instance, e.g. ``DataConfig(output_filelist_path=...)``.
    """

    # Root directory of the extracted voice pack.
    dataset_path: str = './raw_datasets/Genshin_chinese4.5/原神语音包4.5（中）'
    # Excel workbook that maps audio files to their text.
    excel_path: str = './raw_datasets/Genshin_chinese4.5/原神4.5语音包对应文本（中）.xlsx'
    # Destination of the generated filelist.
    output_filelist_path: str = './filelists/genshin_zh.txt'

# Lines containing any of these substrings generally do not match the recorded
# audio, so such lines are dropped entirely.
FORBIDDEN_TEXTS = ["……", "{NICKNAME}", "#", "(", ")", "♪", "test", "{0}", "█", "*", "+", "Gohus"]
# Substrings to rewrite inside a line, mapped to their replacement text.
REPLACEMENTS = {"$UNRELEASED": ""}
# Escape each entry so regex metacharacters such as "(", "*" and "+" are
# matched literally.
escaped_forbidden_texts = [re.escape(text) for text in FORBIDDEN_TEXTS]
# One alternation that matches any forbidden substring.
pattern = re.compile("|".join(escaped_forbidden_texts))

# Module-level configuration instance read by the script entry point below.
data_config = DataConfig()

def clean_text(text):
    """Return the cleaned text of a line, or None if the line must be skipped.

    The substitutions in ``REPLACEMENTS`` are applied first, and the
    substituted text is what gets filtered and returned. Applying them first
    matters because a marker such as ``$UNRELEASED`` contains ASCII letters
    and would otherwise cause the line to be rejected before the marker
    could be stripped.

    Args:
        text: Raw cell value for one line.

    Returns:
        The text with replacements applied, or None when ``text`` is not a
        string, still contains ASCII letters/digits, or contains any entry
        of ``FORBIDDEN_TEXTS``.
    """
    # Non-string cell values (e.g. numbers) cannot be regex-searched; skip
    # them instead of raising TypeError.
    if not isinstance(text, str):
        return None
    cleaned_text = text
    for old, new in REPLACEMENTS.items():
        cleaned_text = cleaned_text.replace(old, new)
    # Drop every line that contains ASCII letters or digits.
    if re.search(r'[A-Za-z0-9]', cleaned_text):
        return None
    if pattern.search(cleaned_text):
        return None
    return cleaned_text

def read_excel(excel):
    """Open the workbook and read the index columns of its first sheet.

    Args:
        excel: Path of the workbook to load.

    Returns:
        A tuple ``(workbook, npc_names, npc_audio_number)`` where the two
        lists hold the non-empty values of columns B and C of the first
        sheet, each with its first non-empty value dropped.
    """
    workbook = openpyxl.load_workbook(excel)
    index_sheet = workbook[workbook.sheetnames[0]]

    def column_values(letter):
        # Collect the truthy cell values, then discard the first of them.
        values = []
        for cell in index_sheet[letter]:
            if cell.value:
                values.append(cell.value)
        return values[1:]

    names = column_values('B')
    counts = column_values('C')
    return workbook, names, counts
    
def process_filelist(data):
    """Build one filelist line from an (audio name, text, NPC directory) tuple.

    Args:
        data: Tuple ``(audio_path, text, npc_path)``; the audio path is
            resolved relative to ``npc_path``.

    Returns:
        The newline-terminated string ``<absolute audio path>|<text>`` when
        the audio file exists and ``clean_text`` accepts the text,
        otherwise None.
    """
    audio_name, raw_text, npc_dir = data
    wav_path = os.path.abspath(os.path.join(npc_dir, audio_name))
    if not os.path.exists(wav_path):
        return None
    cleaned = clean_text(raw_text)
    if cleaned is None:
        return None
    return f'{wav_path}|{cleaned}\n'

if __name__ == '__main__':
    # Create the output directory before the long scan so that a bad output
    # location fails immediately instead of at the very last step. The guard
    # covers a bare filename, for which dirname() is '' and makedirs('')
    # would raise.
    output_dir = os.path.dirname(data_config.output_filelist_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    wb, npc_names, _ = read_excel(data_config.excel_path)
    datas_list = []

    for npc_name in tqdm(npc_names):
        sheet = wb[npc_name]
        npc_path = os.path.join(data_config.dataset_path, npc_name)
        # Pair column C (audio names) with column D (texts) row by row.
        # Filtering each column on its own and zipping afterwards would shift
        # every following pair as soon as one cell in either column is empty.
        rows = [
            (audio_cell.value, text_cell.value)
            for audio_cell, text_cell in zip(sheet['C'], sheet['D'])
            if audio_cell.value and text_cell.value
        ]
        # The first complete row is skipped (presumably the header row).
        datas_list.extend(
            (audio_name, text, npc_path) for audio_name, text in rows[1:]
        )

    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        # map() yields results in input order, so the generated filelist is
        # deterministic across runs; chunksize batches the work items instead
        # of creating one future per row.
        lines = executor.map(process_filelist, datas_list, chunksize=256)
        results = [
            line
            for line in tqdm(lines, total=len(datas_list))
            if line is not None
        ]

    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
        f.writelines(results)