# hifi_tts.py

import os
import json
from pathlib import Path
from dataclasses import dataclass
import concurrent.futures

from tqdm.auto import tqdm

# download_link: https://www.openslr.org/109/
@dataclass
class DataConfig:
    """Filesystem paths used when building the Hi-Fi TTS filelist.

    The attributes are annotated so that ``@dataclass`` registers them as real
    fields; un-annotated class attributes are ignored by the decorator. This
    lets callers override them, e.g. ``DataConfig(dataset_path='/data/hifi')``,
    while ``DataConfig()`` keeps the same defaults as before.
    """

    # Root directory of the extracted dataset.
    dataset_path: str = './raw_datasets/hi_fi_tts_v0'
    # Destination file for the generated filelist.
    output_filelist_path: str = './filelists/hifi_tts.txt'

# Module-level configuration instance read by process_filelist and the script entry point.
data_config = DataConfig()
    
def process_filelist(speaker, dataset_path=None):
    """Convert one JSON-lines manifest into ``audio_path|text`` filelist entries.

    Args:
        speaker: Path to a manifest file holding one JSON object per line; each
            object must provide ``audio_filepath`` and ``text_normalized`` keys.
        dataset_path: Directory that the ``audio_filepath`` values are relative
            to. Defaults to ``data_config.dataset_path`` when omitted.

    Returns:
        A list of newline-terminated ``'<absolute audio path>|<text>'`` strings,
        one for each manifest entry whose audio file exists on disk.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks ``audio_filepath`` or ``text_normalized``.
    """
    if dataset_path is None:
        # Resolved at call time so the module-level config stays the default.
        dataset_path = data_config.dataset_path

    filelist = []
    with open(speaker, 'r', encoding='utf-8') as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            if not raw_line:
                # Tolerate blank lines (e.g. a trailing newline at EOF) instead
                # of crashing inside json.loads.
                continue
            entry = json.loads(raw_line)
            audio_path = os.path.abspath(os.path.join(dataset_path, entry['audio_filepath']))
            text = entry['text_normalized']
            # Entries whose audio file is missing on disk are dropped.
            if os.path.exists(audio_path):
                filelist.append(f'{audio_path}|{text}\n')
    return filelist

if __name__ == '__main__':
    results = []

    # Create the output directory before scanning the dataset so that a bad
    # output path fails immediately rather than after every manifest has been
    # processed. Path.parent is '.' for a bare filename, whereas
    # os.path.dirname would return '' and make os.makedirs raise.
    output_path = Path(data_config.output_filelist_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    dataset_path = Path(data_config.dataset_path)
    # Every *.json file found under the dataset root is treated as a manifest.
    speakers = list(dataset_path.rglob('*.json'))

    with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
        futures = [executor.submit(process_filelist, speaker) for speaker in speakers]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(speakers)):
            # future.result() re-raises any exception raised inside the worker.
            results.extend(future.result())

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(results)