# libriTTS.py -- v1.0 (committed by chenzk)
import os
from pathlib import Path
from dataclasses import dataclass
import concurrent.futures

from tqdm.auto import tqdm

# LibriTTS dataset download link: https://openslr.org/60/
@dataclass
class DataConfig:
    """Input and output locations used to build the LibriTTS filelist.

    The attributes are annotated so that ``@dataclass`` registers them as
    real fields: they can be overridden through the constructor and they
    take part in the generated ``__repr__`` and ``__eq__``.
    """

    # Root directory that is searched recursively for ``*.wav`` files.
    dataset_path: str = './raw_datasets/LibriTTS/train-other-500'
    # Text file that receives one ``<wav path>|<text>`` line per utterance.
    output_filelist_path: str = './filelists/libri_tts.txt'

# Module-level configuration instance; the script entry point reads the
# dataset and output paths from it.
data_config = DataConfig()
    
def process_filelist(wav_path: Path):
    """Build the filelist entry for a single audio file.

    The transcript is looked up next to the audio file, under the same name
    with the final extension replaced by ``.normalized.txt``.

    Args:
        wav_path: Path of the audio file.

    Returns:
        The line ``<posix wav path>|<transcript>`` terminated by a newline,
        with leading and trailing whitespace stripped from the transcript,
        or ``None`` when the transcript file does not exist.
    """
    transcript_path = wav_path.with_suffix('.normalized.txt')
    if not transcript_path.exists():
        return None

    transcript = transcript_path.read_text(encoding='utf-8').strip()
    return f'{wav_path.as_posix()}|{transcript}\n'

if __name__ == '__main__':
    dataset_path = Path(data_config.dataset_path)
    output_path = Path(data_config.output_filelist_path)

    # Create the output directory before the expensive pass over the dataset,
    # so a bad output location fails immediately instead of at the very end.
    # Path.parent is '.' for a bare filename, which mkdir accepts with
    # exist_ok=True (os.makedirs('') would raise).
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Sort the inputs so the generated filelist is identical between runs;
    # the traversal order of rglob is not guaranteed.
    waves = sorted(dataset_path.rglob('*.wav'))

    with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
        # map() yields results in input order, and chunksize batches the
        # per-file tasks to cut inter-process overhead on large datasets.
        lines = executor.map(process_filelist, waves, chunksize=256)
        # Files without a transcript yield None and are left out.
        results = [
            line
            for line in tqdm(lines, total=len(waves))
            if line is not None
        ]

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(results)