AiSHELL3.py 1.7 KB
Newer Older
chenzk's avatar
v1.0  
chenzk committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os

import re
from dataclasses import dataclass
import concurrent.futures

from tqdm.auto import tqdm

# download_link: https://www.openslr.org/93/
@dataclass
class DataConfig:
    dataset_path = './raw_datasets/Aishell3/train/wav'
    txt_path = './raw_datasets/Aishell3/train/content.txt'
    output_filelist_path = './filelists/aishell3.txt'

data_config = DataConfig()
    
def process_filelist(line):
    dir_name, audio_path, text = line
    input_audio_path = os.path.abspath(os.path.join(data_config.dataset_path, dir_name, audio_path))
    if os.path.exists(input_audio_path):
        return f'{input_audio_path}|{text}\n'

if __name__ == '__main__':
    filelist = []   
    results = []
    
    with open(data_config.txt_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            audio_path, text = line.strip().split(maxsplit=1)
            dir_name = audio_path[:7]
            text = re.sub(r'[a-zA-Z0-9\s]', '', text) # remove pinyin and tone
            filelist.append((dir_name, audio_path, text))
           
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(process_filelist, line) for line in filelist]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(filelist)):
            result = future.result()
            if result is not None:
                results.append(result)
                                 
    # make sure that the parent dir exists, raising error at the last step is quite terrible OVO
    os.makedirs(os.path.dirname(data_config.output_filelist_path), exist_ok=True)
    with open(data_config.output_filelist_path, 'w', encoding='utf-8') as f:
        f.writelines(results)