to_flac.py 1.58 KB
Newer Older
Lengyue's avatar
Lengyue committed
1
import random
Lengyue's avatar
Lengyue committed
2
3
import subprocess
from multiprocessing import Pool, cpu_count
Lengyue's avatar
Lengyue committed
4
5
from pathlib import Path

Lengyue's avatar
Lengyue committed
6
from tqdm import tqdm
Lengyue's avatar
Lengyue committed
7

Lengyue's avatar
Lengyue committed
8
9
10
11
12
13
14

def convert_to_flac(src_file_path: Path) -> bool:
    """Convert one audio file to FLAC alongside it and delete the original.

    The output path is ``src_file_path`` with its suffix replaced by
    ``.flac``. ffmpeg is invoked with ``-y``, so an existing output file is
    overwritten.

    Args:
        src_file_path: Path of the audio file to convert.

    Returns:
        True if ffmpeg exited successfully and the source file was removed.
        False if the path already ends in ``.flac``, ffmpeg could not be
        started, or ffmpeg exited with a non-zero status; in all of those
        cases the source file is left in place.
    """
    dst_file_path = src_file_path.with_suffix(".flac")

    # Input and output would be the same file: ffmpeg cannot convert in
    # place, and the unlink below would delete the only copy.
    if dst_file_path == src_file_path:
        return False

    dst_file_path.parent.mkdir(parents=True, exist_ok=True)

    command = [
        "ffmpeg",
        "-y",
        "-i",
        str(src_file_path),
        "-acodec",
        "flac",
        "-threads",
        "0",
        str(dst_file_path),
    ]

    try:
        subprocess.check_call(
            command,
            # Keep ffmpeg from reading the stdin it would otherwise inherit;
            # many parallel ffmpeg processes sharing one terminal can block
            # or swallow keystrokes.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError:
        # ffmpeg ran but failed: anything at the destination is truncated or
        # invalid, so do not leave it lying next to the intact source.
        try:
            dst_file_path.unlink()
        except FileNotFoundError:
            pass
        return False
    except OSError:
        # ffmpeg could not be launched at all (not installed, no permission,
        # out of process slots). Nothing was written; report a failure
        # instead of raising out of the worker.
        return False

    # Conversion succeeded: remove the input file.
    src_file_path.unlink()
    return True


if __name__ == "__main__":
    # Batch-convert every .wav file under the dataset directory to FLAC,
    # one file per worker task, and report how many succeeded and failed.
    src_dir = Path("dataset/tts/WenetSpeech/cleaned")

    wav_files = list(src_dir.rglob("*.wav"))
    # NOTE(review): the shuffle presumably spreads files of different sizes
    # evenly across workers; confirm before removing.
    random.shuffle(wav_files)
    print(f"Found {len(wav_files)} wav files")

    success_counter = 0
    fail_counter = 0

    # maxtasksperchild recycles each worker process after 100 conversions.
    with Pool(processes=cpu_count(), maxtasksperchild=100) as pool:
        with tqdm(
            pool.imap_unordered(convert_to_flac, wav_files), total=len(wav_files)
        ) as pbar:
            for success in pbar:
                if success:
                    success_counter += 1
                else:
                    fail_counter += 1

                # Refresh the running totals on every result so the progress
                # bar shows them live, not only after the last file.
                pbar.set_description(
                    f"Success: {success_counter}, Fail: {fail_counter}"
                )

    print(f"Successfully converted: {success_counter}")
    print(f"Failed conversions: {fail_counter}")