statistics_audio_duration.py 3.17 KB
Newer Older
chenzk's avatar
v1.0  
chenzk committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *

# File path for a list of lost (missing) file names.
# NOTE(review): this name is not referenced anywhere else in the visible code — confirm it is still needed.
output_file_path = "lost_file_name.txt"

# Concatenate the dataset groups to analyse into one sequence; each element is
# later indexed with "chat_path" by the main loop below.
# Alternative selection kept for reference:
# datasets = NLP+HumanCentric+VideoQA+NaturalQA
datasets = VideoCap + OCRCap + NaturalCap

# A threading lock.
# NOTE(review): the previous comment here described "a list to store lost file
# names", which does not match this object; the lock is never acquired in the
# visible code — confirm whether it can be removed.
lock = threading.Lock()


def get_wav_duration(file_path):
    """Return the duration, in seconds, of the audio file at ``file_path``.

    The file is decoded with ``torchaudio.load``; the duration is the length of
    dimension 1 of the returned tensor divided by the returned sample rate.
    """
    samples, rate = torchaudio.load(file_path)
    num_frames = samples.size(1)
    return num_frames / rate


def check_audio(audio_file_name, audio_directory):
    """Measure one audio file and print it when it is longer than 200 seconds.

    The file is read from ``<audio_directory>/audio/<audio_file_name>``.

    Returns:
        The duration computed by ``get_wav_duration`` for that path.
    """
    full_path = os.path.join(audio_directory, "audio", audio_file_name)
    seconds = get_wav_duration(full_path)
    # Surface unusually long clips on stdout while the scan is running.
    if seconds > 200:
        print(full_path, seconds)
    return seconds


# Histogram buckets as (inclusive upper bound in seconds, label), in ascending
# order. Labels and bounds live in one table so they cannot drift apart.
DURATION_BUCKETS = (
    (1, "0-1"),
    (5, "1-5"),
    (10, "5-10"),
    (15, "10-15"),
    (20, "15-20"),
    (25, "20-25"),
    (30, "25-30"),
    (60, "30-60"),
    (200, "60-200"),
)
# Label for every duration that does not fit any bucket above.
OVERFLOW_LABEL = ">200"

# Process each dataset entry: load its chat JSON, measure every referenced
# audio file in a thread pool, then print a duration histogram.
for dataset in datasets:
    json_file_path = dataset["chat_path"]
    print(json_file_path)
    # Read the JSON file.
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    dur_list = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for item in data:
            audio_files = item.get("audio")
            # A single file name given as a string is treated as a one-element list.
            if isinstance(audio_files, str):
                audio_files = [audio_files]

            # Items whose "audio" value is neither str nor list are skipped.
            if isinstance(audio_files, list):
                futures.extend(
                    executor.submit(check_audio, audio_file_name, AudioFolder)
                    for audio_file_name in audio_files
                )

        # Collect results as they finish, with a tqdm progress bar.
        # Note: future.result() re-raises any exception raised inside
        # check_audio, so a single unreadable file aborts the run.
        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Processing", unit="file"
        ):
            dur_list.append(future.result())

    # Initialise the per-bucket counters in display order.
    distribution = {label: 0 for _, label in DURATION_BUCKETS}
    distribution[OVERFLOW_LABEL] = 0

    # Count each duration in the first bucket whose upper bound it does not exceed.
    for length in dur_list:
        bucket = next(
            (label for bound, label in DURATION_BUCKETS if length <= bound),
            OVERFLOW_LABEL,
        )
        distribution[bucket] += 1

    # Print the distribution.
    print(f"duration distribution of {json_file_path}:")
    for key, value in distribution.items():
        print(f"{key}: {value}")