statistics_image_num.py

import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *

# 将所有字典放入一个列表中
datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap

# 遍历每个字典
for dataset in datasets:
    dur_list = []
    keys = list(dataset.keys())
    input_file_name = dataset["chat_path"]

    # 读取JSON文件
    len_list = []
    with open(input_file_name, "r", encoding="utf-8") as file:
        data = json.load(file)

    print(f"check {input_file_name}")
    # 遍历每条数据
    for item in tqdm(data):
        if "image" in item:
            image_path = item["image"]
            assert isinstance(image_path[0], str)
            if type(image_path) is not list:
                assert isinstance(image_path, str)
                image_path = [image_path]
            count_image_path = len(image_path)
            if count_image_path > 40:
                print(count_image_path)
                print(item)
            len_list.append(count_image_path)

    distribution = {
        "0-5": 0,
        "5-10": 0,
        "10-16": 0,
        "16-20": 0,
        "20-25": 0,
        "25-30": 0,
        "30-35": 0,
        "35-40": 0,
        ">40": 0,
    }

    for length in len_list:
        if length <= 5:
            distribution["0-5"] += 1
        elif length <= 10:
            distribution["5-10"] += 1
        elif length <= 16:
            distribution["10-16"] += 1
        elif length <= 20:
            distribution["16-20"] += 1
        elif length <= 25:
            distribution["20-25"] += 1
        elif length <= 30:
            distribution["25-30"] += 1
        elif length <= 35:
            distribution["30-35"] += 1
        elif length <= 40:
            distribution["35-40"] += 1
        else:
            distribution[">40"] += 1

    print(f"Length distribution of {input_file_name}:")
    for key, value in distribution.items():
        print(f"{key}: {value}")