check_image_space_ratio.py

import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

from vita.config import FolderDict
from vita.config.dataset_config import *

# 定义文件路径
output_file_path = "long_image_file_name.txt"
ratio_thre = 12

# 将所有字典放入一个列表中
# datasets = [AnyWord_20to50, RCTW2019, RCTW2019QA, RCTW2017, OpenChart, SCID, K12, TabRECSet, DigChat, iFlyTab]
datasets = [AnyWord_20to50, DyChart_iresearch]
datasets = [RCTW2019, RCTW2019QA, RCTW2017]
datasets = [OpenChart, SCID]
datasets = [K12]
# datasets = [TabRECSet, DigChat, iFlyTab]

# 初始化一个列表来存储丢失的文件名
lost_files = []
lock = threading.Lock()


def check_image(image_file_name, image_directory):
    image_file_path = os.path.join(image_directory, image_file_name)
    if not os.path.exists(image_file_path):
        print(f"{image_file_path} not exist!!!!!!!!!!")
        return image_file_name
    else:
        try:
            with Image.open(image_file_path) as img:
                img.convert("RGB")
                size_ratio = img.size[0] / img.size[1]
                if size_ratio < 1 / ratio_thre or size_ratio > ratio_thre:
                    print(f"{image_file_path} ratio is too big!!!!!!!!!!!!!!")
                    return image_file_name
        except Exception as e:
            print(f"{image_file_path} is broken!!!!!!!!!!!!")
            return image_file_name
    return None


# 遍历每个字典
for dataset in datasets:
    keys = list(dataset.keys())
    json_file_path = dataset["chat_path"]

    # 读取JSON文件
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # 遍历每条数据，使用tqdm显示进度条
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for item in data:
            image_files = item.get("image")
            set_id = item["set"]
            image_directory = FolderDict[set_id]

            # 如果 image_files 是字符串，将其转换为列表
            if isinstance(image_files, str):
                image_files = [image_files]

            # 如果 image_files 是列表，处理每个文件
            if isinstance(image_files, list):
                for image_file_name in image_files:
                    futures.append(executor.submit(check_image, image_file_name, image_directory))

        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Processing", unit="file"
        ):
            result = future.result()
            if result:
                with lock:
                    lost_files.append(result)

# 将丢失的文件名写入到lost_file_name.txt中
with open(output_file_path, "w", encoding="utf-8") as f:
    for file_name in lost_files:
        f.write(file_name + "\n")

print(f"检查完成，共有 {len(lost_files)} 个文件丢失或无法读取，结果已保存到 {output_file_path}")