# check_video_lost.py

import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

from decord import VideoReader, cpu
from vita.config import FolderDict
from vita.config.dataset_config import *

# Path of the text report that receives the names of missing/unreadable videos.
output_file_path = "lost_file_name.txt"

# Dataset config dicts to check (each is read for its "chat_path" entry).
# The names are presumably provided by the star import from
# vita.config.dataset_config -- confirm there. Swap in one of the
# commented-out lists below to check a different group of datasets.
# datasets = [Webvid, K400]
# datasets = [VIDEOChatGPT, K700Split, VC2Internvid]
# datasets = [EgoGesture, Literature, CopyWrite, MovingFashion]
# datasets = [NoHarm]
datasets = [SGInternvid0]

# Accumulates the names of videos that are missing or cannot be read.
lost_files = []

# Frame indices decoded to prove a video is readable, not merely present.
_PROBE_FRAME_INDICES = (0, 10)


def check_video_file(item):
    """Check that the video referenced by one annotation item is usable.

    Args:
        item: One entry of a dataset JSON file. Its optional "video" value is
            a path relative to the folder that ``FolderDict`` maps the
            item's "set" value to.

    Returns:
        The item's "video" value when the file is missing or cannot be
        decoded, otherwise ``None`` (also ``None`` when the item references
        no video).

    Raises:
        KeyError: If the item has a video but no "set" key, or its "set"
            value is not present in ``FolderDict``.
    """
    video_file_name = item.get("video")
    if not video_file_name:
        # Nothing to verify for items without a video reference.
        return None

    video_directory = FolderDict[item["set"]]
    video_file_path = os.path.join(video_directory, video_file_name)
    if not os.path.exists(video_file_path):
        print(f"file lost: {video_file_path}")
        return video_file_name

    try:
        vreader = VideoReader(video_file_path, ctx=cpu(0))
        frame_count = len(vreader)
        if frame_count == 0:
            raise ValueError("video contains no frames")
        # Clamp the probe indices to the clip length: asking for frame 10 of
        # a shorter (but perfectly valid) video would otherwise raise and the
        # file would be misreported as broken.
        sample_pos = sorted(
            {min(pos, frame_count - 1) for pos in _PROBE_FRAME_INDICES}
        )
        for frame in vreader.get_batch(sample_pos).asnumpy():
            # Converting to a PIL image confirms the decoded frame is usable.
            Image.fromarray(frame)
    except Exception:
        # Deliberately broad: any failure while opening or decoding means the
        # file cannot be used, whatever the underlying decoder error is.
        print(f"file broken: {video_file_path}")
        return video_file_name
    return None


# Walk through every selected dataset config.
for dataset in datasets:
    json_file_path = dataset["chat_path"]

    # Load the dataset's annotation items.
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Check the videos in parallel with a thread pool.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(check_video_file, item) for item in data]
        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Processing", unit="file"
        ):
            result = future.result()
            if result:
                lost_files.append(result)

# Persist every collected video name to the report file, one name per line.
with open(output_file_path, "w", encoding="utf-8") as report:
    report.writelines(name + "\n" for name in lost_files)

# Summarize how many videos were flagged and where the report was saved.
print(f"检查完成，共有 {len(lost_files)} 个文件丢失或无法读取，结果已保存到 {output_file_path}")