# fix_data_Path.py

import json
from pathlib import Path

# 1. Path configuration (normally only `json_file` needs to be edited).
json_file = Path("/public/opendas/DL_DATA/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json")
# Image root directory: derived automatically as the directory holding the JSON.
image_root_dir = json_file.parent
# Where the repaired JSON is written. It is a separate file, so the original
# JSON is never overwritten. NOTE: this is a hard-coded location and is not
# derived from `json_file`; adjust it if the output should live elsewhere.
fixed_json_file = "/blip-3/dataset/blip_laion_cc_sbu_558k_fixed.json"


def fix_image_paths(samples, root_dir, *, image_key="image", check_exists=True):
    """Rewrite each sample's image path in place as ``root_dir / <relative path>``.

    Args:
        samples: Iterable of sample records; dict records are updated in place.
        root_dir: Directory that the relative image paths are resolved against.
        image_key: Name of the field holding the image path.
        check_exists: When true, test every joined path on disk and print a
            warning for each one that does not exist.

    Returns:
        A ``(fixed, missing_field, not_found)`` tuple of counts:
        ``fixed`` - samples rewritten (and found on disk when checking),
        ``missing_field`` - samples skipped because they are not dicts or have
        no usable string in ``image_key``,
        ``not_found`` - samples rewritten whose image file does not exist.
    """
    root_dir = Path(root_dir)
    fixed = 0
    missing_field = 0
    not_found = 0

    for idx, sample in enumerate(samples):
        # Guard against malformed records so one bad entry cannot abort the
        # whole run with AttributeError / TypeError.
        relative_img_path = sample.get(image_key) if isinstance(sample, dict) else None
        if not isinstance(relative_img_path, str) or not relative_img_path:
            missing_field += 1
            print(f"警告：第{idx}个样本缺失图片路径，将跳过")
            continue

        # pathlib keeps an already-absolute right-hand side unchanged, so
        # running the script again on its own output is harmless.
        absolute_img_path = root_dir / relative_img_path
        # Store a plain string: Path objects are not JSON serializable.
        sample[image_key] = str(absolute_img_path)

        if check_exists and not absolute_img_path.exists():
            not_found += 1
            print(f"警告：第{idx}个样本的图片不存在 → {absolute_img_path}")
        else:
            fixed += 1

    return fixed, missing_field, not_found


def main():
    """Load the source JSON, fix every image path, save the result and report.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    # 2. Load the original JSON data.
    print(f"正在读取原始JSON：{json_file}")
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("JSON文件内容必须是列表格式（每个元素为一个样本）")
    print(f"成功读取 {len(data)} 个样本")

    # 3. Fix the image path of every sample (core logic).
    fixed_count, missing_count, not_found_count = fix_image_paths(data, image_root_dir)

    # 4. Save the repaired JSON. Create the target directory first so a long
    #    run is not lost to a missing folder at the very last step.
    output_path = Path(fixed_json_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    # Summary report.
    print("\n" + "=" * 50)
    print("路径修复完成！")
    print(f"原始样本数：{len(data)}")
    print(f"成功修复（路径有效或已拼接）：{fixed_count}")
    print(f"缺失图片路径的样本：{missing_count}")
    print(f"图片不存在的样本：{not_found_count}")
    print(f"修复后的JSON：{fixed_json_file}")
    # Show the first sample's path as a quick sanity check.
    if data and isinstance(data[0], dict) and "image" in data[0]:
        print(f"示例路径（第一个样本）：{data[0]['image']}")


if __name__ == "__main__":
    main()