"""Convert paired image/caption files into a single JSON index.

This script is intended for CC3M-style data. The output is a JSON list of
records shaped like:

    [
        {"text": "a dog", "image_file": "dog.jpg"}
    ]
"""
import json
from pathlib import Path


def convert_to_json(data_root: str, save_path: str) -> None:
    """Pair caption and image files by stem and dump them as a JSON list.

    Only the top level of ``data_root`` is scanned (no recursion). A caption
    file ``<stem>.txt`` is paired with an image ``<stem>.png``, ``<stem>.jpg``
    or ``<stem>.jpeg``; stems that lack either a caption or an image are
    skipped. Records are written sorted by stem so the output is reproducible.

    Args:
        data_root: Directory holding the image and ``.txt`` caption files.
        save_path: Path of the JSON file to write.
    """
    root = Path(data_root)

    captions = {path.stem: path for path in root.glob("*.txt")}

    # Later patterns overwrite earlier ones, so when several images share a
    # stem the precedence is .jpeg over .jpg over .png.
    images = {
        path.stem: path
        for pattern in ("*.png", "*.jpg", "*.jpeg")
        for path in root.glob(pattern)
    }

    # Iterating a set of strings is order-unstable across runs (hash
    # randomization); sort the shared stems for a deterministic file.
    shared_stems = sorted(captions.keys() & images.keys())

    # Explicit UTF-8 so non-ASCII captions are read the same on every platform.
    results = [
        {
            "text": captions[stem].read_text(encoding="utf-8").strip(),
            "image_file": str(images[stem]),
        }
        for stem in shared_stems
    ]

    # ensure_ascii=False emits raw non-ASCII characters, so the output file
    # must be opened as UTF-8 rather than the locale's default encoding.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them Path(None) would raise TypeError.
    parser.add_argument("--data_root", type=str, required=True, help="图像-文本存储位置")
    parser.add_argument("--save_path", type=str, required=True, help="json文件存储位置")
    args = parser.parse_args()
    convert_to_json(args.data_root, args.save_path)