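# Convert image-caption data to JSON format; this script is intended for CC3M.
# The output file is a list of records shaped like:
# [
#    {"text": "a dog", "image_file": "dog.jpg"}
# ]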
import json

from pathlib import Path


def convert_to_json(data_root: str,
                    save_path: str) -> None:
    """Pair caption files with images by file stem and dump them as JSON.

    Only the top level of ``data_root`` is scanned (no recursion). A
    caption file ``<stem>.txt`` is paired with an image ``<stem>.png``,
    ``<stem>.jpg`` or ``<stem>.jpeg``; stems that lack either a caption or
    an image are skipped. The output is a JSON list of
    ``{"text": ..., "image_file": ...}`` records, where ``text`` is the
    caption file content with surrounding whitespace stripped and
    ``image_file`` is the image path as produced by globbing ``data_root``.

    Args:
        data_root: Directory holding the caption and image files.
        save_path: Path of the JSON file to write.
    """
    root = Path(data_root)

    text_path_mapping = {path.stem: path for path in root.glob("*.txt")}

    # If several images share one stem, the later pattern wins
    # (png < jpg < jpeg), because later entries overwrite earlier ones.
    image_path_mapping = {
        path.stem: path
        for pattern in ("*.png", "*.jpg", "*.jpeg")
        for path in root.glob(pattern)
    }

    # Sort the shared stems so the output order is reproducible across
    # runs; plain set iteration order is arbitrary.
    shared_stems = sorted(text_path_mapping.keys() & image_path_mapping.keys())

    results = []
    for stem in shared_stems:
        # Read captions as UTF-8 explicitly instead of relying on the
        # platform default encoding.
        with open(text_path_mapping[stem], encoding="utf-8") as f:
            text = f.read().strip()

        results.append({"text": text,
                        "image_file": str(image_path_mapping[stem])})

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened with an encoding that can represent them.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False)


if __name__ == "__main__":
    # Command-line entry point: parse the input directory and the output
    # path, then run the conversion.
    import argparse

    parser = argparse.ArgumentParser()

    # Both options are mandatory. Without required=True a missing option
    # arrives as None and the conversion fails later with an unrelated
    # TypeError instead of a usage message.
    parser.add_argument("--data_root", type=str, required=True,
                        help="图像-文本存储位置")

    parser.add_argument("--save_path", type=str, required=True,
                        help="json文件存储位置")

    args = parser.parse_args()

    convert_to_json(args.data_root, args.save_path)
    