"""Convert COCO2017 training captions into a jsonl conversation dataset.

Each caption annotation becomes one json line of the form::

    {"image": <path>, "conversations": [{"from": "human", ...},
                                        {"from": "gpt", ...}]}
"""

import argparse
import json
import os
from typing import Optional, Sequence

try:
    from tqdm import tqdm
except ImportError:  # The progress bar is cosmetic; do not fail without it.
    def tqdm(iterable, *_args, **_kwargs):
        """Fallback for a missing tqdm: return *iterable* unchanged."""
        return iterable


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the normal script behaviour.

    Returns:
        Namespace with ``coco_root``, ``output`` and ``prompt`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Convert COCO2017 captions to LLaMA-Factory jsonl format"
    )
    parser.add_argument(
        "--coco-root",
        type=str,
        required=True,
        # This option is required and has no default, so the help text only
        # gives an example path instead of claiming one.
        help="COCO2017 root directory (e.g. /workspace/DL_DATA/COCO2017/)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="./data/coco_train2017_captions.jsonl",
        help="Output jsonl file path",
    )
    # NOTE(review): multimodal chat formats usually expect an "<image>"
    # placeholder in the human turn; this default may have lost one.
    # Confirm against the consumer of the jsonl before changing it.
    parser.add_argument(
        "--prompt",
        type=str,
        default="\nDescribe the image.",
        help="Human prompt text",
    )
    return parser.parse_args(argv)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Read the COCO caption annotations and write one jsonl sample each.

    Args:
        argv: Optional argument list forwarded to :func:`parse_args`;
            ``None`` uses the process command line.

    Raises:
        FileNotFoundError: If the caption json or the image directory is
            missing under ``--coco-root``.
        KeyError: If an annotation references an ``image_id`` that is not
            listed in the caption file's ``images`` section.
    """
    args = parse_args(argv)

    coco_root = args.coco_root
    output_jsonl = args.output
    human_prompt = args.prompt

    caption_json = os.path.join(coco_root, "annotations/captions_train2017.json")
    image_dir = os.path.join(coco_root, "train2017")

    # Explicit raises instead of ``assert``: assertions are removed when
    # Python runs with -O, which would silently skip these checks.
    if not os.path.exists(caption_json):
        raise FileNotFoundError(f"Caption file not found: {caption_json}")
    if not os.path.exists(image_dir):
        raise FileNotFoundError(f"Image dir not found: {image_dir}")

    with open(caption_json, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # image_id -> file_name
    id2file = {img["id"]: img["file_name"] for img in coco["images"]}
    annotations = coco["annotations"]

    # A bare filename has an empty dirname, and os.makedirs("") raises, so
    # only create the parent directory when the path actually has one.
    output_dir = os.path.dirname(output_jsonl)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_jsonl, "w", encoding="utf-8") as out_f:
        for ann in tqdm(annotations, desc="Converting COCO captions"):
            caption = ann["caption"].strip()
            image_path = os.path.join(image_dir, id2file[ann["image_id"]])

            sample = {
                "image": image_path,
                "conversations": [
                    {"from": "human", "value": human_prompt},
                    {"from": "gpt", "value": caption},
                ],
            }
            # ensure_ascii=False keeps non-ASCII captions readable in the file.
            out_f.write(json.dumps(sample, ensure_ascii=False) + "\n")

    print(f"Done. Saved to {output_jsonl}")


if __name__ == "__main__":
    main()