# convert_coco2017_captions.py
import json
import os
import argparse
from tqdm import tqdm

def parse_args():
    """Parse the command-line options of the caption converter.

    Reads the options from ``sys.argv``.

    Returns:
        argparse.Namespace: with the attributes ``coco_root`` (mandatory),
        ``output`` (jsonl path to write) and ``prompt`` (text used for the
        human turn of every sample).
    """
    parser = argparse.ArgumentParser(
        description="Convert COCO2017 captions to LLaMA-Factory jsonl format"
    )
    parser.add_argument(
        "--coco-root",
        type=str,
        required=True,
        # The option is mandatory and has no default, so the help text must
        # not advertise one; the path is given only as an example.
        help="COCO2017 root directory, e.g. /workspace/DL_DATA/COCO2017/ (required)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="./data/coco_train2017_captions.jsonl",
        help="Output jsonl file path"
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="<image>\nDescribe the image.",
        help="Human prompt text"
    )
    return parser.parse_args()
    
def main():
    """Convert the COCO2017 training captions into a jsonl conversation file.

    Loads ``annotations/captions_train2017.json`` from the ``--coco-root``
    directory and writes one JSON object per caption annotation to the
    ``--output`` file. Each object holds the image path
    (``<coco-root>/train2017/<file_name>``) and a two-turn conversation: the
    human prompt followed by the stripped caption.

    Raises:
        FileNotFoundError: if the caption file or the image directory does
            not exist.
        KeyError: if an annotation references an image id that is absent
            from the ``images`` list of the caption file.
    """
    args = parse_args()
    coco_root = args.coco_root
    output_jsonl = args.output
    human_prompt = args.prompt

    caption_json = os.path.join(coco_root, "annotations/captions_train2017.json")
    image_dir = os.path.join(coco_root, "train2017")

    # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
    if not os.path.exists(caption_json):
        raise FileNotFoundError(f"Caption file not found: {caption_json}")
    if not os.path.exists(image_dir):
        raise FileNotFoundError(f"Image dir not found: {image_dir}")

    with open(caption_json, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # image_id -> file_name
    id2file = {img["id"]: img["file_name"] for img in coco["images"]}
    annotations = coco["annotations"]

    # dirname is "" when --output is a bare file name, and os.makedirs("")
    # raises FileNotFoundError; only create a directory when there is one.
    output_dir = os.path.dirname(output_jsonl)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_jsonl, "w", encoding="utf-8") as out_f:
        for ann in tqdm(annotations, desc="Converting COCO captions"):
            image_id = ann["image_id"]
            caption = ann["caption"].strip()
            file_name = id2file[image_id]

            image_path = os.path.join(image_dir, file_name)

            sample = {
                "image": image_path,
                "conversations": [
                    {
                        "from": "human",
                        "value": human_prompt
                    },
                    {
                        "from": "gpt",
                        "value": caption
                    }
                ]
            }

            # ensure_ascii=False keeps non-ASCII caption text readable in the
            # UTF-8 output instead of \u-escaping it.
            out_f.write(json.dumps(sample, ensure_ascii=False) + "\n")
    print(f"Done. Saved to {output_jsonl}")

# Run the conversion only when the file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()