convert_data.py 1 KB
Newer Older
chenzk's avatar
v1.0  
chenzk committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# 转换为 ChatML 格式
import os
import shutil
import json

input_dir = "data/AdvertiseGen"
output_dir = "data/AdvertiseGenChatML"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)

for fn in ["train.json", "dev.json"]:
    data_out_list = []
    with open(os.path.join(input_dir, fn), "r") as f, open(os.path.join(output_dir, fn), "w") as fo:
        for line in f:
            if len(line.strip()) > 0:
                data = json.loads(line)
                data_out = {
                    "messages": [
                        {
                            "role": "user",
                            "content": data["content"],
                        },
                        {
                            "role": "assistant",
                            "content": data["summary"],
                        },
                    ]
                }
                data_out_list.append(data_out)
        json.dump(data_out_list, fo, ensure_ascii=False, indent=4)