# gen_messages_data.py (committed by Rayyyyy)
import os
import json
import argparse

# Command-line configuration: --data_path is the dataset directory.
parse = argparse.ArgumentParser()
parse.add_argument('--data_path', default='./data/AdvertiseGen')
args = parse.parse_args()

# Default output directory: a 'saves' folder inside the dataset directory.
# makedirs(exist_ok=True) creates any missing parent directories and tolerates
# the directory already existing, so there is no check-then-create race.
save_root_path = os.path.join(args.data_path, 'saves')
os.makedirs(save_root_path, exist_ok=True)


def save_to_jsonl(train_infos, save_path):
    '''Write JSON-serializable records to a .jsonl file, one record per line.

    Args:
        train_infos: Iterable of JSON-serializable objects.
        save_path: Destination file path; an existing file is overwritten.

    Non-ASCII characters are written as-is (UTF-8) rather than being escaped.
    '''
    # The with-statement closes the file on exit, so no explicit close() call
    # is needed afterwards.
    with open(save_path, 'w', encoding='utf-8') as file:
        file.writelines(
            json.dumps(info, ensure_ascii=False) + '\n' for info in train_infos
        )

def load_json_infos(file_path):
    '''Convert a JSON-lines dataset file into chat-style "messages" records.

    Each non-blank line of ``file_path`` is parsed as a JSON object; its
    "content" value becomes the user turn and its "summary" value the
    assistant turn (``None`` when a key is absent). The converted records are
    written to ``<save_root_path>/<basename of file_path>l`` (for example
    ``train.json`` -> ``train.jsonl``) via ``save_to_jsonl``.

    Args:
        file_path: Path of the UTF-8 input file, one JSON object per line.

    Returns:
        The list of converted records, each shaped like
        ``{"messages": [user_message, assistant_message]}``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    '''
    all_data = []
    with open(file_path, 'r', encoding='utf-8') as ofile:
        # Iterate the file lazily instead of loading every line up front.
        for line in ofile:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would otherwise make json.loads raise.
            if not line.strip():
                continue
            json_info = json.loads(line)
            all_data.append({
                "messages": [
                    {"role": "user", "content": json_info.get("content")},
                    {"role": "assistant", "content": json_info.get("summary")},
                ]
            })

    # Appending 'l' turns the input's '.json' name into '.jsonl'.
    save_file_path = os.path.join(save_root_path, os.path.basename(file_path)+'l')
    save_to_jsonl(all_data, save_file_path)
    return all_data


if __name__ == "__main__":
    # Convert both dataset splits found under --data_path; load_json_infos
    # writes each converted file itself, so its result is not kept here.
    for file_name in ('train.json', 'dev.json'):
        load_json_infos(os.path.join(args.data_path, file_name))