data_process.py 3.79 KB
Newer Older
mashun1's avatar
toolace  
mashun1 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
import re

def is_function_call(text):
    """
    Tell whether ``text`` looks like a bracketed list of function calls,
    e.g. ``[Func1(arg="val"), Func2(...)]``.

    The surrounding whitespace is ignored. The text must start with ``[``
    and end with ``]``, contain at least one ``name(args)`` group whose
    arguments hold no parentheses, and leave nothing but commas and spaces
    once every such group has been removed.

    Returns True when all of those conditions hold, otherwise False.
    """
    candidate = text.strip()
    if not candidate.startswith('[') or not candidate.endswith(']'):
        return False

    body = candidate[1:-1].strip()
    call_re = re.compile(r'([^\(\)]+)\(([^()]*)\)')

    # At least one call-shaped group is required.
    if call_re.search(body) is None:
        return False

    # Whatever survives after deleting the calls may only be separators.
    leftover = call_re.sub('', body)
    for filler in (',', ' '):
        leftover = leftover.replace(filler, '')
    return not leftover

def parse_function_call_list(text):
    """
    Parse a bracketed function-call string into a list of call dicts.

    Example:
    [SEC Filings(identifier="AAPL"), United States Away from Home Mobility API(string="2025-05-17")]
    is converted to:
    [
      {"name": "SEC Filings", "arguments": {"identifier": "AAPL"}},
      {"name": "United States Away from Home Mobility API", "arguments": {"string": "2025-05-17"}}
    ]

    Args:
        text: The call list as a string. After stripping whitespace, the
            first and last characters are dropped without being checked,
            so callers should validate the shape first.

    Returns:
        A list of ``{"name": str, "arguments": dict}`` entries, one per
        ``name(args)`` group found. Argument values are always strings; a
        single pair of matching surrounding quotes is removed from each.
    """
    inner = text.strip()[1:-1].strip()
    pattern = re.compile(r'([^\(\)]+)\((.*?)\)')
    matches = pattern.findall(inner)

    functions = []
    for func_name, args_str in matches:
        # The name group also captures the ", " separator that precedes every
        # call after the first one, so the leading comma has to be removed
        # explicitly (plain strip() only removes whitespace).
        func_name = func_name.strip().lstrip(',').strip()

        args = {}
        if args_str.strip():
            # Supports several arguments of the form key="value", key2="value2":
            # split only on commas that are followed by another `key=`.
            parts = re.split(r',\s*(?=\w+=)', args_str)
            for part in parts:
                key_val = part.split('=', 1)
                if len(key_val) == 2:
                    key = key_val[0].strip()
                    val = key_val[1].strip()
                    # Drop one pair of matching surrounding quotes, if present.
                    if (val.startswith('"') and val.endswith('"')) or (val.startswith("'") and val.endswith("'")):
                        val = val[1:-1]
                    args[key] = val

        functions.append({
            "name": func_name,
            "arguments": args
        })

    return functions

def convert_conversation(conversations):
    """
    Re-label the messages of one conversation.

    Each message is a dict read through its "from" and "value" keys (both
    default to an empty string). Roles are mapped as follows:

    - "user"      -> "human"
    - "tool"      -> "observation"
    - "assistant" -> "function_call" when the value passes
      is_function_call (the value is then replaced by the JSON dump of
      parse_function_call_list), otherwise "gpt"

    Messages with any other role are left out of the result.

    Returns the list of converted {"from": ..., "value": ...} dicts.
    """
    output = []

    for turn in conversations:
        speaker = turn.get("from", "")
        content = turn.get("value", "")

        if speaker == "user":
            target = "human"
        elif speaker == "tool":
            target = "observation"
        elif speaker == "assistant":
            # Decide whether the assistant turn is a function call.
            if is_function_call(content):
                target = "function_call"
                content = json.dumps(
                    parse_function_call_list(content), ensure_ascii=False
                )
            else:
                target = "gpt"
        else:
            # Unrecognised roles are not carried over.
            continue

        output.append({"from": target, "value": content})

    return output

def transform_data(data):
    """
    Convert every record of ``data``.

    For each record the "system" text is kept as is (empty string when
    missing) and the "conversations" list (empty list when missing) is run
    through convert_conversation. Any other keys of the record are not
    copied to the output.

    Returns a new list of {"system": ..., "conversations": ...} dicts.
    """
    return [
        {
            "system": record.get("system", ""),
            "conversations": convert_conversation(record.get("conversations", [])),
        }
        for record in data
    ]

def main(input_file, output_file):
    """
    Read the JSON document at ``input_file``, convert it with
    transform_data, and write the result to ``output_file`` as UTF-8 JSON
    indented by two spaces, with non-ASCII characters left unescaped.
    """
    with open(input_file, "r", encoding="utf-8") as src:
        records = json.loads(src.read())

    # Convert before opening the destination so a failure here never
    # truncates an existing output file.
    converted = transform_data(records)

    with open(output_file, "w", encoding="utf-8") as dst:
        dst.write(json.dumps(converted, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    # Command-line entry point: convert one JSON file into another.
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Convert conversation records in a JSON file to the "
                    "human / gpt / function_call / observation role format."
    )

    # Both paths are mandatory: without them main() would call open(None)
    # and fail with an unhelpful TypeError instead of a usage message.
    parser.add_argument("--input_file", type=str, required=True,
                        help="path of the JSON file to read")

    parser.add_argument("--output_file", type=str, required=True,
                        help="path of the JSON file to write")

    args = parser.parse_args()

    main(args.input_file, args.output_file)