import json
import re

# Characters allowed between two calls of a list: commas and any whitespace.
_SEPARATORS = ", \t\r\n\f\v"
_QUOTES = "\"'"
# A quote character only opens a quoted value when the previous significant
# character is one of these (or there is none). This keeps apostrophes inside
# words ("what's") from being mistaken for an opening quote.
_QUOTE_OPENERS = "=,([{:"
# A comma starts a new argument only when it is followed by ``key=``.
_NEXT_ARG = re.compile(r',\s*(?=\w+=)')
# Source role -> target role for messages that are copied through unchanged.
_ROLE_MAP = {"user": "human", "tool": "observation"}


def _quote_mask(text):
    """Return a list of booleans marking characters inside quoted values.

    ``mask[i]`` is True when ``text[i]`` is a quote delimiter or lies between
    an opening quote and its matching closing quote. If a quote is left open
    at the end of ``text`` the quoting is considered unreliable and an
    all-False mask is returned, which makes callers fall back to quote-blind
    parsing.
    """
    mask = [False] * len(text)
    quote = None  # the quote character currently open, if any
    prev = ""     # last non-whitespace character seen outside quotes
    for idx, ch in enumerate(text):
        if quote is not None:
            mask[idx] = True
            if ch == quote:
                quote = None
                prev = ch
        elif ch in _QUOTES and (not prev or prev in _QUOTE_OPENERS):
            quote = ch
            mask[idx] = True
        elif not ch.isspace():
            prev = ch
    if quote is not None:
        return [False] * len(text)
    return mask


def _split_calls(inner):
    """Split ``Name(args), Other(args)`` into ``(name, args_str)`` pairs.

    Returns ``(calls, well_formed)``. ``calls`` holds every call read before
    the first structural problem; ``well_formed`` is True only when the whole
    string consists of calls separated by commas/whitespace, with no nested
    or unbalanced parentheses, no empty names and no trailing text.
    Parentheses inside quoted values are ignored.
    """
    mask = _quote_mask(inner)
    calls = []
    name = None        # name of the call whose argument list is being read
    segment_start = 0  # start of the current name or argument segment
    for idx, ch in enumerate(inner):
        if mask[idx] or ch not in "()":
            continue
        if ch == "(":
            if name is not None:
                return calls, False  # nested parenthesis inside arguments
            candidate = inner[segment_start:idx].strip(_SEPARATORS)
            if not candidate:
                return calls, False  # "(" without a function name
            name = candidate
        else:
            if name is None:
                return calls, False  # ")" without a matching "("
            calls.append((name, inner[segment_start:idx]))
            name = None
        segment_start = idx + 1
    leftover = inner[segment_start:].strip(_SEPARATORS)
    return calls, name is None and not leftover


def _parse_arguments(args_str):
    """Parse ``key="value", key2="value2"`` into a dict of strings.

    Arguments are split on commas that are followed by ``key=`` and are not
    inside a quoted value. One pair of matching surrounding quotes is removed
    from each value; values are otherwise kept verbatim as strings. Pieces
    without ``=`` are ignored.
    """
    arguments = {}
    if not args_str.strip():
        return arguments

    mask = _quote_mask(args_str)
    pieces = []
    start = 0
    for match in _NEXT_ARG.finditer(args_str):
        if mask[match.start()]:
            continue  # comma belongs to a quoted value, not a separator
        pieces.append(args_str[start:match.start()])
        start = match.end()
    pieces.append(args_str[start:])

    for piece in pieces:
        key, sep, value = piece.partition("=")
        if not sep:
            continue
        value = value.strip()
        if (value.startswith('"') and value.endswith('"')) or (
            value.startswith("'") and value.endswith("'")
        ):
            value = value[1:-1]
        arguments[key.strip()] = value
    return arguments


def is_function_call(text):
    """Return True when ``text`` looks like a function call list.

    The expected format is ``[Func1(arg="val"), Func2(...)]``: square
    brackets around one or more ``Name(arguments)`` items separated by commas
    and/or whitespace, with nothing else in between. Non-string input yields
    False.
    """
    if not isinstance(text, str):
        return False
    text = text.strip()
    if not (text.startswith('[') and text.endswith(']')):
        return False
    calls, well_formed = _split_calls(text[1:-1])
    return well_formed and bool(calls)


def parse_function_call_list(text):
    """Parse a function call list string into a list of call dicts.

    Example::

        [SEC Filings(identifier="AAPL"), United States Away from Home Mobility API(string="2025-05-17")]

    becomes::

        [
            {"name": "SEC Filings", "arguments": {"identifier": "AAPL"}},
            {"name": "United States Away from Home Mobility API",
             "arguments": {"string": "2025-05-17"}}
        ]

    Parsing is best-effort: for malformed input, the calls read before the
    first structural problem are returned.
    """
    inner = text.strip()
    if inner.startswith("[") and inner.endswith("]"):
        inner = inner[1:-1]
    calls, _ = _split_calls(inner)
    return [
        {"name": name, "arguments": _parse_arguments(args_str)}
        for name, args_str in calls
    ]


def convert_conversation(conversations):
    """Convert a list of ``{"from", "value"}`` messages to the target roles.

    ``user`` becomes ``human`` and ``tool`` becomes ``observation``. An
    ``assistant`` message becomes ``function_call`` (with the parsed calls
    serialized as a JSON string) when its value is a function call list, and
    ``gpt`` otherwise. Messages with any other role are dropped.
    """
    converted = []
    for message in conversations:
        role = message.get("from", "")
        value = message.get("value", "")
        if role in _ROLE_MAP:
            converted.append({"from": _ROLE_MAP[role], "value": value})
        elif role == "assistant":
            if is_function_call(value):
                parsed_funcs = parse_function_call_list(value)
                converted.append({
                    "from": "function_call",
                    "value": json.dumps(parsed_funcs, ensure_ascii=False),
                })
            else:
                converted.append({"from": "gpt", "value": value})
    return converted


def transform_data(data):
    """Convert every item of ``data``.

    Each item's ``system`` text is kept as is (empty string when missing) and
    its ``conversations`` list is run through ``convert_conversation``.
    """
    return [
        {
            "system": item.get("system", ""),
            "conversations": convert_conversation(item.get("conversations", [])),
        }
        for item in data
    ]


def main(input_file, output_file):
    """Read JSON from ``input_file``, transform it, write it to ``output_file``."""
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    transformed = transform_data(data)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(transformed, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    # Both paths are mandatory; without them main() would fail on open(None).
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    args = parser.parse_args()
    main(args.input_file, args.output_file)