"""Score predicted function calls against ground-truth function calls."""

import os
import json
from argparse import ArgumentParser


def normalize_value(value) -> object:
    """Return a canonical form of *value* for equality comparison.

    Strings are stripped of surrounding whitespace and then interpreted, in
    order, as a number (``float`` when the text contains ``"."``, ``int``
    otherwise), as a boolean (``"true"`` / ``"false"``, case-insensitive), or
    as lower-cased text.  Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value

    # Strip first so that padded booleans such as " True " are recognised;
    # int()/float() already tolerate surrounding whitespace.
    text = value.strip()
    try:
        # The "." test keeps words like "inf" or "nan" from becoming floats.
        return float(text) if "." in text else int(text)
    except ValueError:
        pass

    lowered = text.lower()
    if lowered in ("true", "false"):
        return lowered == "true"
    return lowered


def validate_prediction(pred_json):
    """Return True when a prediction is present, i.e. is not ``None``."""
    return pred_json is not None


def _is_call(candidate) -> bool:
    """Return True if *candidate* is a dict that carries a ``"name"`` entry."""
    return isinstance(candidate, dict) and "name" in candidate


def _select_prediction(pred_data, expected_name):
    """Pick the single call to score from *pred_data*, or None if unusable.

    For a list, the first well-formed entry whose name equals *expected_name*
    wins; when none matches, the first element of the list is used.
    """
    if isinstance(pred_data, list):
        for call in pred_data:
            if _is_call(call) and call["name"] == expected_name:
                return call
        # An empty list has no fallback element to score.
        pred_data = pred_data[0] if pred_data else None
    return pred_data if _is_call(pred_data) else None


def _arguments_match(pred_args, gt_args):
    """Return ``(names_match, values_match)`` for two argument mappings."""
    if not isinstance(pred_args, dict) or not isinstance(gt_args, dict):
        # Non-mapping arguments (e.g. a raw string or None) earn no credit.
        return False, False
    if set(pred_args) != set(gt_args):
        return False, False
    values_match = all(
        normalize_value(pred_args[key]) == normalize_value(gt_args[key])
        for key in gt_args
    )
    return True, values_match


def evaluate_function_calls(test_data: list, eval_data: list) -> dict:
    """Compare predicted function calls with ground-truth calls.

    Args:
        test_data: Predictions, one per sample.  Each item is ``None``, a dict
            with ``"name"`` and optional ``"arguments"``, or a list of such
            dicts.
        eval_data: Ground-truth calls aligned with *test_data* by index; each
            is a dict with ``"name"`` and optional ``"arguments"``.

    Returns:
        A dict with the accuracies (rounded to 4 decimals, ``0.0`` when there
        are no samples) and the raw counters.  Every item of *test_data*
        counts towards ``total_samples``.  A prediction that is ``None``, an
        empty list, or lacks a ``"name"`` is counted in
        ``invalid_predictions``; a sample whose ground truth is missing or
        lacks a ``"name"`` is counted in ``invalid_ground_truth``.
    """
    total = 0
    function_correct = 0
    argument_name_correct = 0
    argument_value_correct = 0
    invalid_gt = 0
    invalid_pred = 0

    for index, pred_data in enumerate(test_data):
        total += 1

        if not validate_prediction(pred_data):
            invalid_pred += 1
            continue

        # eval_data may be shorter than test_data; treat the gap as bad GT.
        gt_data = eval_data[index] if index < len(eval_data) else None
        if not _is_call(gt_data):
            invalid_gt += 1
            continue

        call = _select_prediction(pred_data, gt_data["name"])
        if call is None:
            invalid_pred += 1
            continue

        if call["name"] != gt_data["name"]:
            continue
        function_correct += 1

        names_match, values_match = _arguments_match(
            call.get("arguments", {}), gt_data.get("arguments", {})
        )
        if names_match:
            argument_name_correct += 1
        if values_match:
            argument_value_correct += 1

    def ratio(count: int) -> float:
        # All accuracies share the same denominator: the number of samples.
        return round(count / total, 4) if total > 0 else 0.0

    return {
        "function_accuracy": ratio(function_correct),
        "argument_name_accuracy": ratio(argument_name_correct),
        "argument_value_accuracy": ratio(argument_value_correct),
        "total_samples": total,
        "function_correct": function_correct,
        "argument_name_correct": argument_name_correct,
        "argument_value_correct": argument_value_correct,
        "invalid_ground_truth": invalid_gt,
        "invalid_predictions": invalid_pred,
    }


# Ground truth used when the module is run as a script.
gt_example = [
    {
        "name": "getWeather",
        "arguments": {
            "city": "杭州",
            "extensions": "all",
        },
    }
]


def main() -> None:
    """Score the ``function_call`` of the last record in ``--input_path``."""
    parser = ArgumentParser()
    parser.add_argument("--input_path", type=str, required=True)
    args = parser.parse_args()

    # Explicit UTF-8: the data contains non-ASCII text and the platform
    # default encoding is not guaranteed to handle it.
    with open(args.input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    print(
        evaluate_function_calls(
            test_data=[data[-1].get("function_call")],
            eval_data=gt_example,
        )
    )


if __name__ == "__main__":
    main()