# auto_evaluation.py
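# Automatic evaluation driver for text-to-SQL checkpoints: for each checkpoint it
# runs infer.py to generate SQL predictions (greedy decoding and high-temperature
# sampling), then scores them on Spider, BIRD, or Spider 2.0 using greedy search,
# pass@k, and major voting, saving accuracy curves (PNG) and raw scores (JSON).
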
import os
import json
import argparse
import evaluate_bird
import evaluate_spider2
import evaluate_spider
import matplotlib.pyplot as plt
from tqdm import tqdm

def visualize(eval_name, acc_dict, ylabel, file_path):
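    """Plot accuracy against checkpoint index and save the figure to file_path.

    acc_dict maps checkpoint id -> accuracy. For Spider, each value is an
    [EX, TS] pair, so one curve is drawn per metric.
    """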
    plt.figure(figsize=(10, 6))

    ckpt_ids = list(range(len(acc_dict)))
    values = list(acc_dict.values())

    if isinstance(values[0], list):  # Spider reports two metrics: EX (execution) accuracy and TS (test-suite) accuracy
        num_lines = len(values[0])
        labels = ["EX", "TS"]
        assert num_lines == len(labels)
        for i in range(num_lines):
            line_values = [v[i] for v in values]
            plt.plot(ckpt_ids, line_values, marker='o', linestyle='-', label=labels[i])
    else:
        plt.plot(ckpt_ids, values, marker='o', linestyle='-', label="EX")

    plt.title(eval_name)
    plt.xlabel('ckpt-id')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.legend()

    plt.savefig(file_path)
    plt.close()

def save_evaluation_results(file_path, acc_dict):
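    """Write the accuracy dictionary to file_path as pretty-printed JSON."""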
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(acc_dict, indent=2, ensure_ascii=False))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_ckpt_dir", type = str, default = "./ckpts")
    parser.add_argument('--multiple_models', action='store_true', help='Evaluate multiple models from a folder.')
    parser.add_argument("--source", type = str, default = "bird")

    parser.add_argument("--visible_devices", type = str, default = "0,1")
    parser.add_argument("--input_file", type = str, help = "input file path (prompts)")
    parser.add_argument("--eval_name", type = str, help = "name of the evaluation set")
    parser.add_argument("--tensor_parallel_size", type = int, help = "the number of used GPUs", default = 1)
    parser.add_argument("--n", type = int, help = "sampling number", default = 16)

    parser.add_argument("--gold_file", type = str, help = "gold sql path")
    parser.add_argument("--db_path", type = str, help = "database path")
    parser.add_argument("--ts_db_path", type = str, default = "", help = "test suite database path (required by Spider)")
    parser.add_argument("--gold_result_dir", type = str, help = "gold sql execution results (required by Spider2.0)")
    parser.add_argument("--eval_standard", type = str, help = "evaluation standard (required by Spider2.0)")
    
    opt = parser.parse_args()
    print(opt)

    assert opt.source in ["spider", "bird", "spider2.0"]

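    # Multi-model mode: every subdirectory of --output_ckpt_dir is treated as a
    # checkpoint whose name follows a "<prefix>-<step>" pattern, so checkpoints
    # can be sorted by training step. Otherwise, --output_ckpt_dir itself is evaluated.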
    if opt.multiple_models:
        ckpt_ids = os.listdir(opt.output_ckpt_dir)
        ckpt_ids = sorted(ckpt_ids, key=lambda x: int(x.split("-")[1]))
        print(ckpt_ids)
    else:
        ckpt_ids = [""]

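    # Per-checkpoint accuracy for each decoding/evaluation strategy.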
    greedy_search_acc_dict = dict()
    pass_at_k_acc_dict = dict()
    major_voting_acc_dict = dict()

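    # results/<eval_name>/ stores raw model predictions; evaluation_results/<eval_name>/
    # stores the accuracy JSON files and corresponding plots.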
    os.makedirs(os.path.join("results", opt.eval_name), exist_ok=True)
    os.makedirs(os.path.join("evaluation_results", opt.eval_name), exist_ok=True)

    for ckpt_id in tqdm(ckpt_ids):
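        # For each checkpoint: (1) greedy decoding followed by evaluation, then
        # (2) sampling followed by pass@k and major-voting evaluation. Checkpoints
        # whose scores are already recorded in the accuracy dictionaries are skipped.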
        print("Evaluating ckpt:", ckpt_id)

        if ckpt_id not in greedy_search_acc_dict:
            # greedy decoding
            gs_pred_file = f"results/{opt.eval_name}/greedy_search_{ckpt_id}.json"
            greedy_search_cmd = f"CUDA_VISIBLE_DEVICES={opt.visible_devices} python3 infer.py \
                --pretrained_model_name_or_path {os.path.join(opt.output_ckpt_dir, ckpt_id)} \
                --input_file {opt.input_file} \
                --output_file {gs_pred_file} \
                --tensor_parallel_size {opt.tensor_parallel_size} \
                --n 1 \
                --temperature 0.0"
            os.system(greedy_search_cmd)

            # evaluate greedy search
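            # Each evaluator is invoked twice: the first call serves as a warm-up
            # run and only the second call's scores are recorded.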
            if opt.source == "spider2.0":
                # warm up
                evaluate_spider2.evaluate("greedy_search", opt.gold_result_dir, opt.eval_standard, 
                                        opt.gold_file, gs_pred_file, opt.db_path, True)
                # record evaluation results
                gs_acc, _ = evaluate_spider2.evaluate("greedy_search", opt.gold_result_dir, opt.eval_standard, 
                                        opt.gold_file, gs_pred_file, opt.db_path, True)
            elif opt.source == "bird":
                # warm up
                evaluate_bird.run_eval(opt.gold_file, gs_pred_file, opt.db_path, "greedy_search", True)
                # record evaluation results
                gs_acc, _ = evaluate_bird.run_eval(opt.gold_file, gs_pred_file, opt.db_path, "greedy_search", True)
            elif opt.source == "spider": # for "spider"
                # warm up
                evaluate_spider.run_spider_eval(opt.gold_file, gs_pred_file, opt.db_path, 
                                            opt.ts_db_path, "greedy_search", True)
                # record evaluation results
                ex_score, ts_score = evaluate_spider.run_spider_eval(opt.gold_file, gs_pred_file, opt.db_path, 
                                            opt.ts_db_path, "greedy_search", True)
                if ts_score is None:
                    gs_acc = ex_score
                else:
                    gs_acc = [ex_score, ts_score]
            
            greedy_search_acc_dict[ckpt_id] = gs_acc
            print(greedy_search_acc_dict)
            visualize(opt.eval_name, greedy_search_acc_dict, "greedy_search",
                os.path.join("evaluation_results", opt.eval_name, "greedy_search.png"))
            save_evaluation_results(os.path.join("evaluation_results", opt.eval_name, "greedy_search.json"), greedy_search_acc_dict)
        else:
            print(f"skip {ckpt_id} greedy search")

        if ckpt_id not in major_voting_acc_dict:
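            # Draw opt.n candidate SQL queries per question at temperature 0.8; the
            # same samples are reused for pass@k and major-voting evaluation.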
            # sampling
            sampling_pred_file = f"results/{opt.eval_name}/sampling_{ckpt_id}.json"
            sampling_cmd = f"CUDA_VISIBLE_DEVICES={opt.visible_devices} python3 infer.py \
                --pretrained_model_name_or_path {os.path.join(opt.output_ckpt_dir, ckpt_id)} \
                --input_file {opt.input_file} \
                --output_file {sampling_pred_file} \
                --tensor_parallel_size {opt.tensor_parallel_size} \
                --n {opt.n} \
                --temperature 0.8"
            os.system(sampling_cmd)

            # evaluate pass@k (we do not evaluate pass@k for spider and its variants)
            if opt.source in ["bird", "spider2.0"]:
                if opt.source == "spider2.0":
                    # warm up
                    evaluate_spider2.evaluate("pass@k", opt.gold_result_dir, opt.eval_standard, 
                        opt.gold_file, sampling_pred_file, opt.db_path, True)
                    # record evaluation results
                    pass_at_k_acc, _ = evaluate_spider2.evaluate("pass@k", opt.gold_result_dir, opt.eval_standard, 
                                            opt.gold_file, sampling_pred_file, opt.db_path, True)
                elif opt.source == "bird":
                    # warm up
                    evaluate_bird.run_eval(opt.gold_file, sampling_pred_file, opt.db_path, "pass@k", True)
                    # record evaluation results
                    pass_at_k_acc, _ = evaluate_bird.run_eval(opt.gold_file, sampling_pred_file, opt.db_path, "pass@k", True)
                pass_at_k_acc_dict[ckpt_id] = pass_at_k_acc
                print(pass_at_k_acc_dict)
                visualize(opt.eval_name, pass_at_k_acc_dict, "pass_at_k",
                    os.path.join("evaluation_results", opt.eval_name, "pass_at_k.png"))
                save_evaluation_results(os.path.join("evaluation_results", opt.eval_name, "pass_at_k.json"), pass_at_k_acc_dict)
            
            # evaluate major voting
            if opt.source == "spider2.0":
                # warm up
                evaluate_spider2.evaluate("major_voting", opt.gold_result_dir, opt.eval_standard, 
                                            opt.gold_file, sampling_pred_file, opt.db_path, True)
                # record evaluation results
                major_voting_acc, _ = evaluate_spider2.evaluate("major_voting", opt.gold_result_dir, opt.eval_standard, 
                                            opt.gold_file, sampling_pred_file, opt.db_path, True)
            elif opt.source == "bird":
                # warm up
                evaluate_bird.run_eval(opt.gold_file, sampling_pred_file, opt.db_path, "major_voting", True)
                # record evaluation results
                major_voting_acc, _ = evaluate_bird.run_eval(opt.gold_file, sampling_pred_file, opt.db_path, "major_voting", True)
            else: # spider
                # warm up
                evaluate_spider.run_spider_eval(opt.gold_file, sampling_pred_file, opt.db_path, 
                                            opt.ts_db_path, "major_voting", True)
                # record evaluation results
                ex_score, ts_score = evaluate_spider.run_spider_eval(opt.gold_file, sampling_pred_file, opt.db_path, 
                                            opt.ts_db_path, "major_voting", True)
                if ts_score is None:
                    major_voting_acc = ex_score
                else:
                    major_voting_acc = [ex_score, ts_score]
            
            major_voting_acc_dict[ckpt_id] = major_voting_acc
            print(major_voting_acc_dict)
            visualize(opt.eval_name, major_voting_acc_dict, "major_voting",
                os.path.join("evaluation_results", opt.eval_name, "major_voting.png"))
            save_evaluation_results(os.path.join("evaluation_results", opt.eval_name, "major_voting.json"), major_voting_acc_dict)
        else:
            print(f"skip {ckpt_id} pass at k and major voting")