# model_comparator.py
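"""Compare one checkpoint across the HuggingFace ("hf") and vLLM ("vllm")
backends of lm-evaluation-harness, running a two-tailed z-test on the
per-task accuracies to flag statistically significant disagreements.

Example invocation (flags as defined in parse_args below):
    python model_comparator.py --pretrained EleutherAI/pythia-70m \
        --tasks arc_easy,hellaswag --limit 100 --batch 8
"""
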
import argparse
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import scipy.stats
import torch

import lm_eval.evaluator
import lm_eval.utils
from lm_eval import tasks

os.environ["TOKENIZERS_PARALLELISM"] = "false"

eval_logger = lm_eval.utils.eval_logger


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
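    """Two-sample z-test on accuracy between two per-task result dicts.

    Each dict is expected to carry lm-eval's "acc,none" and "acc_stderr,none"
    keys; returns the z-statistic and the two-tailed p-value.
    """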
    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
    # Determining the p-value
    p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
    return Z, p_value


def print_results(
    data_to_print: List[Dict], results_dict: Dict, alpha: float
) -> pd.DataFrame:
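    """Build a per-task comparison table: accuracy and stderr for both backends,
    plus the z-score, p-value, and a ✓/× column marking whether p > alpha
    (i.e. the two backends are statistically indistinguishable at that level).
    """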
    model1_data = data_to_print[0]
    model2_data = data_to_print[1]
    table_data = []
    for task in model1_data.keys():
        row = {
            "Task": task,
            "HF Accuracy": model1_data[task]["acc,none"],
            "vLLM Accuracy": model2_data[task]["acc,none"],
            "HF StdErr": model1_data[task]["acc_stderr,none"],
            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
        }
        table_data.append(row)
    comparison_df = pd.DataFrame(table_data)
    comparison_df["Z-Score"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["z"]
    )
    comparison_df["P-Value"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["p_value"]
    )
    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
        lambda p: "✓" if p > alpha else "×"
    )
    return comparison_df


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
    )
    parser.add_argument(
        "--hf_args", help="huggingface model args <arg>=<value>", default=""
    )
    parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument(
63
        "--limit",
baberabb's avatar
baberabb committed
64
        type=int,
65
66
67
68
69
70
71
        default=100,
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Significance level for two-tailed z-test",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--batch",
        type=int,
        default=8,
        help="batch size for both backends",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Logging verbosity",
    )
    return parser.parse_args()


if __name__ == "__main__":
    tasks.initialize_tasks()
    args = parse_args()
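    # Apply --verbosity; eval_logger is assumed to be a standard logging.Logger
    # (as in lm_eval.utils), so setLevel accepts a level name like "INFO".
    eval_logger.setLevel(args.verbosity)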
    task_list = args.tasks.split(",")
    print(task_list)
    hf_args = "," + args.hf_args if args.hf_args else ""
    vllm_args = "," + args.vllm_args if args.vllm_args else ""
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_list,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
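    # Release cached GPU memory from the vLLM run before loading the HF model.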
    torch.cuda.empty_cache()
    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_list,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    all_res = {}
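    # Per-task z-test: both result dicts are expected to iterate over tasks in
    # the same order (checked by the assert below).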
    for task1, task2 in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task1[0] == task2[0], "task order mismatch between backends"
        z, p_value = calculate_z_value(task1[1], task2[1])
        all_res[task1[0]] = {"z": z, "p_value": p_value}
    df = print_results(
        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
    )
    print(df)