model_comparator.py
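# Example usage (assumes vLLM is installed alongside lm-evaluation-harness; flags
# and defaults are taken from parse_args() below):
#   python model_comparator.py --pretrained EleutherAI/pythia-70m --tasks arc_easy,hellaswag --limit 100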
import argparse
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import scipy.stats
import torch

import lm_eval.evaluator
from lm_eval import tasks, utils

os.environ["TOKENIZERS_PARALLELISM"] = "false"

eval_logger = utils.eval_logger


def memory_stats():
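    """Log currently allocated and reserved CUDA memory in MiB."""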
    eval_logger.info(
        f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MiB, "
        f"reserved: {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MiB"
    )


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
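    """Two-tailed z-test comparing the two accuracy estimates via their standard errors."""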
    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
    # Determining the p-value
    p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
    return Z, p_value


def print_results(
    data_to_print: List = None, results_dict: Dict = None, alpha: float = None
) -> pd.DataFrame:
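    """Tabulate per-task accuracy, stderr, z-score, and p-value for HF vs. vLLM.

    The final column flags tasks where p > alpha, i.e. where no statistically
    significant difference was detected at the chosen level.
    """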
    model1_data = data_to_print[0]
    model2_data = data_to_print[1]
    table_data = []
    for task in model1_data.keys():
        row = {
            "Task": task,
            "HF Accuracy": model1_data[task]["acc,none"],
            "vLLM Accuracy": model2_data[task]["acc,none"],
            "HF StdErr": model1_data[task]["acc_stderr,none"],
            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
        }
        table_data.append(row)
    comparison_df = pd.DataFrame(table_data)
    comparison_df["Z-Score"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["z"]
    )
    comparison_df["P-Value"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["p_value"]
    )
    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
        lambda p: "✓" if p > alpha else "×"
    )
    return comparison_df


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
    )
    parser.add_argument(
        "--hf_args", help="huggingface model args <arg>=<value>", default=""
    )
    parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument(
        "--limit",
        type=float,
        default=100,
        help="number of documents per task; a value below 1 is treated as a fraction of the task",
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Significance level for two-tailed z-test",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--batch",
        type=str,
        default=8,
        help="batch size passed to both backends",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Logging verbosity",
    )
    return parser.parse_args()


if __name__ == "__main__":
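    # Populate the task registry before resolving the requested task names.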
    tasks.initialize_tasks()
    args = parse_args()
    # Use a separate name so the imported `tasks` module is not shadowed.
    task_names = args.tasks.split(",")
    print(task_names)
    # Prepend a separating comma only when extra model args were supplied.
    hf_args = "," + args.hf_args if args.hf_args else ""
    vllm_args = "," + args.vllm_args if args.vllm_args else ""
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_names,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
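    # Free GPU memory left over from the vLLM run before loading the HF model.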
    memory_stats()
    utils.clear_torch_cache()
    eval_logger.info("Memory stats cleared")
    memory_stats()
    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_names,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    all_res = {}
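    # Pair up per-task results from both backends and compute z-scores and p-values.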
    for task1, task2 in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task1[0] == task2[0]
        z, p_value = calculate_z_value(task1[1], task2[1])
        all_res[task1[0]] = {"z": z, "p_value": p_value}
    df = print_results(
        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
    )
    print(df)