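"""Compare a model's accuracy between the HF and vLLM backends of lm-eval.

Runs the same tasks with both backends, then applies a two-tailed z-test to the
per-task accuracies to check whether the two implementations agree.
"""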
import argparse
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch

import lm_eval.evaluator
import lm_eval.models.utils
from lm_eval import tasks, utils

try:
    import scipy.stats
except ModuleNotFoundError:
    raise ModuleNotFoundError(
        "`scipy` is required for computing z-scores in hf-vllm comparisons via this script. "
        "Please install scipy via `pip install lm-eval[scipy]` or `pip install -e .[scipy]`."
    )


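# Disable HF tokenizers parallelism to avoid fork-related warnings during evaluation.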
os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger


def memory_stats():
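    """Log current CUDA memory allocated and reserved (in MiB)."""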
    eval_logger.info(
        f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MiB, "
        f"reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MiB"
    )


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
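    """Two-sample z-test on the accuracies of two evaluation results.

    Returns the z-score and the corresponding two-tailed p-value.
    """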
    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
    # Determining the p-value
    p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
    return Z, p_value


def print_results(
    data_to_print: List = None, results_dict: Dict = None, alpha: float = None
):
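    """Build a per-task comparison table of HF vs. vLLM accuracy.

    Adds each task's z-score, p-value and a marker for whether p exceeds
    `alpha` (i.e. no statistically significant difference), and returns the
    table as a DataFrame.
    """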
    model1_data = data_to_print[0]
    model2_data = data_to_print[1]
    table_data = []
    for task in model1_data.keys():
        row = {
            "Task": task,
            "HF Accuracy": model1_data[task]["acc,none"],
            "vLLM Accuracy": model2_data[task]["acc,none"],
            "HF StdErr": model1_data[task]["acc_stderr,none"],
            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
        }
        table_data.append(row)
    comparison_df = pd.DataFrame(table_data)
    comparison_df["Z-Score"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["z"]
    )
    comparison_df["P-Value"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["p_value"]
    )
    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
        lambda p: "✓" if p > alpha else "×"
    )
    return comparison_df


def parse_args():
    parser = argparse.ArgumentParser(
        description="Compare a model's accuracy between the HF and vLLM backends"
    )
    parser.add_argument(
        "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
    )
    parser.add_argument(
        "--hf_args", help="huggingface model args <arg>=<value>", default=""
    )
    parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument(
        "--limit",
        type=float,
        default=100,
        help="number of examples per task (a value < 1 is treated as a fraction of the task)",
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Significance level for two-tailed z-test",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--batch",
        type=str,
        default="8",
        help="batch size used for both backends",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Logging verbosity",
    )
    return parser.parse_args()


if __name__ == "__main__":
    tasks.initialize_tasks()
    args = parse_args()
    task_list = args.tasks.split(",")
    print(task_list)
    hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args
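    # Evaluate with the vLLM backend first, then free GPU memory before the HF run.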
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_list,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    memory_stats()
    lm_eval.models.utils.clear_torch_cache()
    eval_logger.info("Memory stats cleared")
    memory_stats()
    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_list,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    all_res = {}
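    # Pair up per-task results from both backends and run the z-test on each task.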
    for task1, task2 in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task1[0] == task2[0], "task ordering mismatch between HF and vLLM results"
        z, p_value = calculate_z_value(task1[1], task2[1])
        all_res[task1[0]] = {"z": z, "p_value": p_value}
    df = print_results(
        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
    )
    print(df)