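"""Compare a model run with lm-eval's Hugging Face ("hf") backend against the
same model run with the vLLM backend, and test whether the per-task accuracies
differ significantly (two-tailed z-test).
"""
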
import argparse
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import scipy.stats
import torch

import lm_eval.evaluator
import lm_eval.models.utils
from lm_eval import tasks, utils

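# Silence the Hugging Face tokenizers fork/parallelism warning.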
os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger


def memory_stats():
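    """Log current CUDA memory usage (allocated and reserved, in MiB)."""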
    eval_logger.info(
        f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.1f} MiB, "
        f"reserved: {torch.cuda.memory_reserved() / 1024**2:.1f} MiB"
    )


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
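    """Two-sample z-test on the accuracy of the two runs.

    Returns the z-statistic and the two-tailed p-value computed from the
    reported accuracies and their standard errors.
    """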
    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
    # Determining the p-value
    p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
    return Z, p_value


def print_results(
    data_to_print: List = None, results_dict: Dict = None, alpha: float = None
):
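    """Build a per-task comparison table of HF vs. vLLM accuracies.

    Adds the z-score, the p-value and a ``p > alpha`` marker for each task and
    returns the table as a pandas DataFrame.
    """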
    model1_data = data_to_print[0]
    model2_data = data_to_print[1]
    table_data = []
    for task in model1_data.keys():
        row = {
            "Task": task,
            "HF Accuracy": model1_data[task]["acc,none"],
            "vLLM Accuracy": model2_data[task]["acc,none"],
            "HF StdErr": model1_data[task]["acc_stderr,none"],
            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
        }
        table_data.append(row)
    comparison_df = pd.DataFrame(table_data)
    comparison_df["Z-Score"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["z"]
    )
    comparison_df["P-Value"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["p_value"]
    )
    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
        lambda p: "✓" if p > alpha else "×"
    )
    return comparison_df


def parse_args():
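    """Parse command-line options for the HF vs. vLLM comparison."""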
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
    )
    parser.add_argument(
        "--hf_args", help="huggingface model args <arg>=<value>", default=""
    )
    parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument(
        "--limit",
        type=float,
        default=100,
        help="Number of examples per task (if < 1, treated as a fraction of the task's examples)",
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Significance level for two-tailed z-test",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--batch",
        type=str,
        default=8,
        help="Batch size passed to both backends",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Logging verbosity",
    )
    return parser.parse_args()


if __name__ == "__main__":
    tasks.initialize_tasks()
    args = parse_args()

    task_names = args.tasks.split(",")
    eval_logger.info(f"Comparing tasks: {task_names}")

    # Only prepend a separating comma when extra model args were supplied,
    # so the model_args string stays clean when the flags are left empty.
    hf_args = f",{args.hf_args}" if args.hf_args else ""
    vllm_args = f",{args.vllm_args}" if args.vllm_args else ""

    # Run the vLLM backend first, then free its GPU memory before loading
    # the Hugging Face backend on the same device.
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_names,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    memory_stats()
    lm_eval.models.utils.clear_torch_cache()
    eval_logger.info("Torch cache cleared")
    memory_stats()

    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_names,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )

    # Compare the per-task accuracies of the two backends with a z-test.
    all_res = {}
    for (task_hf, res_hf), (task_vllm, res_vllm) in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task_hf == task_vllm, "Task ordering differs between the two runs"
        z, p_value = calculate_z_value(res_hf, res_vllm)
        all_res[task_hf] = {"z": z, "p_value": p_value}

    df = print_results(
        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
    )
    print(df)