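"""model_comparator.py: compare a model's Hugging Face and vLLM backends with lm-eval.

Both backends are evaluated on the same tasks; per-task accuracies are then
compared with a two-tailed z-test to flag statistically significant differences.

Example invocation (flags correspond to parse_args() below):
    python model_comparator.py --pretrained EleutherAI/pythia-70m \
        --tasks arc_easy,hellaswag --limit 100 --device cuda
"""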
import argparse
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import scipy.stats
import torch

import lm_eval.evaluator
from lm_eval import tasks, utils


os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger


def memory_stats():
    """Log current CUDA memory usage in MiB."""
    eval_logger.info(
        f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MiB, "
        f"reserved: {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MiB"
    )


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
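    """Two-sample z-test comparing the accuracies of two result dicts.

    The accuracies are treated as independent estimates with the reported
    standard errors; returns the z-statistic and the two-tailed p-value.
    """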
    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
    # Determining the p-value
    p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
    return Z, p_value


def print_results(
    data_to_print: List = None, results_dict: Dict = None, alpha: float = None
):
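    """Assemble a per-task comparison table of HF vs. vLLM results.

    The last column marks tasks where p > alpha, i.e. where the two backends
    do NOT differ significantly at the chosen level ("✓" means no significant
    difference, "×" means a significant one).
    """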
    model1_data = data_to_print[0]
    model2_data = data_to_print[1]
    table_data = []
    for task in model1_data.keys():
        row = {
            "Task": task,
            "HF Accuracy": model1_data[task]["acc,none"],
            "vLLM Accuracy": model2_data[task]["acc,none"],
            "HF StdErr": model1_data[task]["acc_stderr,none"],
            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
        }
        table_data.append(row)
    comparison_df = pd.DataFrame(table_data)
    comparison_df["Z-Score"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["z"]
    )
    comparison_df["P-Value"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["p_value"]
    )
    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
        lambda p: "✓" if p > alpha else "×"
    )
    return comparison_df


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
    )
    parser.add_argument(
        "--hf_args", help="huggingface model args <arg>=<value>", default=""
    )
    parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument(
        "--limit",
        type=float,
        default=100,
        help="Number of examples per task (a value < 1 is interpreted as a fraction of the task's examples)",
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Significance level for two-tailed z-test",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--batch",
        type=str,
        default=8,
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Logging verbosity",
    )
    return parser.parse_args()


if __name__ == "__main__":
    tasks.initialize_tasks()
    args = parse_args()
    task_list = args.tasks.split(",")
    eval_logger.info(f"Tasks: {task_list}")
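    # Extra backend-specific model_args are appended to the base
    # "pretrained=..." string, hence the leading comma.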
    hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_list,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
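    # Log GPU usage, then release cached GPU memory from the vLLM run
    # before loading the HF model.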
    memory_stats()
    utils.clear_torch_cache()
    eval_logger.info("Memory stats cleared")
    memory_stats()
    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_list,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    all_res = {}
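    # Pair per-task results from both runs (assumed to come back in the same
    # task order) and compute a z-statistic / p-value for each task.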
    for task1, task2 in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task1[0] == task2[0]
        z, p_value = calculate_z_value(task1[1], task2[1])
        all_res[task1[0]] = {"z": z, "p_value": p_value}
    df = print_results(
        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
    )
    print(df)