model_comparator.py
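"""Compare an lm-eval run of one pretrained model across the HF and vLLM backends.

Runs the same tasks through lm-eval's "hf" and "vllm" model implementations and
applies a two-tailed z-test to each task's accuracy to flag statistically
significant disagreements between the two backends.

Example invocation (a sketch; assumes a CUDA device and that the vllm extra of
lm-eval is installed):

    python model_comparator.py --pretrained EleutherAI/pythia-70m \
        --tasks arc_easy,hellaswag --limit 100 --batch 8 --alpha 0.05
"""
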
import argparse
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch

import lm_eval.evaluator
import lm_eval.models.utils
from lm_eval import tasks, utils


os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger


def memory_stats():
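    """Log current CUDA memory usage (allocated and reserved, in MiB)."""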
    eval_logger.info(
        f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2}, reserved: {torch.cuda.memory_reserved() // 1024**2}"
    )


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
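    """Two-proportion z-test on the accuracies reported by the two backends.

    Z = (acc1 - acc2) / sqrt(stderr1**2 + stderr2**2), with a two-tailed
    p-value computed from the standard normal survival function.
    """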
    from scipy.stats import norm

    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
    # Determining the p-value
    p_value = 2 * norm.sf(abs(Z))  # two-tailed test
    return Z, p_value


def print_results(
    data_to_print: List = None, results_dict: Dict = None, alpha: float = None
):
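    """Build a per-task comparison table of HF vs. vLLM results.

    Columns hold each backend's accuracy and standard error, the z-score and
    p-value from `calculate_z_value`, and a check mark where p > alpha
    (i.e. no statistically significant difference between the backends).
    """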
    model1_data = data_to_print[0]
    model2_data = data_to_print[1]
    table_data = []
    for task in model1_data.keys():
        row = {
            "Task": task,
            "HF Accuracy": model1_data[task]["acc,none"],
            "vLLM Accuracy": model2_data[task]["acc,none"],
            "HF StdErr": model1_data[task]["acc_stderr,none"],
            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
        }
        table_data.append(row)
    comparison_df = pd.DataFrame(table_data)
    comparison_df["Z-Score"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["z"]
    )
    comparison_df["P-Value"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["p_value"]
    )
    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
        lambda p: "✓" if p > alpha else "×"
    )
    return comparison_df


def parse_args():
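    """Parse command-line options for the HF-vs-vLLM comparison run."""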
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
    )
    parser.add_argument(
        "--hf_args", help="huggingface model args <arg>=<value>", default=""
    )
    parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument(
        "--limit",
        type=float,
        default=100,
        help="Limit the number of examples per task (a float < 1 is treated as a fraction of the dataset)",
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Significance level for two-tailed z-test",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--batch",
        type=str,
        default=8,
        help="Batch size passed to both backends",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Logging verbosity",
    )
    return parser.parse_args()


if __name__ == "__main__":
    tasks.initialize_tasks()
    args = parse_args()
    task_names = args.tasks.split(",")
    print(task_names)
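    # Prepend "," so any user-supplied backend args are appended after "pretrained=...".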
    hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_names,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
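    # Log GPU memory after the vLLM run, then free cached memory before loading the HF model.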
    memory_stats()
    lm_eval.models.utils.clear_torch_cache()
    eval_logger.info("Memory stats cleared")
    memory_stats()
    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_names,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    all_res = {}
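    # Pair up per-task results from both backends and compute z / p for each task.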
    for task1, task2 in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task1[0] == task2[0]
        z, p_value = calculate_z_value(task1[1], task2[1])
        all_res[task1[0]] = {"z": z, "p_value": p_value}
    df = print_results(
        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
    )
    print(df)