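"""Compare a model's accuracy between the HuggingFace and vLLM backends of lm-eval.

Runs the same tasks with both backends on the same checkpoint and applies a
two-tailed z-test to each task's accuracy to flag significant differences.

Example (using the script's defaults):
    python model_comparator.py --pretrained EleutherAI/pythia-70m --tasks arc_easy,hellaswag
"""
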
import argparse
import logging
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch

import lm_eval.evaluator
import lm_eval.models.utils
from lm_eval import tasks


os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = logging.getLogger(__name__)


def memory_stats():
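    """Log current CUDA memory usage in MiB (assumes a CUDA device is available)."""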
    eval_logger.info(
        f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2}, reserved: {torch.cuda.memory_reserved() // 1024**2}"
    )


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
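    """Two-sample z-test on per-task accuracies.

    Treats the two accuracy estimates as independent normals, so
    Z = (acc1 - acc2) / sqrt(se1^2 + se2^2), and the p-value is the
    two-tailed tail mass of the standard normal.
    """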
    from scipy.stats import norm

    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
    # Two-tailed p-value from the standard normal survival function
    p_value = 2 * norm.sf(abs(Z))
    return Z, p_value


def print_results(
    data_to_print: List = None, results_dict: Dict = None, alpha: float = None
):
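    """Build a per-task comparison DataFrame (accuracy, stderr, z, p, significance)."""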
    model1_data = data_to_print[0]
    model2_data = data_to_print[1]
    table_data = []
    for task in model1_data.keys():
        row = {
            "Task": task,
            "HF Accuracy": model1_data[task]["acc,none"],
            "vLLM Accuracy": model2_data[task]["acc,none"],
            "HF StdErr": model1_data[task]["acc_stderr,none"],
            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
        }
        table_data.append(row)
    comparison_df = pd.DataFrame(table_data)
    comparison_df["Z-Score"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["z"]
    )
    comparison_df["P-Value"] = comparison_df["Task"].apply(
        lambda task: results_dict[task]["p_value"]
    )
    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
        lambda p: "✓" if p > alpha else "×"
    )
    return comparison_df


def parse_args():
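    """CLI flags; the model-args strings are passed through to lm-eval untouched."""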
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
    )
    parser.add_argument(
        "--hf_args", help="comma-separated HuggingFace model args (<arg>=<value>)", default=""
    )
    parser.add_argument(
        "--vllm_args", help="comma-separated vLLM model args (<arg>=<value>)", default=""
    )
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument(
        "--limit",
        type=float,
        default=100,
        help="number of examples per task (if < 1, treated as a fraction of the dataset)",
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.05,
        help="Significance level for two-tailed z-test",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--batch",
        type=str,
        default="8",
        help="batch size (lm-eval also accepts 'auto')",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Logging verbosity",
    )
    return parser.parse_args()


if __name__ == "__main__":
    tasks.initialize_tasks()
    args = parse_args()
    eval_logger.setLevel(args.verbosity)
    # Avoid shadowing the imported `tasks` module.
    task_list = args.tasks.split(",")
    eval_logger.info(f"Comparing tasks: {task_list}")
    hf_args = "," + args.hf_args if args.hf_args else ""
    vllm_args = "," + args.vllm_args if args.vllm_args else ""
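    # Run the vLLM backend first, then free its GPU memory before the HF run.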
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_list,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    memory_stats()
    lm_eval.models.utils.clear_torch_cache()
    eval_logger.info("Memory stats cleared")
    memory_stats()
    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_list,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    all_res = {}
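    # Both runs evaluated the same task list, so the per-task results should
    # pair up one-to-one; the assert below guards that ordering assumption.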
    for task1, task2 in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task1[0] == task2[0], "task ordering differs between HF and vLLM results"
        z, p_value = calculate_z_value(task1[1], task2[1])
        all_res[task1[0]] = {"z": z, "p_value": p_value}
    df = print_results(
        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
    )
    print(df)