# vllm_hf_equiv.py
"""Sanity-check that the `hf` and `vllm` backends of lm-evaluation-harness produce
statistically equivalent accuracies for the same pretrained model."""

import argparse
from typing import Dict, Tuple

import numpy as np
import scipy.stats

import lm_eval.evaluator
import lm_eval.utils
from lm_eval import tasks

eval_logger = lm_eval.utils.eval_logger


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
    """Two-sample z-test on the accuracies of two evaluation runs.

    The harness already reports `acc_stderr,none` as the standard error of the
    mean accuracy, so the two errors are combined in quadrature directly rather
    than being divided by the sample count a second time.
    """
    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt(st_err1**2 + st_err2**2)
    # Two-tailed p-value under the standard normal distribution.
    p_value = 2 * scipy.stats.norm.sf(abs(Z))
    return Z, p_value
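

# A quick sanity check of calculate_z_value with made-up numbers (illustrative
# values only, not harness output): two runs with identical accuracy and equal
# standard errors give Z == 0 and a two-tailed p_value of 1.0.
#
#   calculate_z_value(
#       {"acc,none": 0.5, "acc_stderr,none": 0.05},
#       {"acc,none": 0.5, "acc_stderr,none": 0.05},
#   )  # -> (0.0, 1.0)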


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
    )
    parser.add_argument(
        "--hf_args", help="huggingface model args <arg>=<value>", default=""
    )
    parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
    parser.add_argument(
        "--tasks",
        type=str,
        default="arc_easy,hellaswag",
        help="comma-separated list of task names",
    )
    parser.add_argument(
        "--samples",
        type=int,
        default=30,
        help="number of samples per task (passed as `limit`)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--batch",
        type=int,
        default=8,
        help="batch size used for both backends",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Logging verbosity",
    )
    return parser.parse_args()
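

# Example invocation (a sketch: assumes `lm_eval`, `vllm`, and a CUDA device are
# available; the model and tasks below are simply the script defaults):
#
#   python vllm_hf_equiv.py --pretrained EleutherAI/pythia-70m \
#       --tasks arc_easy,hellaswag --samples 100 --batch 8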


if __name__ == "__main__":
    # Register all harness tasks before resolving the requested task list.
    tasks.initialize_tasks()
    args = parse_args()

    # Apply the requested logging verbosity (e.g. INFO, DEBUG).
    eval_logger.setLevel(args.verbosity)

    task_names = args.tasks.split(",")
    eval_logger.info(f"Comparing backends on tasks: {task_names}")

    # Only append extra model args when they were provided, so the model_args
    # string does not end with a stray comma.
    hf_args = f",{args.hf_args}" if args.hf_args else ""
    vllm_args = f",{args.vllm_args}" if args.vllm_args else ""

    # Evaluate the same checkpoint on the same samples with the HF backend...
    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_names,
        limit=args.samples,
        device=args.device,
        batch_size=args.batch,
    )
    # ...and with the vLLM backend.
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_names,
        limit=args.samples,
        device=args.device,
        batch_size=args.batch,
    )

    # For each task, the two backends should be statistically indistinguishable:
    # fail loudly if any task shows a significant accuracy difference.
    all_res = {}
    for (task_hf, res_hf), (task_vllm, res_vllm) in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task_hf == task_vllm, f"task mismatch: {task_hf} vs {task_vllm}"
        z, p_value = calculate_z_value(res_hf, res_vllm)
        all_res[task_hf] = {"z": z, "p_value": p_value}
        assert (
            p_value > 0.05
        ), f"{task_hf}: HF and vLLM accuracies differ significantly (p={p_value:.4f})"
    eval_logger.info(all_res)