Commit 8b74beaa authored by baberabb

fix z-score and print; rename script

parent b99ad796
@@ -3,20 +3,52 @@ import numpy as np
 import lm_eval.evaluator
 from lm_eval import tasks
 import scipy.stats
-from typing import Tuple, Dict
+from typing import Tuple, Dict, List
+import pandas as pd
+import torch
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 eval_logger = lm_eval.utils.eval_logger
 
 
-def calculate_z_value(res1: Dict, res2: Dict, limit: int) -> Tuple[float, float]:
+def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
     acc1, acc2 = res1["acc,none"], res2["acc,none"]
-    st_err1, st_err2 = res1["acc_stderr"], res2["acc_stderr"]
-    Z = (acc1 - acc2) / np.sqrt((st_err1**2 / limit) + (st_err2**2 / limit))
+    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
+    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
     # Determining the p-value
     p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
     return Z, p_value
 
 
+def print_results(
+    data_to_print: List = None, results_dict: Dict = None, alpha: float = None
+):
+    model1_data = data_to_print[0]
+    model2_data = data_to_print[1]
+    table_data = []
+    for task in model1_data.keys():
+        row = {
+            "Task": task,
+            "HF Accuracy": model1_data[task]["acc,none"],
+            "vLLM Accuracy": model2_data[task]["acc,none"],
+            "HF StdErr": model1_data[task]["acc_stderr,none"],
+            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
+        }
+        table_data.append(row)
+    comparison_df = pd.DataFrame(table_data)
+    comparison_df["Z-Score"] = comparison_df["Task"].apply(
+        lambda task: results_dict[task]["z"]
+    )
+    comparison_df["P-Value"] = comparison_df["Task"].apply(
+        lambda task: results_dict[task]["p_value"]
+    )
+    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
+        lambda p: "✓" if p > alpha else "×"
+    )
+    return comparison_df
+
+
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
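Note on the z-score fix: the `acc_stderr,none` values the harness reports are already standard errors, i.e. they scale like sqrt(p * (1 - p) / n), so the sample count is baked in once. The old code divided the squared standard errors by `limit` a second time, which shrank the denominator and inflated Z by a factor of sqrt(limit), making the old `assert p_value > 0.05` fail even when the two backends genuinely agreed. A minimal sketch of the corrected two-sample z-test, with made-up accuracies and a hypothetical sample count n:

import numpy as np
import scipy.stats

n = 100                           # hypothetical number of samples per task
p1, p2 = 0.78, 0.74               # hypothetical accuracies for the two backends
se1 = np.sqrt(p1 * (1 - p1) / n)  # standard errors already embed n once
se2 = np.sqrt(p2 * (1 - p2) / n)

Z = (p1 - p2) / np.sqrt(se1**2 + se2**2)   # corrected: no extra division by n
p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
print(Z, p_value)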
@@ -28,9 +60,15 @@ def parse_args():
     parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
     parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
     parser.add_argument(
-        "--samples",
+        "--limit",
         type=int,
-        default=30,
+        default=100,
+    )
+    parser.add_argument(
+        "--alpha",
+        type=float,
+        default=0.05,
+        help="Significance level for two-tailed z-test",
     )
     parser.add_argument(
         "--device",
@@ -56,21 +94,21 @@ if __name__ == "__main__":
     args = parse_args()
     tasks = args.tasks.split(",")
     print(tasks)
-    hf_args = "," + args.hf_args
-    vllm_args = "," + args.vllm_args
-    results_hf = lm_eval.evaluator.simple_evaluate(
-        model="hf",
-        model_args=f"pretrained={args.pretrained}" + hf_args,
+    hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args
+    results_vllm = lm_eval.evaluator.simple_evaluate(
+        model="vllm",
+        model_args=f"pretrained={args.pretrained}" + vllm_args,
         tasks=tasks,
-        limit=args.samples,
+        limit=args.limit,
         device=args.device,
         batch_size=args.batch,
     )
-    results_vllm = lm_eval.evaluator.simple_evaluate(
-        model="vllm",
-        model_args=f"pretrained={args.pretrained}" + vllm_args,
+    torch.cuda.empty_cache()
+    results_hf = lm_eval.evaluator.simple_evaluate(
+        model="hf",
+        model_args=f"pretrained={args.pretrained}" + hf_args,
         tasks=tasks,
-        limit=args.samples,
+        limit=args.limit,
         device=args.device,
         batch_size=args.batch,
     )
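Note on the reordering: running the vLLM backend first and calling torch.cuda.empty_cache() before lm_eval loads the HF model keeps both copies of the weights from being resident on the GPU at once. A sketch of that handoff pattern, assuming a single CUDA device (release_gpu_memory is an illustrative helper, not part of the commit):

import gc
import torch

def release_gpu_memory() -> None:
    gc.collect()                  # drop unreachable Python references first
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached allocator blocks to the driver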
@@ -79,7 +117,9 @@ if __name__ == "__main__":
         results_hf["results"].items(), results_vllm["results"].items()
     ):
         assert task1[0] == task2[0]
-        z, p_value = calculate_z_value(task1[1], task2[1], args.samples)
+        z, p_value = calculate_z_value(task1[1], task2[1])
         all_res[task1[0]] = {"z": z, "p_value": p_value}
-        assert p_value > 0.05
-    eval_logger.info(all_res)
+    df = print_results(
+        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
+    )
+    print(df)
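For reference, a hypothetical round-trip through the new reporting path, with calculate_z_value and print_results from the hunks above in scope and toy numbers used purely for illustration:

toy_hf = {"arc_easy": {"acc,none": 0.75, "acc_stderr,none": 0.02}}
toy_vllm = {"arc_easy": {"acc,none": 0.74, "acc_stderr,none": 0.02}}

z, p = calculate_z_value(toy_hf["arc_easy"], toy_vllm["arc_easy"])
df = print_results([toy_hf, toy_vllm], {"arc_easy": {"z": z, "p_value": p}}, 0.05)
print(df)  # one row per task: accuracies, stderrs, Z-Score, P-Value, p > 0.05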