"configs/datasets/winogrande/winogrande_gen_458220.py" did not exist on "14332e08fd39b1d0fe543f98f57c3e62f9e75fe8"
Commit 6f4f9e1c authored by lintangsutawika

resolved merge conflict

parents 0d5748b7 aed90773
import string
from functools import partial
def doc_to_text_base(alphabet, style, doc):
choices = doc["choices"]["text"]
@@ -13,19 +14,21 @@ def doc_to_text_base(alphabet, style, doc):
else:
choice_string = "{} {}"
doc_to_text = "\n\n".join([
"Question: "+doc["question"]+"\nAnswer:",
] + [
choice_string.format(i,j) for i,j in zip(letter_list, choices)
doc_to_text = "\n\n".join(
[
"Question: " + doc["question"] + "\nAnswer:",
]
+ [choice_string.format(i, j) for i, j in zip(letter_list, choices)]
)
return doc_to_text
# Full continuation
def choice_A(doc):
return doc["choices"]["text"]
# Letters only
def choice_B(alphabet, style, doc):
@@ -34,10 +37,11 @@ def choice_B(alphabet, style, doc):
letter_list = [style.format(letter) for letter in alphabet[0:num]]
if "\t" in style:
letter_list = [letter.replace("\t","") for letter in letter_list]
letter_list = [letter.replace("\t", "") for letter in letter_list]
return letter_list
# Letters + Full continuation
def choice_C(alphabet, style, doc):
@@ -46,9 +50,10 @@ def choice_C(alphabet, style, doc):
letter_list = [style.format(letter) for letter in alphabet[0:num]]
if "\t" not in style:
letter_list = [letter+" " for letter in letter_list]
letter_list = [letter + " " for letter in letter_list]
return [letter + choice for letter, choice in zip(letter_list, choices)]
return [letter+choice for letter, choice in zip(letter_list, choices)]
template_01 = partial(doc_to_text_base, string.ascii_lowercase, "({})")
choice_01a = choice_A
@@ -82,5 +87,3 @@ template_08 = partial(doc_to_text_base, string.ascii_uppercase, "{}\t")
choice_08a = choice_A
choice_08b = partial(choice_B, string.ascii_uppercase, "{}\t")
choice_08c = partial(choice_C, string.ascii_uppercase, "{}\t")
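For orientation, the template and choice helpers above are plain functools.partial bindings of the three base functions. A minimal usage sketch follows; the doc dict is a hypothetical example in the {"question": ..., "choices": {"text": [...]}} format these helpers read, and the expected outputs assume the elided lines derive the letter count from the number of choices:

import string

# Hypothetical document in the format doc_to_text_base/choice_A expect.
doc = {
    "question": "What color is a ripe banana?",
    "choices": {"text": ["yellow", "blue", "red", "green"]},
}

prompt = template_01(doc)   # == doc_to_text_base(string.ascii_lowercase, "({})", doc)
full = choice_01a(doc)      # == choice_A(doc) -> ["yellow", "blue", "red", "green"]
letters = choice_08b(doc)   # == choice_B(string.ascii_uppercase, "{}\t", doc) -> ["A", "B", "C", "D"]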
@@ -244,7 +244,7 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
idx=i,
**kwargs,
)
for i, choice in doc["choices"]
for i, choice in enumerate(doc["choices"])
]
return request_list
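The fix matters because doc["choices"] here is (assuming the surrounding SCROLLS code) a plain list of choice strings: tuple-unpacking its elements directly would fail, while enumerate supplies the index:

choices = ["yes", "no", "maybe"]
list(enumerate(choices))  # [(0, 'yes'), (1, 'no'), (2, 'maybe')]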
......
@@ -378,7 +378,9 @@ def make_table(result_dict, column: str = "results"):
if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f]
-                values.append([k, version, f, n, m, "%.4f" % v, "±", "%.4f" % se])
+                if se != "N/A":
+                    se = "%.4f" % se
+                values.append([k, version, f, n, m, "%.4f" % v, "±", se])
else:
values.append([k, version, f, n, m, "%.4f" % v, "", ""])
k = ""
@@ -669,3 +671,55 @@ def stop_sequences_criteria(
],
]
)
# from more_itertools
def divide(iterable, n) -> List[Iterator]:
"""Divide the elements from *iterable* into *n* parts, maintaining
order.
>>> group_1, group_2 = divide(2, [1, 2, 3, 4, 5, 6])
>>> list(group_1)
[1, 2, 3]
>>> list(group_2)
[4, 5, 6]
If the length of *iterable* is not evenly divisible by *n*, then the
length of the returned iterables will not be identical:
>>> children = divide(3, [1, 2, 3, 4, 5, 6, 7])
>>> [list(c) for c in children]
[[1, 2, 3], [4, 5], [6, 7]]
If the length of the iterable is smaller than n, then the last returned
iterables will be empty:
>>> children = divide(5, [1, 2, 3])
>>> [list(c) for c in children]
[[1], [2], [3], [], []]
This function will exhaust the iterable before returning and may require
significant storage. If order is not important, see :func:`distribute`,
which does not first pull the iterable into memory.
"""
if n < 1:
raise ValueError("n must be at least 1")
    try:
        iterable[:0]  # cheap check: does the input support slicing?
    except TypeError:
        seq = tuple(iterable)  # not sliceable, so materialize it first
    else:
        seq = iterable
    q, r = divmod(len(seq), n)  # first r parts get q + 1 items, the rest get q
ret = []
stop = 0
for i in range(1, n + 1):
start = stop
stop += q + 1 if i <= r else q
ret.append(iter(seq[start:stop]))
return ret
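A quick usage sketch. Note that this copy takes (iterable, n), flipped relative to more_itertools.divide(n, iterable); the doctests above were adjusted accordingly:

requests = list(range(10))
parts = divide(requests, 4)      # 10 items into 4 ordered parts
print([list(p) for p in parts])  # [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]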
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "1.0.0"
version = "0.4.0"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
......
import argparse
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import scipy.stats
import torch

import lm_eval.evaluator
import lm_eval.utils  # needed for eval_logger below
from lm_eval import tasks

os.environ["TOKENIZERS_PARALLELISM"] = "false"

eval_logger = lm_eval.utils.eval_logger
def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
acc1, acc2 = res1["acc,none"], res2["acc,none"]
st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
# Determining the p-value
p_value = 2 * scipy.stats.norm.sf(abs(Z)) # two-tailed test
return Z, p_value
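As an illustration with made-up numbers: two accuracies whose difference is small relative to their pooled standard error yield a large p-value, i.e. no significant difference:

res_a = {"acc,none": 0.752, "acc_stderr,none": 0.004}
res_b = {"acc,none": 0.748, "acc_stderr,none": 0.004}
z, p = calculate_z_value(res_a, res_b)
# Z = 0.004 / sqrt(0.004**2 + 0.004**2) ≈ 0.71, p ≈ 0.48
# -> not significant at alpha = 0.05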
def print_results(
    data_to_print: List[Dict], results_dict: Dict, alpha: float
) -> pd.DataFrame:
model1_data = data_to_print[0]
model2_data = data_to_print[1]
table_data = []
for task in model1_data.keys():
row = {
"Task": task,
"HF Accuracy": model1_data[task]["acc,none"],
"vLLM Accuracy": model2_data[task]["acc,none"],
"HF StdErr": model1_data[task]["acc_stderr,none"],
"vLLM StdErr": model2_data[task]["acc_stderr,none"],
}
table_data.append(row)
comparison_df = pd.DataFrame(table_data)
comparison_df["Z-Score"] = comparison_df["Task"].apply(
lambda task: results_dict[task]["z"]
)
comparison_df["P-Value"] = comparison_df["Task"].apply(
lambda task: results_dict[task]["p_value"]
)
comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(
lambda p: "✓" if p > alpha else "×"
)
return comparison_df
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare"
)
parser.add_argument(
"--hf_args", help="huggingface model args <arg>=<value>", default=""
)
parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
parser.add_argument(
"--limit",
type=float,
default=100,
)
parser.add_argument(
"--alpha",
type=float,
default=0.05,
help="Significance level for two-tailed z-test",
)
parser.add_argument(
"--device",
type=str,
default="cuda",
)
    parser.add_argument(
        "--batch",
        type=str,
        default="8",
    )
parser.add_argument(
"--verbosity",
type=str,
default="INFO",
help="Logging verbosity",
)
return parser.parse_args()
if __name__ == "__main__":
tasks.initialize_tasks()
args = parse_args()
tasks = args.tasks.split(",")
print(tasks)
hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args
results_vllm = lm_eval.evaluator.simple_evaluate(
model="vllm",
model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=task_list,
limit=args.limit,
device=args.device,
batch_size=args.batch,
)
torch.cuda.empty_cache()
results_hf = lm_eval.evaluator.simple_evaluate(
model="hf",
model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=task_list,
limit=args.limit,
device=args.device,
batch_size=args.batch,
)
all_res = {}
for task1, task2 in zip(
results_hf["results"].items(), results_vllm["results"].items()
):
assert task1[0] == task2[0]
z, p_value = calculate_z_value(task1[1], task2[1])
all_res[task1[0]] = {"z": z, "p_value": p_value}
df = print_results(
[results_hf["results"], results_vllm["results"]], all_res, args.alpha
)
print(df)
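Assuming the script is saved as model_comparator.py (the filename is an assumption; it is not shown in the diff), a typical run comparing the HF and vLLM backends might look like:

python model_comparator.py \
    --pretrained EleutherAI/pythia-70m \
    --tasks arc_easy,hellaswag \
    --limit 100 \
    --batch 8

This evaluates the same checkpoint under both backends and prints a per-task table with accuracies, standard errors, Z-scores, and whether p exceeds the chosen significance level.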