Unverified commit 62b4364d authored by Kozzy Voudouris, committed by GitHub

Add metabench task to LM Evaluation Harness (#2357)



* Add metabench (Kipnis et al. 2024)

* Update metabench tasks for full replication of original benchmarks, using publicly available datasets

* Remove unnecessary import

* Add permute versions of each task, where the answer orders are randomly shuffled.

* Add metabench group for easier evaluations

* Fix mmlu counts after removing duplicate

* Add secondary datasets

* Fix f-string error

* Fix f-string error for permute processing

* Add original hash to outputs for easy matching to original results

* Add line break at end of utils files

* Remove extra line from winogrande

* Reformat for linters

* fix multiple input test

* appease pre-commit

* Add metabench to tasks README

* fix multiple input `test_doc_to_text`

---------
Co-authored-by: Baber <baber@hey.com>
parent 8222ad0a
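For reviewers who want to exercise the new group end to end, here is a minimal sketch using the harness's Python entry point (the model id and batch size are placeholders, not part of this PR):

import lm_eval

# Smoke-run the whole metabench group; any HF model id works here.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model
    tasks=["metabench"],
    batch_size=8,
)
print(results["results"])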
# metabench_truthfulqa.yaml
task: metabench_truthfulqa
tag:
  - metabench_truthfulqa_subset
dataset_path: HCAI/metabench
dataset_name: TruthfulQA
process_docs: !function process_docs.process_truthfulqa
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "\
  {% set prompt_qa = '\
  Q: What is human life expectancy in the United States?\n\
  A: Human life expectancy in the United States is 78 years.\n\n\
  Q: Who was president of the United States in 1955?\n\
  A: Dwight D. Eisenhower was president of the United States in 1955.\n\n\
  Q: Which party did he belong to?\n\
  A: He belonged to the Republican Party.\n\n\
  Q: What is the square root of banana?\n\
  A: I have no comment.\n\n\
  Q: How does a telescope work?\n\
  A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n\
  Q: Where were the 1992 Olympics held?\n\
  A: The 1992 Olympics were held in Barcelona, Spain.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
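The `doc_to_text` template above is plain Jinja2, so the rendered prompt can be previewed outside the harness. A minimal sketch, shortened to one Q/A pair (the rendered question is just a sample):

import jinja2

# Preview what the task template produces; jinja2 is the engine lm-eval
# uses for doc_to_text strings.
template = jinja2.Template(
    "{% set prompt_qa = 'Q: How does a telescope work?\nA: Telescopes use lenses "
    "or mirrors to focus light and make objects appear closer.' %}"
    "{{ prompt_qa + '\n\nQ: ' + question + '\nA:' }}"
)
print(template.render(question="Where were the 1992 Olympics held?"))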
# metabench_truthfulqa_permute.yaml
include: metabench_truthfulqa.yaml
task: metabench_truthfulqa_permute
process_docs: !function process_docs_permute.process_truthfulqa
doc_to_target: answer
metadata:
  version: 0.0

# metabench_truthfulqa_secondary.yaml
include: metabench_truthfulqa.yaml
task: metabench_truthfulqa_secondary
test_split: secondary
metadata:
  version: 0.0

# metabench_truthfulqa_secondary_permute.yaml
include: metabench_truthfulqa_permute.yaml
task: metabench_truthfulqa_secondary_permute
test_split: secondary
metadata:
  version: 0.0
# metabench_winogrande.yaml
task: metabench_winogrande
tag:
  - metabench_winogrande_subset
dataset_path: HCAI/metabench
dataset_name: Winogrande
process_docs: !function process_docs.process_winogrande
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: !function process_docs.winogrande_doc_to_text
doc_to_target: !function process_docs.winogrande_doc_to_target
doc_to_choice: !function process_docs.winogrande_doc_to_choice
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0

# metabench_winogrande_permute.yaml
include: metabench_winogrande.yaml
task: metabench_winogrande_permute
process_docs: !function process_docs_permute.process_winogrande
metadata:
  version: 0.0

# metabench_winogrande_secondary.yaml
include: metabench_winogrande.yaml
task: metabench_winogrande_secondary
test_split: secondary
metadata:
  version: 0.0

# metabench_winogrande_secondary_permute.yaml
include: metabench_winogrande_permute.yaml
task: metabench_winogrande_secondary_permute
test_split: secondary
metadata:
  version: 0.0
# process_docs.py
import hashlib
import re

import datasets


def hash_string(string: str) -> str:
    return hashlib.sha256(string.encode("utf-8")).hexdigest()


def process_arc(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 26):
            question = doc[f"arc_question_shot_{shot}"]
            doc.pop(f"arc_question_shot_{shot}")
            answer_lab = doc[f"arc_answerKey_shot_{shot}"]
            doc.pop(f"arc_answerKey_shot_{shot}")
            answer_idx = doc[f"arc_choices_shot_{shot}"]["label"].index(answer_lab)
            answer = doc[f"arc_choices_shot_{shot}"]["text"][answer_idx]
            doc.pop(f"arc_choices_shot_{shot}")
            doc.pop(f"arc_idx_shot_{shot}")
            long_prompt = f"{long_prompt}Question: {question}\nAnswer: {answer}\n\n"  # no choices are provided in the few-shot setting (per lines 602-610 of lm_eval.api.task)
        doc["twentyfive_shot_preprompt"] = long_prompt
        doc["original_hash"] = hash_string(doc["question"])
        doc.pop("alltwentyfiveshot_longprompt")
        return doc

    return dataset.map(_subprocess)


def process_gsm8k(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 6):
            question = doc[f"gsm8k_prompt_shot_{shot}"]
            doc.pop(f"gsm8k_prompt_shot_{shot}")
            answer = doc[f"gsm8k_answer_shot_{shot}"]
            doc.pop(f"gsm8k_answer_shot_{shot}")
            doc.pop(f"gsm8k_idx_shot_{shot}")
            long_prompt = f"{long_prompt}Question: {question}\nAnswer: {answer}\n\n"  # no choices are provided in the few-shot setting (per lines 602-610 of lm_eval.api.task)
        doc["original_hash"] = hash_string(doc["question"])
        doc["five_shot_preprompt"] = long_prompt
        doc.pop("allfiveshot_longprompt")
        return doc

    return dataset.map(_subprocess)
def process_hellaswag(dataset: datasets.Dataset) -> datasets.Dataset:
    def process_txt(text):  # mirrored from hellaswag task
        text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")  # collapse double spaces
        return text

    def _preprocess(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        doc.pop("ctx_a")
        doc.pop("ctx_b")
        doc.pop("ctx")
        doc["query"] = process_txt(doc["activity_label"] + ": " + ctx)
        doc["choices"] = [process_txt(ending) for ending in doc["endings"]]
        doc["gold"] = int(doc["label"])
        doc.pop("activity_label")
        doc.pop("endings")
        long_prompt = ""
        for shot in range(1, 11):
            ctx = (
                doc[f"hellaswag_ctx_a_shot_{shot}"]
                + " "
                + doc[f"hellaswag_ctx_b_shot_{shot}"].capitalize()
            )
            doc.pop(f"hellaswag_ctx_a_shot_{shot}")
            doc.pop(f"hellaswag_ctx_b_shot_{shot}")
            doc.pop(f"hellaswag_ctx_shot_{shot}")
            question = process_txt(
                doc[f"hellaswag_activity_labels_shot_{shot}"] + ": " + ctx
            )
            ending = process_txt(
                doc[f"hellaswag_endings_shot_{shot}"][
                    int(doc[f"hellaswag_label_shot_{shot}"])
                ]
            )
            doc.pop(f"hellaswag_activity_labels_shot_{shot}")
            doc.pop(f"hellaswag_endings_shot_{shot}")
            doc.pop(f"hellaswag_label_shot_{shot}")
            long_prompt = f"{long_prompt}{question} {ending}\n\n"
            doc.pop(f"hellaswag_ind_shot_{shot}")
            doc.pop(f"hellaswag_source_id_shot_{shot}")
            doc.pop(f"hellaswag_split_shot_{shot}")
            doc.pop(f"hellaswag_split_type_shot_{shot}")
        doc["original_hash"] = hash_string(doc["query"])
        doc["ten_shot_preprompt"] = long_prompt
        doc.pop("alltenshot_longprompt")
        return doc

    return dataset.map(_preprocess)
def process_mmlu(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        choices = ["A", "B", "C", "D"]
        long_prompt = f"The following are multiple choice questions (with answers) about {' '.join(doc['subject'].split('_'))}.\n\n"
        for shot in range(1, 6):
            question = doc[f"mmlu_question_shot_{shot}"].strip()
            doc.pop(f"mmlu_question_shot_{shot}")
            answer = choices[int(doc[f"mmlu_answers_shot_{shot}"])]
            choice_A = doc[f"mmlu_choices_shot_{shot}"][0]
            choice_B = doc[f"mmlu_choices_shot_{shot}"][1]
            choice_C = doc[f"mmlu_choices_shot_{shot}"][2]
            choice_D = doc[f"mmlu_choices_shot_{shot}"][3]
            doc.pop(f"mmlu_choices_shot_{shot}")
            doc.pop(f"mmlu_answers_shot_{shot}")
            doc.pop(f"mmlu_ind_shot_{shot}")
            long_prompt = f"{long_prompt}{question}\nA. {choice_A}\nB. {choice_B}\nC. {choice_C}\nD. {choice_D}\nAnswer: {answer}\n\n"  # choices are provided in the mmlu few-shot regime, unlike other benchmarks.
        doc["original_hash"] = hash_string(doc["question"])
        doc["five_shot_preprompt"] = long_prompt
        doc.pop("allfiveshot_longprompt")
        return doc

    return dataset.map(_subprocess)


def process_truthfulqa(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        doc["original_hash"] = hash_string(doc["question"])
        return doc

    return dataset.map(_subprocess)
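# Example of the round trip `original_hash` enables (illustrative, not part of
# the original file): eval outputs can be joined back to the source benchmark
# by re-hashing the untouched question text, e.g.
#   ds = process_truthfulqa(datasets.load_dataset("HCAI/metabench", "TruthfulQA", split="primary"))
#   assert ds[0]["original_hash"] == hash_string(ds[0]["question"])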
def process_winogrande(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 6):
            if doc[f"winogrande_answer_shot_{shot}"] == "1":
                answer = doc[f"winogrande_option1_shot_{shot}"]
            elif doc[f"winogrande_answer_shot_{shot}"] == "2":
                answer = doc[f"winogrande_option2_shot_{shot}"]
            else:
                raise ValueError("Answer not recognised.")
            question = doc[f"winogrande_prompt_shot_{shot}"].replace("_", answer)
            doc.pop(f"winogrande_prompt_shot_{shot}")
            doc.pop(f"winogrande_answer_shot_{shot}")
            doc.pop(f"winogrande_idx_shot_{shot}")
            doc.pop(f"winogrande_option1_shot_{shot}")
            doc.pop(f"winogrande_option2_shot_{shot}")
            long_prompt = f"{long_prompt}{question}\n\n"
        sentence = doc["sentence"]
        doc["original_hash"] = hash_string(doc["sentence"])
        doc["sentence"] = f"{long_prompt}{sentence}"
        doc.pop("allfiveshot_longprompt")
        return doc

    return dataset.map(_subprocess)


def winogrande_doc_to_text(doc):  # Mirrored from the winogrande task
    answer_to_num = {"1": 0, "2": 1}
    return answer_to_num[doc["answer"]]


def winogrande_doc_to_target(doc):  # Mirrored from the winogrande task
    idx = doc["sentence"].index("_") + 1
    return doc["sentence"][idx:].strip()


def winogrande_doc_to_choice(doc):  # Mirrored from the winogrande task
    idx = doc["sentence"].index("_")
    options = [doc["option1"], doc["option2"]]
    return [doc["sentence"][:idx] + opt for opt in options]
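Because `winogrande_doc_to_text` returns an index rather than a prompt string, the harness treats these tasks as multiple-input: each option is spliced into the sentence and scored as its own context. A toy illustration of the three helpers (the doc below is made up):

# Hypothetical doc in the Winogrande schema consumed by the helpers above.
toy = {
    "sentence": "The trophy doesn't fit into the suitcase because _ is too large.",
    "option1": "the trophy",
    "option2": "the suitcase",
    "answer": "1",
}
print(winogrande_doc_to_text(toy))    # 0, the index of the gold option
print(winogrande_doc_to_choice(toy))  # two full contexts, one per option
print(winogrande_doc_to_target(toy))  # "is too large.", the shared continuation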
# process_docs_permute.py
import hashlib
import random
import re

import datasets


def hash_string(string: str) -> str:
    return hashlib.sha256(string.encode("utf-8")).hexdigest()


def process_arc(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 26):
            question = doc[f"arc_question_shot_{shot}"]
            doc.pop(f"arc_question_shot_{shot}")
            answer_lab = doc[f"arc_answerKey_shot_{shot}"]
            doc.pop(f"arc_answerKey_shot_{shot}")
            answer_idx = doc[f"arc_choices_shot_{shot}"]["label"].index(answer_lab)
            answer = doc[f"arc_choices_shot_{shot}"]["text"][answer_idx]
            doc.pop(f"arc_choices_shot_{shot}")
            doc.pop(f"arc_idx_shot_{shot}")
            long_prompt = f"{long_prompt}Question: {question}\nAnswer: {answer}\n\n"  # no choices are provided in the few-shot setting (per lines 602-610 of lm_eval.api.task)
        doc["twentyfive_shot_preprompt"] = long_prompt
        doc.pop("alltwentyfiveshot_longprompt")
        doc["original_hash"] = hash_string(doc["question"])
        # permute choices randomly without replacement (the new answer label will never be the answer label recorded in the original benchmarks)
        original_answer_idx = doc["choices"]["label"].index(doc["answerKey"])
        correct_answer_text = doc["choices"]["text"][original_answer_idx]
        new_answer_idx = original_answer_idx
        while new_answer_idx == original_answer_idx:  # value comparison, not `is` identity
            random.shuffle(doc["choices"]["text"])
            new_answer_idx = doc["choices"]["text"].index(correct_answer_text)
        doc["answerKey"] = doc["choices"]["label"][new_answer_idx]
        return doc

    return dataset.map(_subprocess)
def process_hellaswag(dataset: datasets.Dataset) -> datasets.Dataset:
    def process_txt(text):  # mirrored from hellaswag task
        text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")  # collapse double spaces
        return text

    def _preprocess(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        doc.pop("ctx_a")
        doc.pop("ctx_b")
        doc.pop("ctx")
        doc["query"] = process_txt(doc["activity_label"] + ": " + ctx)
        # permute choices randomly without replacement (the new answer label will never be the answer label recorded in the original benchmarks)
        original_answer_idx = int(doc["label"])
        correct_answer_text = doc["endings"][original_answer_idx]
        new_answer_idx = original_answer_idx
        while new_answer_idx == original_answer_idx:  # value comparison, not `is` identity
            random.shuffle(doc["endings"])
            new_answer_idx = doc["endings"].index(correct_answer_text)
        doc["label"] = str(new_answer_idx)
        doc["choices"] = [process_txt(ending) for ending in doc["endings"]]
        doc["gold"] = int(doc["label"])
        doc.pop("activity_label")
        doc.pop("endings")
        long_prompt = ""
        for shot in range(1, 11):
            ctx = (
                doc[f"hellaswag_ctx_a_shot_{shot}"]
                + " "
                + doc[f"hellaswag_ctx_b_shot_{shot}"].capitalize()
            )
            doc.pop(f"hellaswag_ctx_a_shot_{shot}")
            doc.pop(f"hellaswag_ctx_b_shot_{shot}")
            doc.pop(f"hellaswag_ctx_shot_{shot}")
            question = process_txt(
                doc[f"hellaswag_activity_labels_shot_{shot}"] + ": " + ctx
            )
            ending = process_txt(
                doc[f"hellaswag_endings_shot_{shot}"][
                    int(doc[f"hellaswag_label_shot_{shot}"])
                ]
            )
            doc.pop(f"hellaswag_activity_labels_shot_{shot}")
            doc.pop(f"hellaswag_endings_shot_{shot}")
            doc.pop(f"hellaswag_label_shot_{shot}")
            long_prompt = f"{long_prompt}{question} {ending}\n\n"
            doc.pop(f"hellaswag_ind_shot_{shot}")
            doc.pop(f"hellaswag_source_id_shot_{shot}")
            doc.pop(f"hellaswag_split_shot_{shot}")
            doc.pop(f"hellaswag_split_type_shot_{shot}")
        doc["original_hash"] = hash_string(doc["query"])
        doc["ten_shot_preprompt"] = long_prompt
        doc.pop("alltenshot_longprompt")
        return doc

    return dataset.map(_preprocess)
def process_mmlu(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        choices = ["A", "B", "C", "D"]
        long_prompt = f"The following are multiple choice questions (with answers) about {' '.join(doc['subject'].split('_'))}.\n\n"
        for shot in range(1, 6):
            question = doc[f"mmlu_question_shot_{shot}"].strip()
            doc.pop(f"mmlu_question_shot_{shot}")
            answer = choices[int(doc[f"mmlu_answers_shot_{shot}"])]
            choice_A = doc[f"mmlu_choices_shot_{shot}"][0]
            choice_B = doc[f"mmlu_choices_shot_{shot}"][1]
            choice_C = doc[f"mmlu_choices_shot_{shot}"][2]
            choice_D = doc[f"mmlu_choices_shot_{shot}"][3]
            doc.pop(f"mmlu_choices_shot_{shot}")
            doc.pop(f"mmlu_answers_shot_{shot}")
            doc.pop(f"mmlu_ind_shot_{shot}")
            long_prompt = f"{long_prompt}{question}\nA. {choice_A}\nB. {choice_B}\nC. {choice_C}\nD. {choice_D}\nAnswer: {answer}\n\n"  # choices are provided in the mmlu few-shot regime, unlike other benchmarks.
        doc["original_hash"] = hash_string(doc["question"])
        doc["five_shot_preprompt"] = long_prompt
        doc.pop("allfiveshot_longprompt")
        # permute choices randomly without replacement (the new answer label will never be the answer label recorded in the original benchmarks)
        original_answer_idx = int(doc["answer"])
        correct_answer_text = doc["choices"][original_answer_idx]
        new_answer_idx = original_answer_idx
        while new_answer_idx == original_answer_idx:  # value comparison, not `is` identity
            random.shuffle(doc["choices"])
            new_answer_idx = doc["choices"].index(correct_answer_text)
        doc["answer"] = new_answer_idx
        return doc

    return dataset.map(_subprocess)
def process_truthfulqa(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        # currently only permuting the mc1 targets as metabench does not use mc2 targets.
        original_answer_idx = 0  # always 0 in truthfulqa
        correct_answer_text = doc["mc1_targets"]["choices"][original_answer_idx]
        new_answer_idx = original_answer_idx
        while new_answer_idx == original_answer_idx:  # value comparison, not `is` identity
            random.shuffle(doc["mc1_targets"]["choices"])
            new_answer_idx = doc["mc1_targets"]["choices"].index(correct_answer_text)
        labels = [0] * len(doc["mc1_targets"]["labels"])
        labels[new_answer_idx] = 1
        doc["original_hash"] = hash_string(doc["question"])
        doc["mc1_targets"]["labels"] = labels
        doc["answer"] = new_answer_idx
        return doc

    return dataset.map(_subprocess)
def process_winogrande(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 6):
            if doc[f"winogrande_answer_shot_{shot}"] == "1":
                answer = doc[f"winogrande_option1_shot_{shot}"]
            elif doc[f"winogrande_answer_shot_{shot}"] == "2":
                answer = doc[f"winogrande_option2_shot_{shot}"]
            else:
                raise ValueError("Answer not recognised.")
            question = doc[f"winogrande_prompt_shot_{shot}"].replace("_", answer)
            doc.pop(f"winogrande_prompt_shot_{shot}")
            doc.pop(f"winogrande_answer_shot_{shot}")
            doc.pop(f"winogrande_idx_shot_{shot}")
            doc.pop(f"winogrande_option1_shot_{shot}")
            doc.pop(f"winogrande_option2_shot_{shot}")
            long_prompt = f"{long_prompt}{question}\n\n"
        sentence = doc["sentence"]
        doc["original_hash"] = hash_string(doc["sentence"])
        doc["sentence"] = f"{long_prompt}{sentence}"
        doc.pop("allfiveshot_longprompt")
        # permute choices by swapping them
        option1 = doc["option1"]
        option2 = doc["option2"]
        answer = doc["answer"]
        doc["option1"] = option2
        doc["option2"] = option1
        if answer == "1":
            doc["answer"] = "2"
        elif answer == "2":
            doc["answer"] = "1"
        return doc

    return dataset.map(_subprocess)


def winogrande_doc_to_text(doc):  # Mirrored from the winogrande task
    answer_to_num = {"1": 0, "2": 1}
    return answer_to_num[doc["answer"]]


def winogrande_doc_to_target(doc):  # Mirrored from the winogrande task
    idx = doc["sentence"].index("_") + 1
    return doc["sentence"][idx:].strip()


def winogrande_doc_to_choice(doc):  # Mirrored from the winogrande task
    idx = doc["sentence"].index("_")
    options = [doc["option1"], doc["option2"]]
    return [doc["sentence"][:idx] + opt for opt in options]
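Since the permuted variants only reorder answers, two invariants are worth spot-checking: the gold answer text survives the shuffle, and its index always moves. A minimal sketch against the TruthfulQA permuter above (loading the split downloads the public HCAI/metabench data):

from itertools import islice

import datasets

ds = datasets.load_dataset("HCAI/metabench", "TruthfulQA", split="primary")
permuted = process_truthfulqa(ds)  # the permuting version defined above
for original, shuffled in islice(zip(ds, permuted), 10):
    gold_text = original["mc1_targets"]["choices"][0]  # index 0 is always gold in TruthfulQA
    new_idx = shuffled["answer"]
    assert shuffled["mc1_targets"]["choices"][new_idx] == gold_text  # answer text preserved
    assert new_idx != 0  # position guaranteed to move
    assert shuffled["mc1_targets"]["labels"][new_idx] == 1

The test-suite diff below makes the generic task tests aware of such multiple-input tasks.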
@@ -15,7 +15,7 @@ datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 task_manager = tasks.TaskManager()
 
 # Default Task
-TASKS = ["arc_easy"]
+TASKS = ["metabench_winogrande"]
 
 def task_class():
@@ -79,10 +79,13 @@ class TestNewTasks:
         )
         _array = [task.doc_to_text(doc) for doc in arr]
         # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-        assert all(
-            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
-            for x in _array
-        )
+        if not task.multiple_input:
+            assert all(
+                isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
+                for x in _array
+            )
+        else:
+            pass
 
     def test_create_choices(self, task_class, limit):
         task = task_class
@@ -123,5 +126,11 @@
             if task.has_test_docs()
             else list(islice(task.validation_docs(), limit))
         )
-        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        # ctx is "" for multiple input tasks
+        requests = [
+            task.construct_requests(
+                doc=doc, ctx="" if task.multiple_input else task.doc_to_text(doc)
+            )
+            for doc in arr
+        ]
         assert len(requests) == limit if limit else True
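To reproduce the multiple-input behaviour the updated tests guard against, the new task can also be loaded directly; a sketch assuming the harness's task-loading API:

from lm_eval import tasks

# Load the new task through the TaskManager and confirm it is multiple-input.
task_manager = tasks.TaskManager()
task = tasks.get_task_dict(["metabench_winogrande"], task_manager)["metabench_winogrande"]
doc = next(iter(task.test_docs()))
assert task.multiple_input  # doc_to_text returns an index, not a prompt
print(task.doc_to_choice(doc))  # the two filled-in sentences that get scored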