Commit 48c6bd65 authored by Oleh Shliazhko
Browse files

fix mmlu task, set updated dataset name and make the prompt identical to the original eval code

parent d1451679
...@@ -14,7 +14,6 @@ Homepage: https://github.com/hendrycks/test ...@@ -14,7 +14,6 @@ Homepage: https://github.com/hendrycks/test
""" """
from lm_eval.base import MultipleChoiceTask from lm_eval.base import MultipleChoiceTask
_CITATION = """ _CITATION = """
@article{hendryckstest2021, @article{hendryckstest2021,
title={Measuring Massive Multitask Language Understanding}, title={Measuring Massive Multitask Language Understanding},
...@@ -104,7 +103,7 @@ def create_task(subject): ...@@ -104,7 +103,7 @@ def create_task(subject):
class GeneralHendrycksTest(MultipleChoiceTask): class GeneralHendrycksTest(MultipleChoiceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "hendrycks_test" DATASET_PATH = "cais/mmlu"
DATASET_NAME = None DATASET_NAME = None
def __init__(self, subject): def __init__(self, subject):
...@@ -112,7 +111,7 @@ class GeneralHendrycksTest(MultipleChoiceTask): ...@@ -112,7 +111,7 @@ class GeneralHendrycksTest(MultipleChoiceTask):
super().__init__() super().__init__()
def has_training_docs(self): def has_training_docs(self):
return False return True
def has_validation_docs(self): def has_validation_docs(self):
return True return True
...@@ -126,41 +125,45 @@ class GeneralHendrycksTest(MultipleChoiceTask): ...@@ -126,41 +125,45 @@ class GeneralHendrycksTest(MultipleChoiceTask):
def test_docs(self): def test_docs(self):
return map(self._process_doc, self.dataset["test"]) return map(self._process_doc, self.dataset["test"])
def fewshot_context(self, doc, num_fewshot, **kwargs):
subject = self.DATASET_NAME
description = f"The following are multiple choice questions (with answers) about {subject}."
kwargs["description"] = description
return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
def _process_doc(self, doc): def _process_doc(self, doc):
def format_example(doc, keys): def format_example(doc, keys):
""" """
Question: <prompt> <prompt>
Choices:
A. <choice1> A. <choice1>
B. <choice2> B. <choice2>
C. <choice3> C. <choice3>
D. <choice4> D. <choice4>
Answer: Answer:
""" """
prompt = "Question: " + doc["question"] + "\nChoices:\n"
prompt += "".join( question = doc["question"]
choices = "".join(
[f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])] [f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])]
) )
prompt += "Answer:" prompt = f"{question}\n{choices}Answer:"
return prompt return prompt
keys = ["A", "B", "C", "D"] keys = ["A", "B", "C", "D"]
return { return {
"query": format_example(doc, keys), "query": format_example(doc, keys),
"choices": doc["choices"], "choices": keys,
"gold": keys.index(doc["answer"]) "gold": doc["answer"],
if isinstance(doc["answer"], str)
else doc["answer"],
} }
return result
def fewshot_examples(self, k, rnd): def fewshot_examples(self, k, rnd):
# fewshot_examples is not just sampling from train_docs because dev is # fewshot_examples is not just sampling from train_docs because dev is
# in the same distribution as val/test but auxiliary_train isn't # in the same distribution as val/test but auxiliary_train isn't
if self._fewshot_docs is None: if self._fewshot_docs is None:
self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"])) self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"]))
return rnd.sample(list(self._fewshot_docs), k) return self._fewshot_docs[:k] # rnd.sample(list(self._fewshot_docs), k)
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment