Commit 0ec1b553 authored by jon-tow

Configure more tasks for mc experiment

parent 8458afa8
@@ -12,23 +12,25 @@ import numpy as np
 import lm_eval.base as base
 from lm_eval.metrics import mean


 @dataclass
 class MultipleChoiceDoc:
     question: str
-    # The possible answer keys, e.g. `["A", "B", "C", "D"]`.
-    # These should be the type as gold?
-    keys: typing.List[str]
+    keys: typing.List[str]  # Should these be the same type as gold?
     options: typing.List[str]
     gold: int
     id: int = field(init=False)
+    context: str = None  # Any extra context prior to the question.

     def __post_init__(self):
         self.id = hashlib.sha224(self.question.encode('utf-8')).hexdigest()


 class BaseMultipleChoiceTask(base.Task, abc.ABC):
     def doc_to_text(self, doc: MultipleChoiceDoc):
-        return self.format_prompt(doc)
+        ctx = f"{doc.context}\n" if doc.context else ""
+        return ctx + self.format_prompt(doc)

     @abc.abstractclassmethod
     def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
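
For orientation, a minimal sketch of how the new `context` field flows through `doc_to_text` (the values are made up, the snippet assumes the module's definitions are in scope, and `format_prompt` comes from whichever subclass is in use):

    doc = MultipleChoiceDoc(
        question="Which gas do plants absorb from the air?",
        keys=["A", "B", "C", "D"],
        options=["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
        gold=1,
        context="Photosynthesis converts light into chemical energy.",
    )
    # With `context` set, doc_to_text prepends it plus a newline:
    #     Photosynthesis converts light into chemical energy.
    #     <subclass prompt, e.g. "Question: ...\nAnswer:">
    # With context=None (the default), the prompt is unchanged.
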
@@ -45,6 +47,7 @@ class BaseMultipleChoiceTask(base.Task, abc.ABC):
     def construct_requests(self, doc: MultipleChoiceDoc, ctx: str):
         lls = []
         conts = self.loglikelihood_continuation(doc)
+        #print(f"\n\n{conts}\n\n")
         for cont in conts:
             lls.append(base.rf.loglikelihood(ctx, f" {cont}")[0])
         return lls
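
Each candidate continuation contributes one log-likelihood request over `" " + cont`; `process_results` (next hunk) then picks the answer by argmax over the returned scores. A schematic of that scoring step with illustrative numbers, assuming the harness hands back one float per request:

    import numpy as np

    lls = [-4.2, -1.3, -3.8, -5.0]   # made-up per-option log-likelihoods
    ans = int(np.argmax(lls))        # index of the most likely continuation
    is_correct = ans == doc.gold     # compared against the gold index
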
@@ -60,8 +63,10 @@ class BaseMultipleChoiceTask(base.Task, abc.ABC):
         return {
             "acc": is_correct,
             "acc_norm": acc_norm,
-            # Bundle answers: (id, answer key, answer index, is correct).
-            "answer_bundle": (doc.id, doc.keys[ans], ans, is_correct),
+            # Bundle answers: (model_answer, model_answer_index, is_correct, question_id).
+            "answer_bundle": (doc.keys[ans], ans, is_correct, doc.id),
+            # Bundle questions: (question_id, question, option_0, option_1, option_2, option_3)
+            #"question_bundle": (doc.id, doc.question, len(doc.options)),
         }

     def higher_is_better(self):
@@ -69,6 +74,7 @@ class BaseMultipleChoiceTask(base.Task, abc.ABC):
             "acc": True,
             "acc_norm": True,
             "answer_bundle": True,
+            #"question_bundle": True,
         }

     def aggregation(self):
@@ -76,16 +82,34 @@ class BaseMultipleChoiceTask(base.Task, abc.ABC):
             "acc": mean,
             "acc_norm": mean,
             "answer_bundle": answer_bundle
+            #"question_bundle": question_bundle,
         }


 def answer_bundle(items):
     """ Bundles answers into a csv file. """
     from pathlib import Path
     import csv
-    cols = ["question_id", "model_answer", "model_answer_index", "is_correct"]
+    cols = ["model_answer", "model_answer_index", "is_correct", "question_id"]
+    rows = [*items]
+    path = os.environ["QUESTION_RESULT_PATH"]
+    with open(f'{path}/question-by-question-results.csv', 'a') as f:
+        write = csv.writer(f)
+        write.writerow(cols)
+        write.writerows(rows)
+    return 0
+
+
+def question_bundle(items):
+    """ Bundles questions into a csv file. """
+    from pathlib import Path
+    import csv
+    num_options = items[0][2]
+    options = [f"option_{i}" for i in range(num_options)]
+    cols = ["question_id", "question", *options]
     rows = [*items]
     path = os.environ["QUESTION_RESULT_PATH"]
-    with open(f'{path}/question-by-question-results.csv', 'w') as f:
+    with open(f'{path}/question-table.csv', 'a') as f:
         write = csv.writer(f)
         write.writerow(cols)
         write.writerows(rows)
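
Both bundlers append to CSV files under the directory named by the `QUESTION_RESULT_PATH` environment variable, so that variable must be set before aggregation runs; since the files are opened in append mode, the header row is re-written on every aggregation call. A hedged example of driving `answer_bundle` directly (path and rows are illustrative; the tuples follow the `(model_answer, model_answer_index, is_correct, question_id)` layout emitted by `process_results`):

    import os
    os.makedirs("/tmp/mc-results", exist_ok=True)   # illustrative output directory
    os.environ["QUESTION_RESULT_PATH"] = "/tmp/mc-results"

    answer_bundle([
        ("B", 1, True, "abc123"),    # made-up result rows
        ("D", 3, False, "def456"),
    ])
    # -> appends a header plus two rows to /tmp/mc-results/question-by-question-results.csv
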
@@ -95,6 +119,11 @@ def answer_bundle(items):
 def key2num(doc: MultipleChoiceDoc, key: str) -> int:
     return str(doc.keys.index(key) + 1)  # `+ 1` for 1-based indexing.

+def key2letter(doc: MultipleChoiceDoc, key: str) -> str:
+    A_ascii = 65
+    ascii_offset = doc.keys.index(key)
+    letter = chr(A_ascii + ascii_offset)
+    return letter

 def format_key(key: str, type: str):
     """ Formats a multiple choice key. E.g.
@@ -115,8 +144,9 @@ def format_key(key: str, type: str):
 class MC_NoOptionList_OptionLL_Task(BaseMultipleChoiceTask):
-    """
+    """ "freeform"
     Format:
+        <Context>
         Question: <question>
         Answer:
     Continuation:
@@ -126,7 +156,6 @@ class MC_NoOptionList_OptionLL_Task(BaseMultipleChoiceTask):
         prompt = "Question: " + doc.question + "\n"
         prompt += "Answer:"
         return prompt
-        # return _format_prompt(doc, list_options=False)

     def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
         return " " + doc.options[doc.gold]
@@ -136,8 +165,9 @@ class MC_NoOptionList_OptionLL_Task(BaseMultipleChoiceTask):
 class MC_WithOptionList_OptionLL_Task(BaseMultipleChoiceTask):
-    """
+    """ "option"
     Format:
+        <Context>
         Question: <question>
         <key1>: <option1>
         <key2>: <option2>
@@ -163,11 +193,12 @@ class MC_WithOptionList_OptionLL_Task(BaseMultipleChoiceTask):
 class MC_WithOptionList_LetterLL_Task(BaseMultipleChoiceTask):
-    """
+    """ "letter"
     Format:
+        <Context>
         Question: <question>
-        <key1>: <option1>
-        <key2>: <option2>
+        A: <option1>
+        B: <option2>
         ...
         Answer:
     Continuation:
@@ -176,7 +207,7 @@ class MC_WithOptionList_LetterLL_Task(BaseMultipleChoiceTask):
     def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
         prompt = "Question: " + doc.question + "\n"
         prompt += "\n".join([
-            f"{format_key(doc.keys[i], 'colon')} {option}"
+            f"{format_key(key2letter(doc, doc.keys[i]), 'colon')} {option}"
             for i, option in enumerate(doc.options)
         ])
         prompt += "\nAnswer:"
@@ -190,8 +221,9 @@ class MC_WithOptionList_LetterLL_Task(BaseMultipleChoiceTask):
 class MC_WithOptionList_NumLL_Task(BaseMultipleChoiceTask):
-    """
+    """ "number"
     Format:
+        <Context>
         Question: <question>
         1: <option1>
         2: <option2>
@@ -226,4 +258,4 @@ elif os.environ["MC_SETTING"] == "letter":
 elif os.environ["MC_SETTING"] == "number":
     MULTIPLE_CHOICE_TASK = MC_WithOptionList_NumLL_Task
 else:
-    print("No such MC_SETTING:", os.environ["MC_SETTING"])
\ No newline at end of file
+    print("No such MC_SETTING:", os.environ["MC_SETTING"])
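
Since `MULTIPLE_CHOICE_TASK` is bound at import time from the `MC_SETTING` environment variable, the variable has to be exported before the module is imported. A minimal sketch (the docstring tags above suggest "freeform" and "option" for the two truncated branches; the module path is inferred from the imports below):

    import os
    os.environ["MC_SETTING"] = "letter"            # or "freeform", "option", "number"
    os.environ["QUESTION_RESULT_PATH"] = "/tmp"    # illustrative; used by the CSV bundlers

    from lm_eval.mctask_experimental import MULTIPLE_CHOICE_TASK
    # MULTIPLE_CHOICE_TASK is now MC_WithOptionList_LetterLL_Task
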
 from lm_eval.base import MultipleChoiceTask
 from . common import HFTask
+from lm_eval.mctask_experimental import MultipleChoiceDoc


 class ARCEasy(HFTask, MultipleChoiceTask):
@@ -17,21 +18,22 @@ class ARCEasy(HFTask, MultipleChoiceTask):
         return True

     def _convert_standard(self, doc):
+        question = doc["question"]
+        keys = ["A", "B", "C", "D", "E"]
+        options = doc["choices"]["text"]
+        while len(options) < len(keys):
+            options.append("")
         # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
         # of {'1', '2', '3', '4', '5'}. We map them back to letters.
         num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
         doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
-        out_doc = {
-            "id": doc["id"],
-            "query": "Question: " + doc["question"] + "\nAnswer:",
-            "choices": doc["choices"]["text"],
-            "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
-        }
-        return out_doc
-
-    def doc_to_text(self, doc):
-        return doc["query"]
+        gold = ["A", "B", "C", "D", "E"].index(doc["answerKey"])
+        return MultipleChoiceDoc(
+            question=question,
+            options=options,
+            gold=gold,
+            keys=keys,
+        )

 class ARCChallenge(ARCEasy):
     DATASET_PATH = "ai2_arc"
...
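
The padding loop above keeps `options` aligned with the fixed five keys for ARC items that ship fewer choices; a quick illustration with a hypothetical four-option item:

    keys = ["A", "B", "C", "D", "E"]
    options = ["dry palms", "wet palms", "palms covered with oil", "palms covered with lotion"]
    while len(options) < len(keys):
        options.append("")
    # options now has five entries; the last is an empty-string placeholder
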
-from ast import Mult
 import re
 from lm_eval.base import MultipleChoiceTask
 from . common import HFTask
+from lm_eval.mctask_experimental import MultipleChoiceDoc


 class HellaSwag(HFTask, MultipleChoiceTask):
@@ -27,13 +29,15 @@ class HellaSwag(HFTask, MultipleChoiceTask):
         return text

     def _convert_standard(self, doc):
-        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
-        out_doc = {
-            "query": self.preprocess(doc['activity_label'] + ': ' + ctx),
-            "choices": [self.preprocess(ending) for ending in doc['endings']],
-            "gold": int(doc['label']),
-        }
-        return out_doc
-
-    def doc_to_text(self, doc):
-        return doc["query"]
+        question = self.preprocess(doc["ctx_a"] + " " + doc["ctx_b"].capitalize())
+        options = [self.preprocess(ending) for ending in doc['endings']]
+        gold = int(doc["label"])
+        keys = ["A", "B", "C", "D"]
+        context = self.preprocess(doc['activity_label'])
+        return MultipleChoiceDoc(
+            question=question,
+            options=options,
+            gold=gold,
+            keys=keys,
+            context=context
+        )
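
With this conversion the activity label moves out of the fused query string into the structured `context` field, so for the option-list settings the rendered prompt takes roughly this shape (schematic, not verbatim output):

    <activity_label>
    Question: <ctx_a> <Ctx_b>
    A: <ending 0>
    ...
    Answer:
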
@@ -2,6 +2,7 @@ import numpy as np
 from lm_eval.base import MultipleChoiceTask, rf
 from ..metrics import mean
 from . common import HFTask
+from lm_eval.mctask_experimental import MultipleChoiceDoc


 class PiQA(HFTask, MultipleChoiceTask):
@@ -19,12 +20,15 @@ class PiQA(HFTask, MultipleChoiceTask):
         return False

     def _convert_standard(self, doc):
-        out_doc = {
-            "goal": doc["goal"],
-            "choices": [doc["sol1"], doc["sol2"]],
-            "gold": doc["label"],
-        }
-        return out_doc
-
-    def doc_to_text(self, doc):
-        return "Question: " + doc["goal"] + "\nAnswer:"
+        keys = ['0', '1']
+        question = doc["goal"]
+        options = [doc["sol1"], doc["sol2"]]
+        gold = doc["label"]
+        return MultipleChoiceDoc(
+            question=question,
+            keys=keys,
+            options=options,
+            gold=gold
+        )
@@ -20,6 +20,7 @@ have been trained on data not specifically collected to succeed on PROST."
 from lm_eval.base import MultipleChoiceTask
 from . common import HFTask
+from lm_eval.mctask_experimental import MultipleChoiceDoc


 class PROST(HFTask, MultipleChoiceTask):
@@ -46,12 +47,15 @@ class PROST(HFTask, MultipleChoiceTask):
         )

     def _convert_standard(self, doc):
-        out_doc = {
-            "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:",
-            "choices": [doc['A'], doc['B'], doc['C'], doc['D']],
-            "gold": doc['label'],
-        }
-        return out_doc
-
-    def doc_to_text(self, doc):
-        return doc["query"]
+        question = doc['ex_question']
+        options = [doc['A'], doc['B'], doc['C'], doc['D']]
+        gold = doc["label"]
+        keys = ["A","B","C","D"]
+        context = doc['context']
+        return MultipleChoiceDoc(
+            question=question,
+            options=options,
+            gold=gold,
+            keys=keys,
+            context=context
+        )
@@ -3,6 +3,7 @@ import json
 import zipfile
 from lm_eval.base import MultipleChoiceTask
 from best_download import download_file
+from lm_eval.mctask_experimental import MultipleChoiceDoc


 class SciQ(MultipleChoiceTask):
@@ -29,20 +30,23 @@ class SciQ(MultipleChoiceTask):
         return True

     def _convert_standard(self, doc):
-        choices = [
+        question = doc["question"]
+        keys = ["A", "B", "C", "D"]
+        options = [
             doc["distractor1"],
             doc["distractor2"],
             doc["distractor3"],
             doc["correct_answer"],
         ]
-        src = doc['support']
-        out_doc = {
-            "source" : src,
-            "query" : doc['question'],
-            "choices" : choices,
-            "gold" : 3,
-        }
-        return out_doc
+        context = doc['support']
+        gold = 3
+        return MultipleChoiceDoc(
+            question=question,
+            options=options,
+            gold=gold,
+            keys=keys,
+            context=context
+        )

     def load_docs(self, textfilename):
         with open(textfilename, 'r') as j:
@@ -58,6 +62,3 @@ class SciQ(MultipleChoiceTask):
     def test_docs(self):
         return self.load_docs("data/sciq/SciQ dataset-2 3/test.json")
-
-    def doc_to_text(self, doc):
-        return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip()
@@ -21,6 +21,7 @@ setuptools.setup(
     python_requires='>=3.6',
     install_requires=[
         "black",
+        "dask",
         "best_download==0.0.9",
         "datasets==1.15.1",
         "click>=7.1",
...