Commit dae7b868 authored by Quentin Gregory Anthony's avatar Quentin Gregory Anthony
Browse files

Added decontamination to remaining evals

parent 341663a9
...@@ -40,6 +40,12 @@ class ANLIBase(HFTask): ...@@ -40,6 +40,12 @@ class ANLIBase(HFTask):
# want to do it exactly as OA did? # want to do it exactly as OA did?
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:' return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["premise"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
# True = entailment # True = entailment
# False = contradiction # False = contradiction
......
...@@ -32,6 +32,12 @@ class ARCEasy(HFTask, MultipleChoiceTask): ...@@ -32,6 +32,12 @@ class ARCEasy(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
class ARCChallenge(ARCEasy): class ARCChallenge(ARCEasy):
DATASET_PATH = "ai2_arc" DATASET_PATH = "ai2_arc"
......
...@@ -55,6 +55,12 @@ class Arithmetic(Task): ...@@ -55,6 +55,12 @@ class Arithmetic(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc.context return doc.context
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc.context
def doc_to_target(self, doc): def doc_to_target(self, doc):
return doc.completion return doc.completion
......
...@@ -93,6 +93,12 @@ class Asdiv(Task): ...@@ -93,6 +93,12 @@ class Asdiv(Task):
# TODO: add solution-type # TODO: add solution-type
return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:' return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc['body'] + " " + doc['question']
def doc_to_target(self, doc): def doc_to_target(self, doc):
# TODO: add formula # TODO: add formula
......
...@@ -47,6 +47,12 @@ class BlimpTask(HFTask): ...@@ -47,6 +47,12 @@ class BlimpTask(HFTask):
# this method is invoked by tests only # this method is invoked by tests only
return "" return ""
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["sentence_good"] + " " + doc["sentence_bad"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
# this method is invoked by tests only # this method is invoked by tests only
return "" return ""
......
...@@ -38,6 +38,13 @@ class CBTBase(HFTask): ...@@ -38,6 +38,13 @@ class CBTBase(HFTask):
text = "Passage: " + passage + "\nQuestion: " + doc["question"] text = "Passage: " + passage + "\nQuestion: " + doc["question"]
return self.detokenize(text) return self.detokenize(text)
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
passage = " ".join(doc["sentences"])
return passage
def doc_to_target(self, doc): def doc_to_target(self, doc):
return "" return ""
......
...@@ -47,6 +47,12 @@ class CoQA(Task): ...@@ -47,6 +47,12 @@ class CoQA(Task):
doc_text += question + answer doc_text += question + answer
return doc_text return doc_text
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["story"] + " " + doc["questions"]
@classmethod @classmethod
def get_answers(cls, doc, turn_id): def get_answers(cls, doc, turn_id):
# Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers). # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
......
...@@ -87,6 +87,12 @@ class DROP(Task): ...@@ -87,6 +87,12 @@ class DROP(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:" return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc['passage'] + " " + doc['question']
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + ", ".join(doc["answers"][0]) return " " + ", ".join(doc["answers"][0])
......
...@@ -24,6 +24,12 @@ class CoLA(HFTask): ...@@ -24,6 +24,12 @@ class CoLA(HFTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"]) return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["sentence"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " {}".format({1: "yes", 0: "no"}[doc["label"]]) return " {}".format({1: "yes", 0: "no"}[doc["label"]])
......
...@@ -27,6 +27,12 @@ class HeadQABase(HFTask, MultipleChoiceTask): ...@@ -27,6 +27,12 @@ class HeadQABase(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
class HeadQAEn(HeadQABase): class HeadQAEn(HeadQABase):
DATASET_NAME = "en" DATASET_NAME = "en"
...@@ -39,4 +45,4 @@ class HeadQAEsDeprecated(HeadQABase): ...@@ -39,4 +45,4 @@ class HeadQAEsDeprecated(HeadQABase):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.") print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
\ No newline at end of file
...@@ -98,6 +98,12 @@ class EthicsCM(Ethics): ...@@ -98,6 +98,12 @@ class EthicsCM(Ethics):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nQuestion: Is this wrong?\nAnswer:".format(doc[1]) return "{}\nQuestion: Is this wrong?\nAnswer:".format(doc[1])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc[1]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " {}".format(yesno(int(doc[0]))) return " {}".format(yesno(int(doc[0])))
...@@ -138,6 +144,12 @@ class EthicsDeontology(Ethics): ...@@ -138,6 +144,12 @@ class EthicsDeontology(Ethics):
prompt = " ".join([doc[1], doc[2]]) prompt = " ".join([doc[1], doc[2]])
return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(prompt) return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(prompt)
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return " ".join([doc[1], doc[2]])
def doc_to_target(self, doc): def doc_to_target(self, doc):
target = ["unreasonable", "reasonable"][int(doc[0])] target = ["unreasonable", "reasonable"][int(doc[0])]
return " {}".format(target) return " {}".format(target)
...@@ -187,6 +199,12 @@ class EthicsJustice(Ethics): ...@@ -187,6 +199,12 @@ class EthicsJustice(Ethics):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(doc[1]) return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(doc[1])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc[1]
def doc_to_target(self, doc): def doc_to_target(self, doc):
target = ["unreasonable", "reasonable"][int(doc[0])] target = ["unreasonable", "reasonable"][int(doc[0])]
return " {}".format(target) return " {}".format(target)
...@@ -253,6 +271,12 @@ class EthicsUtilitarianismOriginal(Ethics): ...@@ -253,6 +271,12 @@ class EthicsUtilitarianismOriginal(Ethics):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return 'Activity: "{}"\nRating:'.format(doc["activity"]) return 'Activity: "{}"\nRating:'.format(doc["activity"])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["activity"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + doc["rating"] return " " + doc["rating"]
......
...@@ -58,6 +58,12 @@ class Math(Task): ...@@ -58,6 +58,12 @@ class Math(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "Problem: " + doc["problem"] + "\nAnswer:" return "Problem: " + doc["problem"] + "\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["problem"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + doc["answer"] return " " + doc["answer"]
......
...@@ -116,3 +116,10 @@ class GeneralHendrycksTest(MultipleChoiceTask): ...@@ -116,3 +116,10 @@ class GeneralHendrycksTest(MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
...@@ -11,5 +11,11 @@ class LAMBADA_cloze(LAMBADA): ...@@ -11,5 +11,11 @@ class LAMBADA_cloze(LAMBADA):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc['text'].rsplit(' ', 1)[0] + " ____. ->" return doc['text'].rsplit(' ', 1)[0] + " ____. ->"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc['text']
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + doc['text'].rsplit(' ', 1)[1] return " " + doc['text'].rsplit(' ', 1)[1]
...@@ -43,6 +43,12 @@ class MCTACO(HFTask): ...@@ -43,6 +43,12 @@ class MCTACO(HFTask):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\ return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:" f"Answer: {doc['answer']}\nPlausible:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc['question'] + " " + doc['sentence']
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + ["no", "yes"][doc['label']] return " " + ["no", "yes"][doc['label']]
......
...@@ -73,6 +73,12 @@ class MuTualBase(Task): ...@@ -73,6 +73,12 @@ class MuTualBase(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return self.detokenize(doc["article"]) return self.detokenize(doc["article"])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["article"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])]) return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])])
......
...@@ -36,6 +36,12 @@ class NaturalQs(HFTask): ...@@ -36,6 +36,12 @@ class NaturalQs(HFTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return 'Q: ' + doc['question']['text'] + '\n\n' + 'A:' return 'Q: ' + doc['question']['text'] + '\n\n' + 'A:'
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc['question']['text']
def doc_to_target(self, doc): def doc_to_target(self, doc):
# There's a short answer and a long answer. Based on the paper, I'm using the long answer. # There's a short answer and a long answer. Based on the paper, I'm using the long answer.
short_answer = doc['annotations']['short_answers'][0]['text'] short_answer = doc['annotations']['short_answers'][0]['text']
......
...@@ -27,3 +27,10 @@ class OpenBookQA(HFTask, MultipleChoiceTask): ...@@ -27,3 +27,10 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
...@@ -55,3 +55,10 @@ class PROST(HFTask, MultipleChoiceTask): ...@@ -55,3 +55,10 @@ class PROST(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
...@@ -73,6 +73,13 @@ class QA4MRE(MultipleChoiceTask): ...@@ -73,6 +73,13 @@ class QA4MRE(MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]) return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["source"] + " " + doc["query"]
class QA4MRE_2011(QA4MRE): class QA4MRE_2011(QA4MRE):
YEAR = 2011 YEAR = 2011
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment