Commit dae7b868 authored by Quentin Gregory Anthony's avatar Quentin Gregory Anthony
Browse files

Added decontamination to remaining evals

parent 341663a9
......@@ -40,6 +40,12 @@ class ANLIBase(HFTask):
# want to do it exactly as OA did?
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'
def should_decontaminate(self):
    """Overlap checking against the training set is enabled for ANLI."""
    return True

def doc_to_decontamination_query(self, doc):
    """The premise is the text compared against the training corpus."""
    premise = doc["premise"]
    return premise
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
......
......@@ -32,6 +32,12 @@ class ARCEasy(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc):
return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
class ARCChallenge(ARCEasy):
DATASET_PATH = "ai2_arc"
......
......@@ -55,6 +55,12 @@ class Arithmetic(Task):
def doc_to_text(self, doc):
    """Prompt text: the arithmetic problem's context attribute."""
    return doc.context

def should_decontaminate(self):
    """Overlap checking is supported for this task."""
    return True

def doc_to_decontamination_query(self, doc):
    """Decontaminate on the same context string used as the prompt."""
    return doc.context
def doc_to_target(self, doc):
return doc.completion
......
......@@ -93,6 +93,12 @@ class Asdiv(Task):
# TODO: add solution-type
return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'
def should_decontaminate(self):
    """Overlap checking is supported for ASDiv."""
    return True

def doc_to_decontamination_query(self, doc):
    """Body and question joined by a single space for overlap checking."""
    return f"{doc['body']} {doc['question']}"
def doc_to_target(self, doc):
# TODO: add formula
......
......@@ -47,6 +47,12 @@ class BlimpTask(HFTask):
# this method is invoked by tests only
return ""
def should_decontaminate(self):
    """Overlap checking is supported for BLiMP."""
    return True

def doc_to_decontamination_query(self, doc):
    """Both the grammatical and ungrammatical sentence, space-joined."""
    return " ".join([doc["sentence_good"], doc["sentence_bad"]])
def doc_to_target(self, doc):
# this method is invoked by tests only
return ""
......
......@@ -38,6 +38,13 @@ class CBTBase(HFTask):
text = "Passage: " + passage + "\nQuestion: " + doc["question"]
return self.detokenize(text)
def should_decontaminate(self):
    """Overlap checking is supported for CBT."""
    return True

def doc_to_decontamination_query(self, doc):
    """All passage sentences joined into one string for overlap checking."""
    return " ".join(doc["sentences"])
def doc_to_target(self, doc):
return ""
......
......@@ -47,6 +47,12 @@ class CoQA(Task):
doc_text += question + answer
return doc_text
def should_decontaminate(self):
    """Overlap checking is supported for CoQA."""
    return True

def doc_to_decontamination_query(self, doc):
    """Return the overlap-check text: the story plus every question turn.

    NOTE(review): the original returned ``doc["story"] + " " + doc["questions"]``,
    which raises TypeError — ``doc["questions"]`` is not a string but a
    container of question turns (it is iterated turn-by-turn when building
    the prompt). Per the CoQA schema the question strings live under
    ``doc["questions"]["input_text"]``; confirm against the dataset loader.
    """
    return doc["story"] + " " + "\n".join(doc["questions"]["input_text"])
@classmethod
def get_answers(cls, doc, turn_id):
# Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
......
......@@ -87,6 +87,12 @@ class DROP(Task):
def doc_to_text(self, doc):
    """Prompt: passage followed by the question and an answer cue."""
    return "Passage: {}\nQuestion: {}\nAnswer:".format(doc["passage"], doc["question"])

def should_decontaminate(self):
    """Overlap checking is supported for DROP."""
    return True

def doc_to_decontamination_query(self, doc):
    """Passage and question joined by a space, without the prompt scaffolding."""
    return f"{doc['passage']} {doc['question']}"
def doc_to_target(self, doc):
return " " + ", ".join(doc["answers"][0])
......
......@@ -24,6 +24,12 @@ class CoLA(HFTask):
def doc_to_text(self, doc):
    """Prompt: the sentence plus an acceptability yes/no question."""
    return f"{doc['sentence']}\nQuestion: Does this sentence make sense?\nAnswer:"

def should_decontaminate(self):
    """Overlap checking is supported for CoLA."""
    return True

def doc_to_decontamination_query(self, doc):
    """Only the raw sentence is checked for training-set overlap."""
    return doc["sentence"]

def doc_to_target(self, doc):
    """Gold completion: ' yes' for label 1, ' no' for label 0."""
    label_words = {0: "no", 1: "yes"}
    return " " + label_words[doc["label"]]
......
......@@ -27,6 +27,12 @@ class HeadQABase(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc):
    """Prompt shown to the model: the preformatted query string."""
    return doc["query"]

def should_decontaminate(self):
    """Overlap checking is supported for HEAD-QA."""
    return True

def doc_to_decontamination_query(self, doc):
    """Decontaminate on the same query text used as the prompt."""
    query = doc["query"]
    return query
class HeadQAEn(HeadQABase):
DATASET_NAME = "en"
......@@ -39,4 +45,4 @@ class HeadQAEsDeprecated(HeadQABase):
def __init__(self):
super().__init__()
print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
\ No newline at end of file
print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
......@@ -98,6 +98,12 @@ class EthicsCM(Ethics):
def doc_to_text(self, doc):
    """Prompt: the scenario text (column 1) plus a wrongness question."""
    return f"{doc[1]}\nQuestion: Is this wrong?\nAnswer:"

def should_decontaminate(self):
    """Overlap checking is supported for ETHICS commonsense morality."""
    return True

def doc_to_decontamination_query(self, doc):
    """Only the raw scenario text is checked for training-set overlap."""
    return doc[1]

def doc_to_target(self, doc):
    """Gold completion: yes/no derived from the integer label in column 0."""
    return f" {yesno(int(doc[0]))}"
......@@ -138,6 +144,12 @@ class EthicsDeontology(Ethics):
prompt = " ".join([doc[1], doc[2]])
return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(prompt)
def should_decontaminate(self):
    """Overlap checking is supported for ETHICS deontology."""
    return True

def doc_to_decontamination_query(self, doc):
    """Scenario (column 1) and excuse (column 2) joined by a space."""
    return doc[1] + " " + doc[2]

def doc_to_target(self, doc):
    """Gold completion: ' unreasonable' for label 0, ' reasonable' for 1."""
    return " " + ("unreasonable", "reasonable")[int(doc[0])]
......@@ -187,6 +199,12 @@ class EthicsJustice(Ethics):
def doc_to_text(self, doc):
    """Prompt: the statement (column 1) wrapped in a reasonableness question."""
    statement = doc[1]
    return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(statement)

def should_decontaminate(self):
    """Overlap checking is supported for ETHICS justice."""
    return True

def doc_to_decontamination_query(self, doc):
    """Only the raw statement text is checked for training-set overlap."""
    return doc[1]

def doc_to_target(self, doc):
    """Gold completion: ' unreasonable' for label 0, ' reasonable' for 1."""
    return " " + ("unreasonable", "reasonable")[int(doc[0])]
......@@ -253,6 +271,12 @@ class EthicsUtilitarianismOriginal(Ethics):
def doc_to_text(self, doc):
    """Prompt: the activity in quotes followed by a rating cue."""
    return f'Activity: "{doc["activity"]}"\nRating:'

def should_decontaminate(self):
    """Overlap checking is supported for ETHICS utilitarianism (original)."""
    return True

def doc_to_decontamination_query(self, doc):
    """Only the raw activity text is checked for training-set overlap."""
    return doc["activity"]

def doc_to_target(self, doc):
    """Gold completion: a space followed by the rating string."""
    return f" {doc['rating']}"
......
......@@ -58,6 +58,12 @@ class Math(Task):
def doc_to_text(self, doc):
    """Prompt: the problem statement with a 'Problem:'/'Answer:' frame."""
    return f"Problem: {doc['problem']}\nAnswer:"

def should_decontaminate(self):
    """Overlap checking is supported for MATH."""
    return True

def doc_to_decontamination_query(self, doc):
    """Only the raw problem text is checked for training-set overlap."""
    return doc["problem"]

def doc_to_target(self, doc):
    """Gold completion: a space followed by the answer string."""
    return f" {doc['answer']}"
......
......@@ -116,3 +116,10 @@ class GeneralHendrycksTest(MultipleChoiceTask):
def doc_to_text(self, doc):
return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
......@@ -11,5 +11,11 @@ class LAMBADA_cloze(LAMBADA):
def doc_to_text(self, doc):
    """Cloze prompt: the text minus its final word, plus a blank marker."""
    pieces = doc["text"].rsplit(" ", 1)
    return pieces[0] + " ____. ->"

def should_decontaminate(self):
    """Overlap checking is supported for LAMBADA (cloze form)."""
    return True

def doc_to_decontamination_query(self, doc):
    """The full passage, including the held-out final word."""
    return doc["text"]

def doc_to_target(self, doc):
    """Gold completion: a space followed by the passage's final word."""
    return " " + doc["text"].rsplit(" ", 1)[1]
......@@ -43,6 +43,12 @@ class MCTACO(HFTask):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:"
def should_decontaminate(self):
    """Overlap checking is supported for MC-TACO."""
    return True

def doc_to_decontamination_query(self, doc):
    """Question followed by the sentence, joined by a space."""
    return f"{doc['question']} {doc['sentence']}"

def doc_to_target(self, doc):
    """Gold completion: ' no' for label 0, ' yes' for label 1."""
    plausibility = ["no", "yes"][doc["label"]]
    return " " + plausibility
......
......@@ -73,6 +73,12 @@ class MuTualBase(Task):
def doc_to_text(self, doc):
    """Prompt: the dialogue article, detokenized for display."""
    article = doc["article"]
    return self.detokenize(article)

def should_decontaminate(self):
    """Overlap checking is supported for MuTual."""
    return True

def doc_to_decontamination_query(self, doc):
    """The raw (non-detokenized) article is checked for overlap."""
    return doc["article"]

def doc_to_target(self, doc):
    """Gold completion: the detokenized option matching the answer letter."""
    answer_idx = self.CHOICES.index(doc["answers"])
    option = doc["options"][answer_idx]
    return " " + self.detokenize(option)
......
......@@ -36,6 +36,12 @@ class NaturalQs(HFTask):
def doc_to_text(self, doc):
    """Prompt: the question text framed as 'Q: ... A:'."""
    return f"Q: {doc['question']['text']}\n\nA:"

def should_decontaminate(self):
    """Overlap checking is supported for Natural Questions."""
    return True

def doc_to_decontamination_query(self, doc):
    """Only the raw question text is checked for training-set overlap."""
    return doc["question"]["text"]
def doc_to_target(self, doc):
# There's a short answer and a long answer. Based on the paper, I'm using the long answer.
short_answer = doc['annotations']['short_answers'][0]['text']
......
......@@ -27,3 +27,10 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc):
    """Prompt shown to the model: the preformatted query string."""
    return doc["query"]

def should_decontaminate(self):
    """Overlap checking is supported for OpenBookQA."""
    return True

def doc_to_decontamination_query(self, doc):
    """Decontaminate on the same query text used as the prompt."""
    query = doc["query"]
    return query
......@@ -55,3 +55,10 @@ class PROST(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc):
return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
......@@ -73,6 +73,13 @@ class QA4MRE(MultipleChoiceTask):
def doc_to_text(self, doc):
    """Prompt: source passage, then the question and an answer cue."""
    return f"{doc['source']}\nQuestion: {doc['query']}\nAnswer:"

def should_decontaminate(self):
    """Overlap checking is supported for QA4MRE."""
    return True

def doc_to_decontamination_query(self, doc):
    """Source passage and query joined by a space, without prompt framing."""
    return " ".join([doc["source"], doc["query"]])
class QA4MRE_2011(QA4MRE):
YEAR = 2011
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment