Unverified commit 0e4139b8 authored by Leo Gao, committed by GitHub

Merge pull request #86 from jon-tow/doc-to-text-refactor

Move `doc_to_text` target code into `doc_to_target`
parents e5d0229f d77241eb
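In short: each task's old `doc_to_text(doc, include_target=True)` built the prompt and, when `include_target` was set, appended the gold answer to it; after this change `doc_to_text(doc)` returns only the prompt and the new `doc_to_target(doc)` returns only the answer continuation. A minimal sketch of the pattern (the task name and `doc` fields below are illustrative placeholders, not taken from any file in this diff):

```python
class ExampleTask:
    # Before: one method produced the prompt and, optionally, the gold answer.
    def doc_to_text_old(self, doc, include_target=True):
        text = "Question: " + doc["question"] + "\nAnswer:"
        if include_target:
            text += " " + doc["answer"]
        return text

    # After: the prompt and the target are produced by separate methods.
    def doc_to_text(self, doc):
        return "Question: " + doc["question"] + "\nAnswer:"

    def doc_to_target(self, doc):
        return " " + doc["answer"]
```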
@@ -34,16 +34,16 @@ class ANLIBase(HFTask):
# TODO: figure out description
return ""
def doc_to_text(self, doc, include_target=True):
def doc_to_text(self, doc):
print(doc)
# OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning
# of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
# appended onto the question, with no "Answer:" or even a newline. Do we *really*
# want to do it exactly as OA did?
q = doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + '\n'
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + '\nTrue, False, or Neither?'
a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '')
return q + a
def doc_to_target(self, doc):
return " " + ["True", "Neither", "False"][doc['label']]
# TODO: Implement evaluation code
......
@@ -19,10 +19,11 @@ class ARCEasy(HFTask):
# TODO: figure out description
return ""
def doc_to_text(self, doc, include_target=True):
q = "Question: " + doc['question'] + '\n'
a = "Answer:" + ((" " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]) if include_target else "")
return q + a
def doc_to_text(self, doc):
return "Question: " + doc['question'] + '\nAnswer:'
def doc_to_target(self, doc):
return " " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
......
@@ -39,11 +39,11 @@ class CoLA(HFTask):
def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?"
def doc_to_text(self, doc, include_target=True):
text = "Sentence: {}\nAnswer:".format(doc["sentence"])
if include_target:
text += " {}".format({1: "True", 0: "False"}[doc["label"]])
return text
def doc_to_text(self, doc):
return "Sentence: {}\nAnswer:".format(doc["sentence"])
def doc_to_target(self, doc):
return " {}".format({1: "True", 0: "False"}[doc["label"]])
def evaluate(self, docs, lm, provide_description, num_fewshot):
@@ -92,17 +92,17 @@ class MNLI(HFTask):
if self.has_test_docs():
return self.data["test_matched"]
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
# False = contradiction
# Neither = neutral
text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
return text
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -154,14 +154,14 @@ class MRPC(HFTask):
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
def doc_to_text(self, doc):
return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def doc_to_target(self, doc):
return " {}".format(yesno(doc["label"]))
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -194,16 +194,16 @@ class RTE(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
# 0 = entailment
# 1 = not_entailment
text += " {}".format({0: "True", 1: "False"}[doc["label"]])
return text
def doc_to_target(self, doc):
# 0 = entailment
# 1 = not_entailment
return " {}".format({0: "True", 1: "False"}[doc["label"]])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -236,16 +236,16 @@ class QNLI(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
def doc_to_text(self, doc):
return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
doc["question"],
doc["sentence"],
)
if include_target:
# True = entailment
# False = not entailment
text += " {}".format({0: "Yes", 1: "No"}[doc["label"]])
return text
def doc_to_target(self, doc):
# True = entailment
# False = not entailment
return " {}".format({0: "Yes", 1: "No"}[doc["label"]])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -281,14 +281,14 @@ class QQP(HFTask):
def fewshot_description(self):
return "Indicate if both questions ask the same thing."
def doc_to_text(self, doc, include_target=True):
text = "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
def doc_to_text(self, doc):
return "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
doc["question1"],
doc["question2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def doc_to_target(self, doc):
return " {}".format(yesno(doc["label"]))
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -325,14 +325,14 @@ class STSB(HFTask):
return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
"where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
def doc_to_text(self, doc):
return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
text += " {}".format(doc["label"])
return text
def doc_to_target(self, doc):
return " {}".format(doc["label"])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -386,13 +386,13 @@ class SST(HFTask):
def fewshot_description(self):
return "Indicate if each sentence is Positive or Negative."
def doc_to_text(self, doc, include_target=True):
text = "sentence:\t{}\t\nanswer:".format(
def doc_to_text(self, doc):
return "sentence:\t{}\t\nanswer:".format(
doc["sentence"],
)
if include_target:
text += " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
return text
def doc_to_target(self, doc):
return " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -425,17 +425,17 @@ class WNLI(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
# True = entailment
# False = contradiction
# Neither = neutral
text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
return text
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
......
@@ -34,22 +34,22 @@ class HellaSwag(HFTask):
def fewshot_description(self):
return "Label for the relevant action: Sentences describing the context, with an incomplete sentence trailing\nanswer that plausibly completes the situation."
def doc_to_text(self, doc, include_target=True):
text = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
if include_target:
letter_answer = doc['label']
if letter_answer == '0':
index = 0
elif letter_answer == '1':
index = 1
elif letter_answer == '2':
index = 2
elif letter_answer == '3':
index = 3
else:
raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
text += doc['endings'][index]
return text
def doc_to_text(self, doc):
return doc['activity_label'] + ': ' + doc['ctx'] + '\n'
def doc_to_target(self, doc):
letter_answer = doc['label']
if letter_answer == '0':
index = 0
elif letter_answer == '1':
index = 1
elif letter_answer == '2':
index = 2
elif letter_answer == '3':
index = 3
else:
raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
return doc['endings'][index]
# TODO: Implement evaluation code
......
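Aside on the HellaSwag hunk above: since the HF label is the digit of the correct ending, the `if`/`elif` ladder in `doc_to_target` is equivalent to a single `int()` conversion with a validity check. A sketch of that equivalent form (an observation only, not a change made by this commit):

```python
def label_to_index(label: str) -> int:
    # The HF HellaSwag label is one of '0'..'3'; anything else is invalid.
    if label not in {'0', '1', '2', '3'}:
        raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
    return int(label)
```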
@@ -32,23 +32,19 @@ class NaturalQs(HFTask):
return random.sample(self._traindocs, k)
def doc_to_text(self, doc, include_target=True):
question = doc['question']['text']
text = 'Q: ' + question + '\n\n' + 'A: '
if include_target:
# There's a short answer and a long answer. Based on the paper, I'm using the long answer.
short_answer = doc['annotations']['short_answers'][0]['text']
long_answer_start = doc['annotations']['long_answer'][0]['start_token']
long_answer_end = doc['annotations']['long_answer'][0]['end_token']
long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end]
long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end]
long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html]
long_answer = " ".join(long_answer_chars)
text += long_answer # Replace with short_answer[0] for short answer
return text
def doc_to_text(self, doc):
return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
def doc_to_target(self, doc):
# There's a short answer and a long answer. Based on the paper, I'm using the long answer.
short_answer = doc['annotations']['short_answers'][0]['text']
long_answer_start = doc['annotations']['long_answer'][0]['start_token']
long_answer_end = doc['annotations']['long_answer'][0]['end_token']
long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end]
long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end]
long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html]
long_answer = " ".join(long_answer_chars)
return long_answer # Replace with short_answer[0] for short answer
# TODO: Implement evaluation code
......
@@ -36,22 +36,22 @@ class OpenBookQA(HFTask):
def fewshot_description(self):
return "Text of the question prompt\nText of the answer completion"
def doc_to_text(self, doc, include_target=True):
text = doc['question_stem'] + '\n'
if include_target:
letter_answer = doc['answerKey']
if letter_answer == 'A':
index = 0
elif letter_answer == 'B':
index = 1
elif letter_answer == 'C':
index = 2
elif letter_answer == 'D':
index = 3
else:
raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
text += doc['choices']['text'][index] + '.'
return text
def doc_to_text(self, doc):
return doc['question_stem'] + '\n'
def doc_to_target(self, doc):
letter_answer = doc['answerKey']
if letter_answer == 'A':
index = 0
elif letter_answer == 'B':
index = 1
elif letter_answer == 'C':
index = 2
elif letter_answer == 'D':
index = 3
else:
raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
return doc['choices']['text'][index] + '.'
# TODO: Implement evaluation code
......
@@ -47,13 +47,14 @@ class PiQA(Dataset):
def fewshot_description(self):
pass
def doc_to_text(self, doc, include_target=True):
if include_target:
rightanswer = int(doc[1][0])+1
return ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
def doc_to_text(self, doc):
#TODO: check if oa uses newline
return doc['goal'] + ' '
def doc_to_target(self, doc):
rightanswer = int(doc[1][0]) + 1
return ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
......
@@ -55,11 +55,11 @@ class QuAC(Dataset):
docs.append(doc)
return docs
def doc_to_text(self, doc, include_target=True):
text = 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
if include_target:
text += doc['answer']
return text
def doc_to_text(self, doc):
return 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
def doc_to_target(self, doc):
return doc['answer']
# TODO: Implement evaluation code
......
@@ -31,16 +31,16 @@ class SQuAD(HFTask):
# TODO: redo description
return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."
def doc_to_text(self, doc, include_target=True):
text = 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
if include_target:
answer_list = doc['answers']['text']
if len(answer_list) > 0:
answer = answer_list[0]
else:
answer = 'unanswerable'
text += answer
return text
def doc_to_text(self, doc):
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
def doc_to_target(self, doc):
answer_list = doc['answers']['text']
if len(answer_list) > 0:
answer = answer_list[0]
else:
answer = 'unanswerable'
return answer
# TODO: Implement evaluation code
......
@@ -41,11 +41,11 @@ class StoryCloze(Dataset):
def fewshot_description(self):
pass
def doc_to_text(self, doc, include_target=True):
if include_target:
return ' '.join([*doc[1:5],doc[int(doc[-1])-4]])
else:
return ' '.join([*doc[1:5]])
def doc_to_text(self, doc):
return ' '.join([*doc[1:5]])
def doc_to_target(self, doc):
return " " + doc[int(doc[-1]) - 4]
# TODO: Implement evaluation code
......
@@ -69,17 +69,17 @@ class CommitmentBank(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\ttrue, false or neither?\nanswer:".format(
def doc_to_text(self, doc):
return "{}\nquestion:\t{}\ttrue, false or neither?\nanswer:".format(
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
# False = contradiction
# Neither = neutral
text += " {}".format({0: "true", 1: "neither", 2: "false"}[doc["label"]])
return text
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " {}".format({0: "true", 1: "neither", 2: "false"}[doc["label"]])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -117,18 +117,18 @@ class Copa(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
def doc_to_text(self, doc):
# Drop the period
connector = {
"cause": "because",
"effect": "therefore",
}[doc["question"]]
text = doc["premise"].strip()[:-1] + f" {connector} "
if include_target:
correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
# Connect the sentences
text += self.convert_choice(correct_choice)
return text
return doc["premise"].strip()[:-1] + f" {connector} "
def doc_to_target(self, doc):
correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
# Connect the sentences
return self.convert_choice(correct_choice)
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -170,10 +170,11 @@ class MultiRC(HFTask):
def fewshot_description(self):
return "READING COMPREHENSION ANSWER KEY"
def doc_to_text(self, doc, include_target=True):
return f"{doc['paragraph']}\n\n{doc['question']}\n" \
+ (self.format_answer(answer=doc["answer"], label=doc["label"])
if include_target else "")
def doc_to_text(self, doc):
return f"{doc['paragraph']}\n\n{doc['question']}\n"
def doc_to_target(self, doc):
return self.format_answer(answer=doc["answer"], label=doc["label"])
@staticmethod
def format_answer(answer, label):
@@ -229,16 +230,16 @@ class WordsInContext(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\n{}\nquestion\tIs the word '{}' used in the same way in the" \
def doc_to_text(self, doc):
return "{}\n{}\nquestion\tIs the word '{}' used in the same way in the" \
" two sentences above?\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
doc["sentence1"][doc["start1"]:doc["end1"]],
)
if include_target:
text += " {}".format({0: "no", 1: "yes"}[doc["label"]])
return text
def doc_to_target(self, doc):
return " {}".format({0: "no", 1: "yes"}[doc["label"]])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -288,7 +289,7 @@ class SGWinogradSchemaChallenge(HFTask):
"For each passage, you must identify which noun the pronoun marked in *bold*" \
" refers to.\n====="
def doc_to_text(self, doc, include_target=True):
def doc_to_text(self, doc):
raw_passage = doc["text"]
passage = (
raw_passage[:doc["span2_index"]]
@@ -301,10 +302,11 @@ class SGWinogradSchemaChallenge(HFTask):
+ f"Question: In the passage above, what does the pronoun \"*{pronoun}*\" refer to?\n"
+ "Answer:"
)
if include_target:
text += " {}".format(doc["span1_text"])
return text
def doc_to_target(self, doc):
return " {}".format(doc["span1_text"])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
@@ -336,16 +338,12 @@ class RTE(HFTask):
#TODO: implement
pass
def doc_to_text(self, doc, include_target=True):
if include_target:
if doc['label'] == 0:
answer = 'True'
else:
answer = 'False'
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: ', answer])
else:
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: '])
def doc_to_text(self, doc):
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: '])
def doc_to_target(self, doc):
return 'True' if doc['label'] == 0 else 'False'
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
......
@@ -39,11 +39,11 @@ class TriviaQA(Dataset):
def fewshot_description(self):
pass
def doc_to_text(self, doc, include_target=True):
if include_target:
return ''.join(['Q: ', doc['Question'], '\n\n','A: ', doc['Answer']['Aliases'][0]])
else:
return ''.join(['Q: ', doc['Question'], '\n\n','A: '])
def doc_to_text(self, doc):
return ''.join(['Q: ', doc['Question'], '\n\n','A: '])
def doc_to_target(self, doc):
return doc['Answer']['Aliases'][0]
# TODO: Implement evaluation code
......
@@ -19,15 +19,15 @@ class WebQs(HFTask):
# TODO: figure out description
return ""
def doc_to_text(self, doc, include_target=True):
def doc_to_text(self, doc):
print(doc)
q = "Q: " + doc['question'] + '\n'
return "Q: " + doc['question'] + '\nA:'
def doc_to_target(self, doc):
# this picks one answer to be the "correct" one, despite sometimes
# multiple correct answers being possible.
# TODO: make sure we're actually handling multi-answer correctly
a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
return q + a
return " " + doc['answers'][0]
# TODO: Implement evaluation code
......
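The multi-answer TODO in the WebQs hunk above could eventually be resolved by comparing against all gold answers rather than only `answers[0]`. One possible sketch (not something this commit implements):

```python
def webqs_is_correct(prediction, doc):
    # Count the prediction as correct if it matches any gold answer,
    # not just the first one returned by doc_to_target.
    gold = [answer.strip().lower() for answer in doc['answers']]
    return prediction.strip().lower() in gold
```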
@@ -34,18 +34,19 @@ class Winogrande(HFTask):
def fewshot_description(self):
return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
def doc_to_text(self, doc, include_target=True):
def doc_to_text(self, doc):
return doc['sentence']
def doc_to_target(self, doc):
text = doc['sentence']
if include_target:
answer_n = doc['answer']
if answer_n == '1':
answer = doc['option1']
elif answer_n == '2':
answer = doc['option2']
else:
raise ValueError("Winogrande from HF datasets contained an invalid answer key")
text = text.replace("_", answer)
return text
answer_n = doc['answer']
if answer_n == '1':
answer = doc['option1']
elif answer_n == '2':
answer = doc['option2']
else:
raise ValueError("Winogrande from HF datasets contained an invalid answer key")
return text.replace("_", answer)
# TODO: Implement evaluation code
......
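Taken together, the split lets downstream code treat the target as a continuation of the prompt. A sketch of how the two methods could be consumed once the evaluation code (still marked TODO throughout this diff) is written; `task`, `fewshot_docs`, and `eval_doc` are hypothetical placeholders:

```python
def build_fewshot_prompt(task, fewshot_docs, eval_doc):
    context = ""
    for d in fewshot_docs:
        # Few-shot examples show both the prompt and its gold target.
        context += task.doc_to_text(d) + task.doc_to_target(d) + "\n\n"
    # The evaluated document contributes only the prompt; the model is then
    # scored on producing task.doc_to_target(eval_doc) as the continuation.
    return context + task.doc_to_text(eval_doc)
```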