Change glue and superglue prompts

c3f724cf · Leo Gao · 1050109b · c3f724cf · c3f724cf · c3f724cf
Commit c3f724cf authored Feb 08, 2021 by Leo Gao
Showing with 41 additions and 31 deletions

lm_eval/tasks/glue.py lm_eval/tasks/glue.py +17 -17

lm_eval/tasks/superglue.py lm_eval/tasks/superglue.py +12 -12

lm_eval/utils.py lm_eval/utils.py +11 -1

main.py main.py +1 -1

No files found.
--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -3,7 +3,7 @@ from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
 from scipy.stats import pearsonr, spearmanr
 from tqdm import auto as tqdm_lib
 from . common import HFTask, yesno
-
+from ..utils import general_detokenize

 # Single-Sentence Tasks

@@ -22,10 +22,10 @@ class CoLA(HFTask):
        return True

    def fewshot_description(self):
-        return "Does this sentence make sense?:\tTrue or False?"
+        return "Does this sentence make sense? (True or False)"

    def doc_to_text(self, doc):
-        return "Sentence: {}\nAnswer:".format(doc["sentence"])
+        return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])

    def doc_to_target(self, doc):
        return " {}".format({1: "True", 0: "False"}[doc["label"]])
@@ -71,8 +71,8 @@ class SST(HFTask):
        return "Indicate if each sentence is Positive or Negative."

    def doc_to_text(self, doc):
-        return "sentence:\t{}\t\nanswer:".format(
-            doc["sentence"],
+        return "{}\nQuestion: Is this sentence Positive or Negative?\nAnswer:".format(
+            general_detokenize(doc["sentence"]),
        )

    def doc_to_target(self, doc):
@@ -127,9 +127,9 @@ class MNLI(HFTask):
            return self.data["test_matched"]

    def doc_to_text(self, doc):
-        return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
+        return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
            doc["premise"],
-            doc["hypothesis"],
+            doc["hypothesis"] + ('' if doc["hypothesis"].endswith('.') else '.'),
        )

    def doc_to_target(self, doc):
@@ -187,7 +187,7 @@ class QNLI(HFTask):
        return True

    def doc_to_text(self, doc):
-        return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
+        return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
            doc["question"],
            doc["sentence"],
        )
@@ -235,7 +235,7 @@ class WNLI(HFTask):
        return True

    def doc_to_text(self, doc):
-        return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
+        return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )
@@ -284,7 +284,7 @@ class RTE(HFTask):
        return True

    def doc_to_text(self, doc):
-        return "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
+        return "{}\nQuestion: {} True or False?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )
@@ -338,17 +338,17 @@ class MRPC(HFTask):
        return "Indicate if both sentences mean the same thing."

    def doc_to_text(self, doc):
-        return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
-            doc["sentence1"],
-            doc["sentence2"],
+        return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
+            general_detokenize(doc["sentence1"]),
+            general_detokenize(doc["sentence2"]),
        )

    def doc_to_target(self, doc):
        return " {}".format(yesno(doc["label"]))

    def construct_requests(self, doc, ctx):
-        ll_yes, _ = rf.loglikelihood(ctx, " yes")
-        ll_no, _ = rf.loglikelihood(ctx, " no")
+        ll_yes, _ = rf.loglikelihood(ctx, " Yes")
+        ll_no, _ = rf.loglikelihood(ctx, " No")
        return ll_yes, ll_no

    def process_results(self, doc, results):
@@ -390,7 +390,7 @@ class QQP(HFTask):
        return "Indicate if both questions ask the same thing."

    def doc_to_text(self, doc):
-        return "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
+        return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
            doc["question1"],
            doc["question2"],
        )
@@ -443,7 +443,7 @@ class STSB(HFTask):
           "where 5 means identical and 0 means unrelated."

    def doc_to_text(self, doc):
-        return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
+        return "sentence 1: {}\nsentence 2: {}\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

--- a/lm_eval/tasks/superglue.py
+++ b/lm_eval/tasks/superglue.py
@@ -28,7 +28,7 @@ class BoolQ(HFTask):
        return "Read the following passages and answer each question with a yes or a no."

    def doc_to_text(self, doc):
-        return f"{doc['passage']}\nquestion: {doc['question']}\nanswer:"
+        return f"{doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
    
    def doc_to_target(self, doc):
        return " " + yesno(doc['label']) 
@@ -80,7 +80,7 @@ class CommitmentBank(HFTask):
            "to the truth of the hypothesis. The three possible labels are true, false or neither."

    def doc_to_text(self, doc):
-        return "{}\nquestion: {} true, false or neither?\nanswer:".format(
+        return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )
@@ -89,12 +89,12 @@ class CommitmentBank(HFTask):
        # True = entailment
        # False = contradiction
        # Neither = neutral
-        return " {}".format({0: "true", 1: "neither", 2: "false"}[doc["label"]])
+        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def construct_requests(self, doc, ctx):
-        ll_true, _ = rf.loglikelihood(ctx, ' true')
-        ll_neither, _ = rf.loglikelihood(ctx, ' neither')
-        ll_false, _ = rf.loglikelihood(ctx, ' false')
+        ll_true, _ = rf.loglikelihood(ctx, ' True')
+        ll_neither, _ = rf.loglikelihood(ctx, ' Neither')
+        ll_false, _ = rf.loglikelihood(ctx, ' False')

        return ll_true, ll_neither, ll_false

@@ -214,15 +214,15 @@ class MultiRC(HFTask):
        return "READING COMPREHENSION ANSWER KEY"

    def doc_to_text(self, doc):
-        return f"{doc['paragraph']}\n\n{doc['question']}\n"
+        return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"

    def doc_to_target(self, doc):
        return self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
-        label_str = "True" if label else "False"
-        return f"[{label_str}] {answer}"
+        label_str = "Yes" if label else "No"
+        return f"{label_str}, {answer}"

    def construct_requests(self, doc, ctx):
        true_choice = self.format_answer(answer=doc["answer"], label=True)
@@ -364,8 +364,8 @@ class WordsInContext(HFTask):
        return ""

    def doc_to_text(self, doc):
-        return "{}\n{}\nQuestion: Is the word '{}' used in the same way in the" \
-               " two sentences above?\nanswer:".format(
+        return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \
+               " two sentences above?\nAnswer:".format(
                    doc["sentence1"],
                    doc["sentence2"],
                    doc["sentence1"][doc["start1"]:doc["end1"]],
@@ -438,7 +438,7 @@ class SGWinogradSchemaChallenge(HFTask):
        # NOTE: HuggingFace span indices are word-based not character-based.
        pre = " ".join(raw_passage.split()[:doc["span2_index"]])
        post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:]
-        passage = pre + " *{}*".format(doc['span2_text']) + post
+        passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post)
        noun = doc["span1_text"]
        pronoun = doc["span2_text"]
        text = (

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
 import os
+import re


 class ExitCodeError(Exception):
@@ -39,4 +40,13 @@ def chunks(iter, n):
            yield arr
            arr = []
    
-    if arr: yield arr
\ No newline at end of file
+    if arr: yield arr
+
+def general_detokenize(string):  # Return `string` with tokenizer-style spacing around contractions, brackets, quotes and punctuation removed.
+    string = string.replace(" n't", "n't")  # "do n't" -> "don't"
+    string = string.replace(" )", ")")
+    string = string.replace("( ", "(")
+    string = string.replace("\" ", "\"")  # NOTE: removes the space on either side of every double quote, opening or closing
+    string = string.replace(" \"", "\"")
+    string = re.sub(r" (['.,])", r"\1", string)  # re.sub(pattern, repl, string): the subject string is required as the third argument
+    return string
\ No newline at end of file
--- a/main.py
+++ b/main.py
@@ -16,7 +16,7 @@ def parse_args():
    parser.add_argument('--model_args', default="")
    parser.add_argument('--tasks', default="all_tasks")
    parser.add_argument('--provide_description', action="store_true")
-    parser.add_argument('--num_fewshot', type=int, default=1)
+    parser.add_argument('--num_fewshot', type=int, default=0)
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--output_path', default=None)
    parser.add_argument('--limit', type=int, default=None)