gaoqiong / lm-evaluation-harness · Commits

Unverified commit 2c20cd1f, authored Jul 14, 2023 by Lintang Sutawika, committed by GitHub on Jul 14, 2023.

Merge pull request #671 from EleutherAI/revamp-process

Revamp process

Parents: 6862fa7d, 0dadc92a
Changes: 46 changed files in total; this page shows 20 of them, with 106 additions and 91 deletions (+106 -91).
lm_eval/tasks/qa4mre/qa4mre_2011.yaml (+4, -4)
lm_eval/tasks/qa4mre/qa4mre_2012.yaml (+1, -17)
lm_eval/tasks/qa4mre/qa4mre_2013.yaml (+1, -17)
lm_eval/tasks/race/preprocess_race.py (+1, -1)
lm_eval/tasks/race/race.yaml (+1, -1)
lm_eval/tasks/sciq/sciq.yaml (+4, -3)
lm_eval/tasks/super_glue/boolq/default.yaml (+6, -9)
lm_eval/tasks/super_glue/boolq/seq2seq.yaml (+3, -4)
lm_eval/tasks/super_glue/cb/default.yaml (+3, -4)
lm_eval/tasks/super_glue/copa/default.yaml (+2, -3)
lm_eval/tasks/super_glue/copa/utils.py (+4, -0)
lm_eval/tasks/super_glue/multirc/default.yaml (+13, -0)
lm_eval/tasks/super_glue/record/default.yaml (+14, -0)
lm_eval/tasks/super_glue/record/promptsource-01.yaml (+0, -5)
lm_eval/tasks/super_glue/record/promptsource-02.yaml (+0, -5)
lm_eval/tasks/super_glue/record/util.py (+15, -0)
lm_eval/tasks/super_glue/wic/default.yaml (+3, -4)
lm_eval/tasks/super_glue/wic/utils.py (+0, -13)
lm_eval/tasks/super_glue/wsc/default.yaml (+13, -0)
lm_eval/tasks/super_glue/wsc/preprocess_wsc.py (+18, -1)
lm_eval/tasks/qa4mre/qa4mre_2011.yaml

...
@@ -5,10 +5,10 @@ dataset_path: qa4mre
 dataset_name: 2011.main.EN
 output_type: multiple_choice
 test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
-gold_alias: !function preprocess_qa4mre.qa4mre_process
+# doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
+doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nAnswer:"
+doc_to_target: "{{correct_answer_id|int - 1}}"
+doc_to_choice: "{{answer_options.answer_str}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
 metric_list:
...
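The key change here is that gold indices and choice lists move out of Python preprocessing hooks (!function) and into plain Jinja templates. A minimal sketch of how the three new templates resolve, assuming jinja2 and an invented record whose field names mirror the templates above:

from jinja2 import Template

# Invented qa4mre-style record, for illustration only.
doc = {
    "document_str": "The quick brown fox jumps over the lazy dog.  ",
    "question_str": "What does the fox jump over?",
    "answer_options": {"answer_str": ["the dog", "the fence", "the moon"]},
    "correct_answer_id": "1",  # the dataset's answer ids are 1-based
}

prompt = Template("{{document_str.strip()}}\nQuestion: {{question_str}}\nAnswer:").render(**doc)
target = Template("{{correct_answer_id|int - 1}}").render(**doc)   # "0": a 0-based index into the choices
choices = Template("{{answer_options.answer_str}}").render(**doc)  # "['the dog', 'the fence', 'the moon']"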
lm_eval/tasks/qa4mre/qa4mre_2012.yaml

 group:
   - multiple_choice
+include: qa4mre_2011.yaml
 task: qa4mre_2012
-dataset_path: qa4mre
 dataset_name: 2012.main.EN
-output_type: multiple_choice
-test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
-gold_alias: !function preprocess_qa4mre.qa4mre_process
-should_decontaminate: true
-doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
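With include, the 2012 and 2013 configs shrink to just the keys that differ from qa4mre_2011.yaml. A rough sketch of that inheritance, assuming PyYAML; the harness's real loader also handles path resolution and its !function tags properly, which this ignores:

import yaml

# Let safe_load accept the harness's !function tags by keeping them as plain strings.
yaml.SafeLoader.add_constructor("!function", lambda loader, node: node.value)

def load_with_include(path):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    base = cfg.pop("include", None)
    if base is None:
        return cfg
    merged = load_with_include(base)  # assumes the base path is resolvable from cwd
    merged.update(cfg)                # child keys (task, dataset_name) win
    return merged

# load_with_include("qa4mre_2012.yaml") -> the 2011 config with
# task: qa4mre_2012 and dataset_name: 2012.main.EN overridden.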
lm_eval/tasks/qa4mre/qa4mre_2013.yaml

 group:
   - multiple_choice
+include: qa4mre_2011.yaml
 task: qa4mre_2013
-dataset_path: qa4mre
 dataset_name: 2013.main.EN
-output_type: multiple_choice
-test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
-gold_alias: !function preprocess_qa4mre.qa4mre_process
-should_decontaminate: true
-doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
lm_eval/tasks/race/preprocess_race.py

...
@@ -15,7 +15,7 @@ def get_answer_option(problem):
     return problem["options"][answer]


-def create_choices(doc):
+def doc_to_choice(doc):
     problem = last_problem(doc)
     choices = [problem["options"][i] for i in range(4)]
     return choices
...
lm_eval/tasks/race/race.yaml

...
@@ -5,9 +5,9 @@ dataset_path: EleutherAI/race
 dataset_name: high
 output_type: multiple_choice
 test_split: test
-create_choices: !function preprocess_race.create_choices
 doc_to_text: !function preprocess_race.doc_to_text
 doc_to_target: !function preprocess_race.doc_to_target
+doc_to_choice: !function preprocess_race.doc_to_choice
 metric_list:
   - metric: acc
     aggregation: mean
...
lm_eval/tasks/sciq/sciq.yaml

...
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = [distractor1, distractor2, distractor3, correct_answer] %}{% set gold = 3 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
 doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{correct_answer}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: 3
+doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{support}} {{question}}"
 metric_list:
   - metric: acc
     aggregation: mean
...
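doc_to_target can now be a constant: the gold answer is always the last entry of the four-item doc_to_choice list, so index 3 is always correct. A trivial check with an invented record:

# Invented SciQ-style record; correct_answer is deliberately placed last.
doc = {
    "distractor1": "mitochondria",
    "distractor2": "ribosomes",
    "distractor3": "lysosomes",
    "correct_answer": "chloroplasts",
}
choices = [doc["distractor1"], doc["distractor2"], doc["distractor3"], doc["correct_answer"]]
assert choices[3] == doc["correct_answer"]  # doc_to_target: 3 always points at the gold answer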
lm_eval/tasks/super_glue/boolq/default.yaml

...
@@ -6,13 +6,10 @@ dataset_name: boolq
 output_type: multiple_choice
 training_split: train
 validation_split: validation
-doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
+doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
+doc_to_target: label
+doc_to_choice: ["no", "yes"]
+should_decontaminate: true
+doc_to_decontamination_query: passage
 metric_list:
-  - metric: exact_match
-    aggregation: mean
-    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
+  - metric: acc
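Under output_type: multiple_choice, doc_to_target: label is an index into the doc_to_choice list, and accuracy is whether the highest-likelihood choice matches it. A stand-in sketch; score here is hypothetical, while the harness actually queries the model for the log-likelihood of each choice string after the rendered prompt:

def mc_accuracy(prompt, choices, label, score):
    # score(context, continuation) -> log-likelihood; hypothetical stand-in
    lls = [score(prompt, " " + choice) for choice in choices]
    pred = max(range(len(choices)), key=lambda i: lls[i])  # argmax over choices
    return float(pred == label)

# e.g. mc_accuracy(rendered_boolq_prompt, ["no", "yes"], doc["label"], score)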
lm_eval/tasks/super_glue/boolq/seq2seq.yaml

...
@@ -6,16 +6,15 @@ dataset_name: boolq
 output_type: greedy_until
 training_split: train
 validation_split: validation
-doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{answer_choices[label]}}" # this will be cast to an int.
+doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
+doc_to_target: "{{['no', 'yes'][label]}}"
+target_delimiter: " "
 generation_kwargs:
   until:
     - "\n\n"
     - "\n"
   do_sample: false
   temperature: 0.0
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 metric_list:
   - metric: exact_match
     aggregation: mean
...
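The generative variant instead compares the greedy continuation against the rendered target with exact_match. A rough sketch of that metric under ignore_case / ignore_punctuation; the harness's actual implementation may differ in details such as whitespace handling:

import string

def exact_match(pred, gold, ignore_case=True, ignore_punctuation=True):
    if ignore_case:
        pred, gold = pred.lower(), gold.lower()
    if ignore_punctuation:
        drop = str.maketrans("", "", string.punctuation)
        pred, gold = pred.translate(drop), gold.translate(drop)
    return float(pred.strip() == gold.strip())

exact_match("Yes.", "yes")  # -> 1.0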
lm_eval/tasks/super_glue/cb/default.yaml

 group:
   - super-glue-lm-eval-v1
-task: "cb"
+task: cb
 dataset_path: super_glue
 dataset_name: cb
 output_type: multiple_choice
 training_split: train
 validation_split: validation
 doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = ['True', 'False', 'Neither'] %}"
+doc_to_target: label
+doc_to_choice: ['True', 'False', 'Neither']
 metric_list:
   - metric: acc
   - metric: f1
...
lm_eval/tasks/super_glue/copa/default.yaml

 group:
   - super-glue-lm-eval-v1
-task: "copa"
+task: copa
 dataset_path: super_glue
 dataset_name: copa
 output_type: multiple_choice
...
@@ -8,7 +8,6 @@ training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text
 doc_to_target: !function utils.doc_to_target
-gold_alias: "{{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = [{{doc.choice1}}, 'b'] %} {{answer_choices}}"
+doc_to_choice: !function utils.doc_to_choice
 metric_list:
   - metric: acc
lm_eval/tasks/super_glue/copa/utils.py

...
@@ -15,3 +15,7 @@ def doc_to_target(doc):
     correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
     # Connect the sentences
     return " " + convert_choice(correct_choice)
+
+
+def doc_to_choice(doc):
+    return [" " + convert_choice(doc["choice1"]), " " + convert_choice(doc["choice2"])]
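The new doc_to_choice mirrors what doc_to_target already did for the gold choice. For context, convert_choice is defined earlier in utils.py, outside this hunk; it is assumed here to lower-case the first character so each alternative reads as a continuation of the premise:

def convert_choice(choice):
    # assumed behavior of the helper defined earlier in utils.py
    return choice[0].lower() + choice[1:]

def doc_to_choice(doc):
    return [" " + convert_choice(doc["choice1"]), " " + convert_choice(doc["choice2"])]

doc = {"choice1": "The toilet filled with water.", "choice2": "Water flowed from the spout.", "label": 1}
doc_to_choice(doc)  # [' the toilet filled with water.', ' water flowed from the spout.']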
lm_eval/tasks/super_glue/multirc/default.yaml (new file)

group:
  - super-glue-lm-eval-v1
task: multirc
dataset_path: super_glue
dataset_name: multirc
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{paragraph}}\nQuestion: {{question}}\nAnswer:"
doc_to_target: label
doc_to_choice: "['''{{answer}}\\nIs the answer correct? yes''', '''{{answer}}\\nIs the answer correct? no''']"
metric_list:
  - metric: acc
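This doc_to_choice is a Jinja template whose rendered output is itself a Python-style list literal (note the triple quotes and the escaped \n). Assuming, as the harness appears to do for list-shaped templates, that the rendered string is then evaluated as a literal:

import ast
from jinja2 import Template

tmpl = "['''{{answer}}\\nIs the answer correct? yes''', '''{{answer}}\\nIs the answer correct? no''']"
rendered = Template(tmpl).render(answer="Mars")  # invented answer field
choices = ast.literal_eval(rendered)             # assumption: list-shaped renders are literal-eval'd
print(choices[0])  # "Mars\nIs the answer correct? yes" (with a real newline)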
lm_eval/tasks/super_glue/record/default.yaml (new file)

# group:
#   - super-glue-lm-eval-v1
task: record
dataset_path: super_glue
dataset_name: record
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function util.doc_to_text
doc_to_target: "{{answers}}"
doc_to_choice: "{{entities}}"
metric_list:
  - metric: f1
  - metric: em
lm_eval/tasks/super_glue/record/promptsource-01.yaml (deleted)

include: promptsource-00.yaml
group:
  - super-glue-promptsource
task: "Add sentence after after (continuation choices)"
use_prompt: "promptsource:Add sentence after after (continuation choices)"
lm_eval/tasks/super_glue/record/promptsource-02.yaml (deleted)

include: promptsource-00.yaml
group:
  - super-glue-promptsource
task: "Can you figure out…"
use_prompt: "promptsource:Can you figure out…"
lm_eval/tasks/super_glue/record/util.py (new file)

def doc_to_text(doc):
    initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
    text = initial_text + "\n\n"
    for highlight in highlights:
        text += f" - {highlight}.\n"
    return text


def format_answer(query, entity):
    return f" - {query}".replace("@placeholder", entity)


def doc_to_target(doc):
    # We only output the first correct entity in a doc
    return format_answer(query=doc["query"], entity=doc["answers"][0])
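For illustration, here is what the helpers above produce on an invented ReCoRD-style record (passage highlights are separated by @highlight markers, and @placeholder in the query is filled with the first gold entity):

doc = {
    "passage": "Big storm hits coast.\n@highlight\nThousands evacuated\n@highlight\nPower out in Smithville",
    "query": "Officials said @placeholder will reopen schools on Monday.",
    "answers": ["Smithville"],
}
doc_to_text(doc)
# "Big storm hits coast.\n\n - Thousands evacuated.\n - Power out in Smithville.\n"
doc_to_target(doc)
# " - Officials said Smithville will reopen schools on Monday."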
lm_eval/tasks/super_glue/wic/default.yaml

...
@@ -6,9 +6,8 @@ dataset_name: wic
 output_type: multiple_choice
 training_split: train
 validation_split: validation
-doc_to_text: !function utils.doc_to_text
-doc_to_target: !function utils.doc_to_target
-gold_alias: "{{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
+doc_to_text: "Sentence 1: {{sentence1}}\nSentence 2: {{sentence2}}\nQuestion: Is the word '{{sentence1[start1:end1]}}' used in the same way in the two sentences above?\nAnswer:"
+doc_to_target: label
+doc_to_choice: ['no', 'yes']
 metric_list:
   - metric: acc
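The inlined template reproduces the deleted utils.py logic below, including the word extraction: Jinja supports Python-style slicing, so sentence1[start1:end1] pulls out the target word directly. A quick check with an invented record:

from jinja2 import Template

doc = {"sentence1": "The bank was closed.", "start1": 4, "end1": 8}  # invented WiC-style fields
Template("{{sentence1[start1:end1]}}").render(**doc)  # -> "bank"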
lm_eval/tasks/super_glue/wic/utils.py (deleted)

def doc_to_text(doc):
    return (
        "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
        " two sentences above?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
            doc["sentence1"][doc["start1"] : doc["end1"]],
        )
    )


def doc_to_target(doc):
    return " {}".format({0: "no", 1: "yes"}[doc["label"]])
lm_eval/tasks/super_glue/wsc/default.yaml (new file)

group:
  - super-glue-lm-eval-v1
task: wsc
dataset_path: super_glue
dataset_name: wsc
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function preprocess_wsc.default_doc_to_text
doc_to_target: label
doc_to_choice: ['no', 'yes']
metric_list:
  - metric: acc
lm_eval/tasks/super_glue/wsc/preprocess_wsc.py

 import re

 from lm_eval.utils import general_detokenize


-def doc_to_text(x):
+def t5_prompt_doc_to_text(x):
     def _mark_span(text, span_str, span_idx, mark):
         pattern_tmpl = r"^((?:\S+\s){N})(W)"
         pattern = re.sub("N", str(span_idx), pattern_tmpl)
...
@@ -15,3 +16,19 @@ def doc_to_text(x):
     text = _mark_span(text, x["span2_text"], span2_index, "#")
     return text
+
+
+def default_doc_to_text(doc):
+    raw_passage = doc["text"]
+    # NOTE: HuggingFace span indices are word-based not character-based.
+    pre = " ".join(raw_passage.split()[: doc["span2_index"]])
+    post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
+    passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post)
+    noun = doc["span1_text"]
+    pronoun = doc["span2_text"]
+    text = (
+        f"Passage: {passage}\n"
+        + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
+        + "Answer:"
+    )
+    return text
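A usage sketch for the new default_doc_to_text, with an invented WSC-style record; note that span2_index counts words, not characters, per the NOTE above:

doc = {
    "text": "Mark told Pete many lies about himself, which Pete included in his book. He should have been more skeptical.",
    "span1_text": "Mark",
    "span2_index": 13,  # "He" is the 14th word (0-based index 13)
    "span2_text": "He",
}
print(default_doc_to_text(doc))
# Passage: Mark told Pete many lies about himself, which Pete included in his book. *He* should have been more skeptical.
# Question: In the passage above, does the pronoun "*He*" refer to "*Mark*"?
# Answer: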