Merge remote-tracking branch 'upstream/big-refactor' into big-refactor-autobatching

6a6a0ebb · Benjamin Fattori · e4acfcaa · 2820042d · 6a6a0ebb · 6a6a0ebb
Commit 6a6a0ebb authored Jul 22, 2023 by Benjamin Fattori
20 changed files
--- a/lm_eval/tasks/hellaswag/hellaswag.yaml
+++ b/lm_eval/tasks/hellaswag/hellaswag.yaml
@@ -7,10 +7,9 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: null
-template_aliases: "{% set gold = label | int %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list %}"
 doc_to_text: "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}"
-doc_to_target: "{{answer_choices[gold]}}"
+doc_to_target: "{{label}}"
-gold_alias: "{{gold}}"
+doc_to_choice: "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list}}"
 metric_list:
  - metric: acc
    aggregation: mean

--- a/lm_eval/tasks/hendrycks_ethics/commonsense.yaml
+++ b/lm_eval/tasks/hendrycks_ethics/commonsense.yaml
@@ -6,9 +6,8 @@ dataset_name: commonsense
 output_type: multiple_choice
 training_split: train
 test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: "{{input}}\nQuestion: Is this wrong?\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
+doc_to_target: label
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_choice: ['no', 'yes']
 metric_list:
  - metric: acc
--- a/lm_eval/tasks/hendrycks_ethics/deontology.yaml
+++ b/lm_eval/tasks/hendrycks_ethics/deontology.yaml
-group:
+include: commonsense.yaml
-  - hendrycks_ethics
 task: ethics_deontology
 dataset_path: hails/hendrycks_ethics
 dataset_name: deontology
-output_type: multiple_choice
-training_split: train
-test_split: test
-template_aliases: "{% set answer_choices = ['unreasonable', 'reasonable'] %}{% if excuse is not defined %}{% set excuse = '' %}{% endif %}"
 doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}} {{excuse.rstrip()}}\"\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
+doc_to_target: label
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_choice: ['unreasonable', 'reasonable']
-metric_list:
-  - metric: acc
 # TODO: implement exact-match metric for this subset
--- a/lm_eval/tasks/hendrycks_ethics/justice.yaml
+++ b/lm_eval/tasks/hendrycks_ethics/justice.yaml
@@ -4,5 +4,5 @@ group:
 task: ethics_justice
 dataset_name: justice
 output_type: multiple_choice
+doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:"
 # TODO: impl. exact match for this and deontology
--- a/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
+++ b/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
+include: commonsense.yaml
 group:
  - hendrycks_ethics
 task: ethics_utilitarianism
@@ -6,9 +7,8 @@ dataset_name: utilitarianism
 output_type: multiple_choice
 training_split: train
 test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: !function utils.doc_to_text
 doc_to_target: !function utils.doc_to_target
-gold_alias: !function utils.gold_alias
+doc_to_choice: ['no', 'yes']
 metric_list:
  - metric: acc
--- a/lm_eval/tasks/hendrycks_ethics/utils.py
+++ b/lm_eval/tasks/hendrycks_ethics/utils.py
@@ -15,23 +15,11 @@ def _preproc_doc(doc):
    return doc
-def _yesno(x):
-    if x:
-        return "yes"
-    else:
-        return "no"
 def doc_to_text(doc):
    doc = _preproc_doc(doc)
    return f"Scenario 1: {doc['scenarios'][0]}\nScenario 2: {doc['scenarios'][1]}\nQuestion: Is Scenario 1 preferable?\nAnswer:"
 def doc_to_target(doc):
-    doc = _preproc_doc(doc)
-    return _yesno(doc["label"])
-def gold_alias(doc):
    doc = _preproc_doc(doc)
    return doc["label"]
--- a/lm_eval/tasks/hendrycks_ethics/virtue.yaml
+++ b/lm_eval/tasks/hendrycks_ethics/virtue.yaml
+include: commonsense.yaml
 group:
  - hendrycks_ethics
 task: ethics_virtue
-dataset_path: hails/hendrycks_ethics
 dataset_name: virtue
-output_type: multiple_choice
-training_split: train
-test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sentence exhibit the trait \"{{trait}}\"?\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
+doc_to_target: label
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_choice: ['no', 'yes']
-metric_list:
-  - metric: acc
--- a/lm_eval/tasks/mathqa/mathqa.yaml
+++ b/lm_eval/tasks/mathqa/mathqa.yaml
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-create_choices: !function utils.create_choices # create list of answer choices
 doc_to_text: "Question: {{Problem}}\nAnswer:"
-doc_to_target: !function utils.doc_to_target
+doc_to_target: "{{['a', 'b', 'c', 'd', 'e'].index(correct)}}"
-gold_alias: "{{['a', 'b', 'c', 'd', 'e'].index(correct)}}" # this will be cast to an int.
+doc_to_choice: !function utils.doc_to_choice
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{Problem}}\nAnswer:"
 metric_list:
  - metric: acc
    aggregation: mean

--- a/lm_eval/tasks/mathqa/utils.py
+++ b/lm_eval/tasks/mathqa/utils.py
 import re
-def create_choices(doc):
+def doc_to_choice(doc):
    choices = [
        c[4:].rstrip(" ,")
        for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"])
    ]
    return choices
-def doc_to_target(doc):
-    choices = create_choices(doc)
-    return choices[["a", "b", "c", "d", "e"].index(doc["correct"])]
--- a/lm_eval/tasks/openbookqa/openbookqa.yaml
+++ b/lm_eval/tasks/openbookqa/openbookqa.yaml
@@ -7,11 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey.lstrip()) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
+doc_to_text: question_stem
-doc_to_text: "{{question_stem}}"
+doc_to_target: "{{choices.label.index(answerKey.lstrip())}}"
-doc_to_target: "{{gold}}" # this will be cast to an int.
+doc_to_choice: "{{choices.text}}"
 should_decontaminate: true
-doc_to_decontamination_query: "{{question_stem}}"
+doc_to_decontamination_query: question_stem
 metric_list:
  - metric: acc
    aggregation: mean

--- a/lm_eval/tasks/piqa/piqa.yaml
+++ b/lm_eval/tasks/piqa/piqa.yaml
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: null
-template_aliases: "{% set question = goal %}{% set answer_choices = [sol1, sol2] %}{% set gold = label %}" # set the list of possible answer choices, and set what this doc's gold label idx is
+doc_to_text: "Question: {{goal}}\nAnswer:"
-doc_to_text: "Question: {{question}}\nAnswer:"
+doc_to_target: label
-doc_to_target: "{{answer_choices[gold]}}"
+doc_to_choice: "{{[sol1, sol2]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+should_decontaminate: true
+doc_to_decontamination_query: goal
 metric_list:
  - metric: acc
    aggregation: mean

--- a/lm_eval/tasks/prost/corypaik_prost.yaml
+++ b/lm_eval/tasks/prost/corypaik_prost.yaml
@@ -5,10 +5,9 @@ dataset_path: corypaik/prost
 dataset_name: null
 output_type: multiple_choice
 test_split: test
-template_aliases: "{% set answer_choices = [A, B, C, D] %}{% set gold = label %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
 doc_to_text: "{{context}}\nQuestion: {{ex_question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
+doc_to_target: label
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_choice: "{{[A, B, C, D]}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{context}}\nQuestion: {{ex_question}}\nAnswer:"
 metric_list:

--- a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py
+++ b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py
@@ -4,13 +4,11 @@ def doc_to_text(doc):
        ctxs, doc["question"], doc["final_decision"]
    )
 def doc_to_target(doc):
    return " {}".format(doc["final_decision"])
 def gold_alias(doc):
-    dict_to_label = {
+    dict_to_label = {"yes": 0, "no": 1, "maybe": 2}
-        'yes': 0,
+    return dict_to_label[doc["final_decision"]]
-        'no': 1,
-        'maybe': 2
-    }
-    return dict_to_label[doc["final_decision"]]
\ No newline at end of file
--- a/lm_eval/tasks/pubmedqa/pubmedqa.yaml
+++ b/lm_eval/tasks/pubmedqa/pubmedqa.yaml
@@ -7,11 +7,10 @@ output_type: multiple_choice
 training_split: null
 validation_split: null
 test_split: train
-template_aliases: "{% set answer_choices = ['yes', 'no', 'maybe'] %}{% set gold = final_decision %}"
 doc_to_text: !function preprocess_pubmedqa.doc_to_text
-doc_to_target: !function preprocess_pubmedqa.doc_to_target
+doc_to_target: final_decision
-gold_alias: !function preprocess_pubmedqa.gold_alias
+doc_to_choice: ["yes", "no", "maybe"]
 metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
\ No newline at end of file
--- a/lm_eval/tasks/qa4mre/qa4mre_2011.yaml
+++ b/lm_eval/tasks/qa4mre/qa4mre_2011.yaml
@@ -5,10 +5,10 @@ dataset_path: qa4mre
 dataset_name: 2011.main.EN
 output_type: multiple_choice
 test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
+# doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
+doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
+doc_to_target: "{{correct_answer_id|int - 1}}"
-gold_alias: !function preprocess_qa4mre.qa4mre_process
+doc_to_choice: "{{answer_options.answer_str}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
 metric_list:

--- a/lm_eval/tasks/qa4mre/qa4mre_2012.yaml
+++ b/lm_eval/tasks/qa4mre/qa4mre_2012.yaml
-group:
+include: qa4mre_2011.yaml
-  - multiple_choice
 task: qa4mre_2012
 dataset_path: qa4mre
 dataset_name: 2012.main.EN
-output_type: multiple_choice
-test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
-gold_alias: !function preprocess_qa4mre.qa4mre_process
-should_decontaminate: true
-doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/qa4mre/qa4mre_2013.yaml
+++ b/lm_eval/tasks/qa4mre/qa4mre_2013.yaml
-group:
+include: qa4mre_2011.yaml
-  - multiple_choice
 task: qa4mre_2013
 dataset_path: qa4mre
 dataset_name: 2013.main.EN
-output_type: multiple_choice
-test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
-gold_alias: !function preprocess_qa4mre.qa4mre_process
-should_decontaminate: true
-doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/race/preprocess_race.py
+++ b/lm_eval/tasks/race/preprocess_race.py
-import ast 
+import ast
 def process_ast(string):
    return ast.literal_eval(string)
 def last_problem(doc):
    return process_ast(doc["problems"])[-1]
 def get_answer_option(problem):
    letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
    answer = letter_to_num[problem["answer"]]
    return problem["options"][answer]
-def create_choices(doc):
+def doc_to_choice(doc):
    problem = last_problem(doc)
    choices = [problem["options"][i] for i in range(4)]
    return choices
 def doc_to_text(doc):
    text = "Article: " + doc["article"] + "\n\n"
    for problem in process_ast(doc["problems"])[:-1]:
        if problem["question"][-6:] == "  _  .":
-            text += (
+            text += problem["question"][-5:] + get_answer_option(problem) + "\n"
-                problem["question"][-5:] + get_answer_option(problem) + "\n"
-            )
        else:
            question = "Question: " + problem["question"] + "\n"
            answer = "Answer: " + get_answer_option(problem) + "\n"
@@ -30,6 +33,7 @@ def doc_to_text(doc):
    text += last_problem(doc)["question"]
    return text
 def doc_to_target(doc):
    letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
    answer = letter_to_num[last_problem(doc)["answer"]]

--- a/lm_eval/tasks/race/race.yaml
+++ b/lm_eval/tasks/race/race.yaml
@@ -5,9 +5,9 @@ dataset_path: EleutherAI/race
 dataset_name: high
 output_type: multiple_choice
 test_split: test
-create_choices: !function preprocess_race.create_choices
 doc_to_text: !function preprocess_race.doc_to_text
 doc_to_target: !function preprocess_race.doc_to_target
+doc_to_choice: !function preprocess_race.doc_to_choice
 metric_list:
  - metric: acc
    aggregation: mean

--- a/lm_eval/tasks/sciq/sciq.yaml
+++ b/lm_eval/tasks/sciq/sciq.yaml
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = [distractor1, distractor2, distractor3, correct_answer] %}{% set gold = 3 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
 doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{correct_answer}}"
+doc_to_target: 3
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
+should_decontaminate: true
+doc_to_decontamination_query: "{{support}} {{question}}"
 metric_list:
  - metric: acc
    aggregation: mean