"docs/vscode:/vscode.git/clone" did not exist on "8738f3be408c9b3485805bd1d30615498025b45c"
Commit 66bb89e5 authored by FarzanehNakhaee's avatar FarzanehNakhaee
Browse files

Merge branch 'big-refactor' into add-prost-config

parents e8bb77db 070b6b9c
group:
- multiple_choice
task: hellaswag
dataset_path: hellaswag
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
template_aliases: "{% set gold = label | int %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list %}"
doc_to_text: "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
- metric: acc_norm
  aggregation: mean
  higher_is_better: true
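The template_aliases and doc_to_text entries above are plain Jinja2, apart from regex_replace, which is not a Jinja2 builtin. A minimal sketch of rendering the prompt outside the harness, assuming the harness registers an equivalent custom filter (the record below is made up):

```python
import re
from jinja2 import Environment

# `regex_replace` is not a Jinja2 builtin; the harness is assumed to
# register an equivalent custom filter, so we do the same here.
env = Environment()
env.filters["regex_replace"] = lambda s, pattern, repl="": re.sub(pattern, repl, s)

doc_to_text = (
    r"{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}"
    r"{{text|trim|replace(' [title]', '. ')|regex_replace('\[.*?\]', '')|replace('  ', ' ')}}"
)

doc = {  # hypothetical HellaSwag-shaped record
    "activity_label": "Removing ice from car",
    "ctx_a": "Then, the man writes over the snow covering the window of a car.",
    "ctx_b": "a man continues to scrape [title] the windshield.",
}
print(env.from_string(doc_to_text).render(**doc))
```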
@@ -16,6 +16,6 @@ metric_list:
- metric: perplexity
  aggregation: perplexity
  higher_is_better: false
-- metric: accuracy
+- metric: acc
  aggregation: mean
  higher_is_better: true
@@ -17,6 +17,6 @@ metric_list:
- metric: perplexity
  aggregation: perplexity
  higher_is_better: false
-- metric: accuracy
+- metric: acc
  aggregation: mean
  higher_is_better: true
@@ -15,6 +15,6 @@ metric_list:
- metric: perplexity
  aggregation: perplexity
  higher_is_better: false
-- metric: accuracy
+- metric: acc
  aggregation: mean
  higher_is_better: true
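These hunks standardize the metric name accuracy to acc. For multiple-choice tasks the harness also reports acc_norm alongside acc (see the hellaswag list above); the sketch below illustrates the difference under the usual byte-length-normalization reading, and is not the harness's actual scoring code:

```python
# `acc` takes the argmax of the raw per-choice loglikelihoods, while
# `acc_norm` normalizes each loglikelihood by the choice's byte length first.
def pick_choice(loglikelihoods, choices, normalize=False):
    scores = [
        ll / len(choice.encode("utf-8")) if normalize else ll
        for ll, choice in zip(loglikelihoods, choices)
    ]
    return max(range(len(scores)), key=scores.__getitem__)

# Longer choices accumulate more negative loglikelihood, so normalization
# can flip the prediction:
lls = [-9.0, -12.0]
choices = ["no", "not at this time"]
assert pick_choice(lls, choices) == 0
assert pick_choice(lls, choices, normalize=True) == 1
```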
group:
- multiple_choice
task: openbookqa
dataset_path: openbookqa
dataset_name: main
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey.lstrip()) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
doc_to_text: "{{question_stem}}"
doc_to_target: "{{gold}}" # this will be cast to an int.
should_decontaminate: true
doc_to_decontamination_query: "{{question_stem}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
- metric: acc_norm
  aggregation: mean
  higher_is_better: true
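Unlike hellaswag, openbookqa stores the gold answer as a letter key rather than an index, so the template derives the index by lookup. The same computation written out in Python, against a hypothetical record following the HF openbookqa schema:

```python
# Hypothetical openbookqa-shaped record; field names match the HF schema.
doc = {
    "question_stem": "Which of these would let the most heat travel through?",
    "choices": {
        "text": ["a new pair of jeans", "a steel spoon", "cotton candy", "a cotton hat"],
        "label": ["A", "B", "C", "D"],
    },
    "answerKey": "B",
}

# What the template_aliases line computes:
answer_choices = doc["choices"]["text"]
gold = doc["choices"]["label"].index(doc["answerKey"].lstrip())  # -> 1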
@@ -9,7 +9,8 @@ validation_split: validation
test_split: null
template_aliases: "{% set question = goal %}{% set answer_choices = [sol1, sol2] %}{% set gold = label %}" # set the list of possible answer choices, and set which index is this doc's gold label
doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{answer_choices[gold]}}"
+gold_alias: "{{gold}}" # this will be cast to an int.
metric_list:
- metric: acc
  aggregation: mean
...
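This hunk shows the pattern the refactor applies across these tasks: doc_to_target switches from the integer index to the rendered answer string, and the new gold_alias field keeps the index for scoring. Spelled out for a made-up PIQA-shaped record:

```python
# Made-up PIQA-shaped record; names follow the template_aliases line above.
doc = {"goal": "how do you open a jar?", "sol1": "twist the lid off", "sol2": "push the lid up", "label": 0}

answer_choices = [doc["sol1"], doc["sol2"]]
gold = doc["label"]

target_text = answer_choices[gold]  # what doc_to_target now renders ("twist the lid off")
gold_index = int(gold)              # what gold_alias preserves (0), cast to int downstream
```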
@@ -9,7 +9,7 @@ validation_split: validation
test_split: test
template_aliases: "{% set answer_choices = [distractor1, distractor2, distractor3, correct_answer] %}{% set gold = 3 %}" # set the list of possible answer choices, and set which index is this doc's gold label
doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: " {{correct_answer}}"
+doc_to_target: "{{correct_answer}}"
gold_alias: "{{gold}}" # this will be cast to an int.
metric_list:
- metric: acc
...
group:
- super-glue-lm-eval-v1
task: "default"
task: "boolq"
dataset_path: super_glue
dataset_name: boolq
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
doc_to_target: "{{label}}" # this will be cast to an int.
doc_to_target: "{{answer_choices[label]}}"
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
- metric: exact_match
  aggregation: mean
  higher_is_better: true
  ignore_case: true
  ignore_punctuation: true
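A sketch of how the ignore_case and ignore_punctuation flags on exact_match could behave; the semantics here are assumed, not the harness's actual implementation:

```python
import string

def exact_match(pred: str, gold: str, ignore_case=True, ignore_punctuation=True) -> float:
    if ignore_case:
        pred, gold = pred.lower(), gold.lower()
    if ignore_punctuation:
        strip = str.maketrans("", "", string.punctuation)
        pred, gold = pred.translate(strip), gold.translate(strip)
    return float(pred.strip() == gold.strip())

assert exact_match("Yes.", "yes") == 1.0
```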
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the previous passage"
use_prompt: "promptsource:based on the previous passage"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the following passage"
use_prompt: "promptsource:based on the following passage"
group:
-- super-glue-promptsource
-task: "GPT-3 Style"
+- super-glue-lm-eval-v1-seq2seq
+task: "boolq-seq2seq"
dataset_path: super_glue
dataset_name: boolq
output_type: greedy_until
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3 Style"
doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
doc_to_target: "{{answer_choices[label]}}"
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
- metric: exact_match
  aggregation: mean
...
group:
- super-glue-lm-eval-v1
task: "default"
task: "cb"
dataset_path: super_glue
dataset_name: cb
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
doc_to_target: "{{label}}" # this will be cast to an int.
doc_to_target: "{{answer_choices[label]}}"
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['True', 'False', 'Neither'] %}"
metric_list:
- metric: acc
...
group:
- super-glue-promptsource
task: "GPT-3 style"
dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3 style"
metric_list:
- metric: exact_match
  aggregation: mean
  higher_is_better: true
  ignore_case: true
  ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "MNLI crowdsource"
use_prompt: "promptsource:MNLI crowdsource"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the previous passage"
use_prompt: "promptsource:based on the previous passage"
group:
- super-glue-t5-prompt
-task: t5-prompt
+reference: "From Raffel et al. 2019"
+task: super_glue-cb-t5-prompt
dataset_path: super_glue
dataset_name: cb
training_split: train
...
group:
- super-glue-lm-eval-v1
task: "copa"
dataset_path: super_glue
dataset_name: copa
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = [{{doc.choice1}}, 'b'] %} {{answer_choices}}"
metric_list:
- metric: acc
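The copa config delegates prompt construction to Python via !function. A plausible shape for the backing utils.py, with assumed signatures (the harness is taken to pass the raw dataset row; these bodies are an illustration, not the repository's actual utils.py):

```python
def doc_to_text(doc: dict) -> str:
    # COPA asks for either the cause or the effect of the premise.
    connector = {"cause": "because", "effect": "therefore"}[doc["question"]]
    return doc["premise"].strip()[:-1] + f" {connector}"

def doc_to_target(doc: dict) -> str:
    correct = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
    # Lowercase the first letter: the choice continues the premise mid-sentence.
    return " " + correct[0].lower() + correct[1:]
```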
group:
- super-glue-promptsource
task: "C1 or C2? premise, so/because…"
dataset_path: super_glue
dataset_name: copa
training_split: train
validation_split: validation
use_prompt: "promptsource:C1 or C2? premise, so/because…"
metric_list:
- metric: exact_match
  aggregation: mean
  higher_is_better: true
  ignore_case: true
  ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "best_option"
use_prompt: "promptsource:best_option"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "cause_effect"
use_prompt: "promptsource:cause_effect"