Commit 16c4afc6 authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into toxicity

parents 7b376ae1 176d5a26
# Task config: EleutherAI arithmetic, 4-digit subtraction (4ds).
# Inherits shared settings from arithmetic_1dc.yaml; keys below override it.
group:
- arithmetic
include: arithmetic_1dc.yaml
task: arithmetic_4ds
dataset_path: EleutherAI/arithmetic
dataset_name: arithmetic_4ds
output_type: loglikelihood
validation_split: validation
# No test split in this dataset; evaluation runs on validation only.
test_split: null
template_aliases: ""
# Prompt is the raw problem context; target is the expected completion string.
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
# Task config: EleutherAI arithmetic, 5-digit addition (5da).
# Inherits shared settings from arithmetic_1dc.yaml; keys below override it.
group:
- arithmetic
include: arithmetic_1dc.yaml
task: arithmetic_5da
dataset_path: EleutherAI/arithmetic
dataset_name: arithmetic_5da
output_type: loglikelihood
validation_split: validation
# No test split in this dataset; evaluation runs on validation only.
test_split: null
template_aliases: ""
# Prompt is the raw problem context; target is the expected completion string.
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
# Task config: EleutherAI arithmetic, 5-digit subtraction (5ds).
# Inherits shared settings from arithmetic_1dc.yaml; keys below override it.
group:
- arithmetic
include: arithmetic_1dc.yaml
task: arithmetic_5ds
dataset_path: EleutherAI/arithmetic
dataset_name: arithmetic_5ds
output_type: loglikelihood
validation_split: validation
# No test split in this dataset; evaluation runs on validation only.
test_split: null
template_aliases: ""
# Prompt is the raw problem context; target is the expected completion string.
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
# Group config: "pythia" evaluation suite — a bundle of existing task names
# run together under one group label.
group: pythia
task:
- lambada_openai
- wikitext
- piqa
- sciq
- wsc
- winogrande
# Wildcard entry: expands to every registered task whose name matches
# the "arc_*" pattern (glob is resolved by the harness, not by YAML).
- arc_*
# Tasks below are intentionally disabled; kept for reference.
# - logiqa
# - blimp_*
# - hendrycksTest*
# Group config: "t0_eval" — inline task definitions (rather than task-name
# references) evaluated with promptsource prompt templates.
# NOTE(review): every entry except super_glue/cb is currently commented out;
# only the cb (CommitmentBank) NLI task is active in this group.
group: t0_eval
task:
# # Coreference Resolution
# - dataset_path: super_glue
#   dataset_name: wsc.fixed
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
# # Coreference Resolution
# - dataset_path: winogrande
#   dataset_name: winogrande_xl
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
  # Natural Language Inference — the only active task in this group.
  # "promptsource:*" applies every promptsource template for this dataset.
  - dataset_path: super_glue
    dataset_name: cb
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Natural Language Inference
  # - dataset_path: super_glue
  #   dataset_name: rte
  #   use_prompt: promptsource:*
  #   training_split: train
  #   validation_split: validation
  #   metric_list:
  #     - metric: exact_match
  #       aggregation: mean
  #       higher_is_better: true
  #       ignore_case: true
  #       ignore_punctuation: true
# # Natural Language Inference
# # - dataset_path: anli
# #   use_prompt: promptsource:*
# #   training_split: train_r1
# #   validation_split: dev_r1
# # Sentence Completion
# - dataset_path: super_glue
#   dataset_name: copa
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
# # Natural Language Inference
# - dataset_path: hellaswag
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
# # Word Sense Disambiguation
# - dataset_path: super_glue
#   dataset_name: wic
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
group:
- hendrycks_ethics
task: ethics_cm
# NOTE(review): this key was duplicated (hails/hendrycks_ethics vs
# EleutherAI/hendrycks_ethics) — a merge artifact; most YAML parsers silently
# take the last value, so the effective path is kept below. Sibling ethics
# configs in this commit still use hails/hendrycks_ethics — confirm which
# namespace is canonical and align them.
dataset_path: EleutherAI/hendrycks_ethics
dataset_name: commonsense
output_type: multiple_choice
training_split: train
......
include: commonsense.yaml
task: ethics_deontology
dataset_path: hails/hendrycks_ethics
dataset_name: deontology
doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}} {{excuse.rstrip()}}\"\nAnswer:"
doc_to_target: label
......
......@@ -3,6 +3,5 @@ group:
- hendrycks_ethics
task: ethics_justice
dataset_name: justice
output_type: multiple_choice
doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:"
# TODO: impl. exact match for this and deontology
......@@ -2,11 +2,7 @@ include: commonsense.yaml
group:
- hendrycks_ethics
task: ethics_utilitarianism
dataset_path: hails/hendrycks_ethics
dataset_name: utilitarianism
output_type: multiple_choice
training_split: train
test_split: test
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: ['no', 'yes']
......
......@@ -7,7 +7,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name: default
output_type: loglikelihood
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -8,7 +8,6 @@ dataset_name: null
output_type: loglikelihood
validation_split: validation
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -6,7 +6,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name: default
output_type: loglikelihood
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}} ____. ->"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -7,7 +7,6 @@ dataset_name: null
output_type: loglikelihood
validation_split: validation
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}} ____. ->"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -7,7 +7,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name: en
output_type: loglikelihood
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -3,11 +3,10 @@ group:
- perplexity
- loglikelihood_rolling
task: pile_arxiv
# NOTE(review): this key was duplicated (EleutherAI/the_pile vs
# EleutherAI/pile) — a merge artifact; parsers take the last value, so the
# effective path is kept below. Confirm the canonical dataset namespace.
dataset_path: EleutherAI/pile
dataset_name: pile_arxiv
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
......
group:
- super-glue-lm-eval-v1
# Duplicate "task" key removed (both spellings named the same task).
task: boolq
dataset_path: super_glue
dataset_name: boolq
output_type: multiple_choice
......
......@@ -5,11 +5,15 @@ dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "cb hypothesis: {{hypothesis}} premise {{premise}}"
# Duplicate "doc_to_target" key removed — kept the label-index form, which
# pairs with the doc_to_choice list below (the old inline Jinja answer list
# duplicated the same choices).
doc_to_target: label
doc_to_choice: ['entailment', 'contradiction', 'neutral']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
- metric: f1
aggregation: !function "aggregate.cb_multi_fi"
......@@ -5,8 +5,10 @@ dataset_path: super_glue
dataset_name: copa
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} question: {{question}}"
# Duplicate "doc_to_target" key removed — kept the label-index form, which
# pairs with the doc_to_choice list below (the old inline Jinja answer list
# duplicated the same choices).
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
- metric: exact_match
aggregation: mean
......
# Task config: super_glue/multirc evaluated with a single named promptsource
# template ("I was going to say…"); the task name mirrors the template name.
group:
- super-glue-promptsource
task: "I was going to say…"
dataset_path: super_glue
dataset_name: multirc
training_split: train
validation_split: validation
# Selects exactly one promptsource template by name (not the "*" wildcard).
use_prompt: "promptsource:I was going to say…"
metric_list:
- metric: exact_match
  aggregation: mean
  higher_is_better: true
  ignore_case: true
  ignore_punctuation: true
# Variant task: inherits everything from promptsource-00.yaml and overrides
# only the task name and the promptsource template selection.
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "Would it be good to answer…"
use_prompt: "promptsource:Would it be good to answer…"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment