Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into mmlu_subgroups
f77a3a27 · lintangsutawika · 109ed1c7 · f8342178 · f77a3a27 · f77a3a27
Commit f77a3a27 authored Oct 17, 2023 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/logiqa2/logieval.yaml
+++ b/lm_eval/tasks/logiqa2/logieval.yaml
 task: logieval
 dataset_path: baber/logiqa2
 dataset_name: logieval
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 # Instructions + {content}

--- a/lm_eval/tasks/mgsm/direct/direct_yaml
+++ b/lm_eval/tasks/mgsm/direct/direct_yaml
@@ -4,7 +4,7 @@
 group: mgsm_direct
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/mgsm/en_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/en_cot/cot_yaml
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/mgsm/native_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/native_cot/cot_yaml
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/minerva_math/README.md
+++ b/lm_eval/tasks/minerva_math/README.md
@@ -37,7 +37,7 @@ Eprint = {arXiv:2206.14858},
 #### Groups
 - `math_word_problems`
-- `greedy_until`
+- `generate_until`
 #### Tasks

--- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
+++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
@@ -4,7 +4,7 @@ task: minerva_math_algebra
 dataset_path: EleutherAI/hendrycks_math
 process_docs: !function utils.process_docs
 dataset_name: algebra
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 doc_to_text:  !function utils.doc_to_text

--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
@@ -2,7 +2,7 @@ group: mmlu_flan_cot_fewshot
 dataset_path: cais/mmlu
 validation_split: validation
 fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 filter_list:

--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
@@ -2,7 +2,7 @@ group: mmlu_flan_cot_zeroshot
 dataset_path: cais/mmlu
 validation_split: validation
 fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 filter_list:

--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -2,7 +2,7 @@ group: mmlu_flan_n_shot_generative
 dataset_path: cais/mmlu
 test_split: test
 fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 generation_kwargs:

--- a/lm_eval/tasks/nq_open/nq_open.yaml
+++ b/lm_eval/tasks/nq_open/nq_open.yaml
 task: nq_open
 dataset_path: nq_open
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 description: "Answer these questions:\n"

--- a/lm_eval/tasks/polemo2/polemo2_in.yaml
+++ b/lm_eval/tasks/polemo2/polemo2_in.yaml
@@ -3,7 +3,7 @@ group:
 task: polemo2_in
 dataset_path: allegro/klej-polemo2-in
 dataset_name: klej-polemo2-in
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 test_split: test

--- a/lm_eval/tasks/qasper/freeform.yaml
+++ b/lm_eval/tasks/qasper/freeform.yaml
 group: qasper
 task: qasper_freeform
 dataset_path: qasper
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 process_docs: !function utils.process_docs_freeform

--- a/lm_eval/tasks/squadv2/README.md
+++ b/lm_eval/tasks/squadv2/README.md
@@ -2,25 +2,44 @@
 ### Paper
-Title: `paper title goes here`
+Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD`
-Abstract: `link to paper PDF or arXiv abstract goes here`
+Abstract: https://arxiv.org/abs/1806.03822
-`Short description of paper / benchmark goes here:`
+Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
+consisting of questions posed by crowdworkers on a set of Wikipedia articles,
+where the answer to every question is a segment of text, or span, from the
+corresponding reading passage, or the question might be unanswerable.
+SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
+questions written adversarially by crowdworkers to look similar to answerable ones.
+To do well on SQuAD2.0, systems must not only answer questions when possible, but
+also determine when no answer is supported by the paragraph and abstain from answering.
-Homepage: `homepage to the benchmark's website goes here, if applicable`
+Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 ### Citation
 ```
-BibTeX-formatted citation goes here
+@misc{rajpurkar2018know,
+    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
+    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
+    year={2018},
+    eprint={1806.03822},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
 ```
-### Subtasks
+### Groups and Tasks
-List or describe tasks defined in this folder, and their names here:
+#### Groups
-* `task_name`: `1-sentence description of what this particular task does`
-* `task_name2`: .....
+* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood`
+#### Tasks
+* `squadv2`: `Default squadv2 task`
+* `squadv2_noans_loglikelihood`: `Additional task to acquire the probability of model predicting there is no answer`
 ### Checklist

--- a/lm_eval/tasks/squadv2/_template_yaml
+++ b/lm_eval/tasks/squadv2/_template_yaml
+dataset_path: squad_v2
+training_split: train
+validation_split: validation
+doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
+doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
+target_delimiter: ""
+should_decontaminate: true
+doc_to_decontamination_query: context
--- a/lm_eval/tasks/squadv2/default.yaml
+++ b/lm_eval/tasks/squadv2/default.yaml
+include: _template_yaml
 task: squadv2
-dataset_path: squad_v2
+output_type: generate_until
-output_type: greedy_until
-training_split: train
-validation_split: validation
-doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
-doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
-target_delimiter: ""
-should_decontaminate: true
-doc_to_decontamination_query: context
 generation_kwargs:
  until:
    - "\n"
-# filter_list:
-#   - name: remove_whitespace
-#     filter:
-#       - function: remove_whitespace
-#       - function: take_first
 metric_list:
  - metric: !function utils.exact
    aggregation: mean

--- a/lm_eval/tasks/squadv2/no_ans.yaml
+++ b/lm_eval/tasks/squadv2/no_ans.yaml
-include: default.yaml
+include: _template_yaml
 task: squadv2_noans_loglikelihood
-dataset_path: squad_v2
 output_type: loglikelihood
-training_split: train
-validation_split: validation
 doc_to_target: " unanswerable"
 metric_list:
  - metric: perplexity
--- a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml
+++ b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml
@@ -3,7 +3,7 @@ group:
 task: "boolq-seq2seq"
 dataset_path: super_glue
 dataset_name: boolq
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"

--- a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: boolq
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "boolq passage: {{passage}} question: {{question}}"
 doc_to_target: label
 doc_to_choice: ['False', 'True']

--- a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: cb
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
 doc_to_target: label
 doc_to_choice: ['entailment', 'contradiction', 'neutral']

--- a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: copa
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
 doc_to_target: label
 doc_to_choice: ['choice1', 'choice2']