Merge branch 'big-refactor' into fix-unittests

de71ad92 · Lintang Sutawika · GitHub · 09d20bfa · 73c80915 · de71ad92
Unverified Commit de71ad92 authored Oct 17, 2023 by Lintang Sutawika Committed by GitHub Oct 17, 2023
20 changed files
--- a/lm_eval/tasks/code_x_glue/code-text/python.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/python.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_python
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
  num_beams: 10
  max_length: 128

--- a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_ruby
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
  num_beams: 10
  max_length: 128

--- a/lm_eval/tasks/coqa/default.yaml
+++ b/lm_eval/tasks/coqa/default.yaml
 task: coqa
 dataset_path: EleutherAI/coqa
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text

--- a/lm_eval/tasks/drop/default.yaml
+++ b/lm_eval/tasks/drop/default.yaml
 task: drop
 dataset_path: EleutherAI/drop
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 process_docs: !function utils.process_docs

--- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
@@ -3,7 +3,7 @@ group:
 task: gsm8k_cot
 dataset_path: gsm8k
 dataset_name: main
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_text: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\n\
 Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\n\

--- a/lm_eval/tasks/gsm8k/gsm8k.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k.yaml
@@ -3,7 +3,7 @@ group:
 task: gsm8k_yaml
 dataset_path: gsm8k
 dataset_name: main
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 fewshot_split: train
 test_split: test

--- a/lm_eval/tasks/logiqa2/logieval.yaml
+++ b/lm_eval/tasks/logiqa2/logieval.yaml
 task: logieval
 dataset_path: baber/logiqa2
 dataset_name: logieval
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 # Instructions + {content}

--- a/lm_eval/tasks/mgsm/direct/direct_yaml
+++ b/lm_eval/tasks/mgsm/direct/direct_yaml
@@ -4,7 +4,7 @@
 group: mgsm_direct
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/mgsm/en_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/en_cot/cot_yaml
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/mgsm/native_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/native_cot/cot_yaml
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/minerva_math/README.md
+++ b/lm_eval/tasks/minerva_math/README.md
@@ -37,7 +37,7 @@ Eprint = {arXiv:2206.14858},
 #### Groups
 - `math_word_problems`
-- `greedy_until`
+- `generate_until`
 #### Tasks

--- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
+++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
@@ -4,7 +4,7 @@ task: minerva_math_algebra
 dataset_path: EleutherAI/hendrycks_math
 process_docs: !function utils.process_docs
 dataset_name: algebra
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 doc_to_text:  !function utils.doc_to_text

--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
@@ -2,7 +2,7 @@ group: mmlu_flan_cot_fewshot
 dataset_path: cais/mmlu
 validation_split: validation
 fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 filter_list:

--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml
@@ -2,7 +2,7 @@ group: mmlu_flan_cot_zeroshot
 dataset_path: cais/mmlu
 validation_split: validation
 fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 filter_list:

--- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
@@ -2,7 +2,7 @@ group: mmlu_flan_n_shot_generative
 dataset_path: cais/mmlu
 test_split: test
 fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 generation_kwargs:

--- a/lm_eval/tasks/nq_open/nq_open.yaml
+++ b/lm_eval/tasks/nq_open/nq_open.yaml
 task: nq_open
 dataset_path: nq_open
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 description: "Answer these questions:\n"

--- a/lm_eval/tasks/polemo2/polemo2_in.yaml
+++ b/lm_eval/tasks/polemo2/polemo2_in.yaml
@@ -3,7 +3,7 @@ group:
 task: polemo2_in
 dataset_path: allegro/klej-polemo2-in
 dataset_name: klej-polemo2-in
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 test_split: test

--- a/lm_eval/tasks/qasper/freeform.yaml
+++ b/lm_eval/tasks/qasper/freeform.yaml
 group: qasper
 task: qasper_freeform
 dataset_path: qasper
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 process_docs: !function utils.process_docs_freeform

--- a/lm_eval/tasks/squadv2/README.md
+++ b/lm_eval/tasks/squadv2/README.md
@@ -2,25 +2,44 @@
 ### Paper
-Title: `paper title goes here`
+Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD`
-Abstract: `link to paper PDF or arXiv abstract goes here`
+Abstract: https://arxiv.org/abs/1806.03822
-`Short description of paper / benchmark goes here:`
+Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
+consisting of questions posed by crowdworkers on a set of Wikipedia articles,
+where the answer to every question is a segment of text, or span, from the
+corresponding reading passage, or the question might be unanswerable.
+SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
+questions written adversarially by crowdworkers to look similar to answerable ones.
+To do well on SQuAD2.0, systems must not only answer questions when possible, but
+also determine when no answer is supported by the paragraph and abstain from answering.
-Homepage: `homepage to the benchmark's website goes here, if applicable`
+Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 ### Citation
 ```
-BibTeX-formatted citation goes here
+@misc{rajpurkar2018know,
+    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
+    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
+    year={2018},
+    eprint={1806.03822},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
 ```
-### Subtasks
+### Groups and Tasks
-List or describe tasks defined in this folder, and their names here:
+#### Groups
-* `task_name`: `1-sentence description of what this particular task does`
-* `task_name2`: .....
+* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood`
+#### Tasks
+* `squadv2`: `Default squadv2 task`
+* `squadv2_noans_loglikelihood`: `Additional task to acquire the probability of the model predicting that there is no answer`
 ### Checklist

--- a/lm_eval/tasks/squadv2/_template_yaml
+++ b/lm_eval/tasks/squadv2/_template_yaml
+dataset_path: squad_v2
+training_split: train
+validation_split: validation
+doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
+doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
+target_delimiter: ""
+should_decontaminate: true
+doc_to_decontamination_query: context