Merge branch 'big-refactor' into model-written-eval

26bc3eab · Lintang Sutawika · GitHub · 0d701496 · cf617ab1 · 26bc3eab
Unverified commit 26bc3eab, authored by Lintang Sutawika on Oct 19, 2023; committed by GitHub on Oct 19, 2023.
20 changed files
--- a/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
 # Generated by utils.py
 dataset_name: word_sorting_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_word_sorting_greedy_until
+include: ../generate_until_template_yaml
+task: bigbench_word_sorting_generate_until
--- a/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml
 # Generated by utils.py
 dataset_name: word_unscrambling_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_word_unscrambling_greedy_until
+include: ../generate_until_template_yaml
+task: bigbench_word_unscrambling_generate_until
--- a/lm_eval/tasks/bigbench/greedy_until_template_yaml
+++ b/lm_eval/tasks/bigbench/greedy_until_template_yaml
 group: bigbench
 dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed
-output_type: greedy_until
+output_type: generate_until
 dataset_kwargs:
  # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
  # subtask_name: null

--- a/lm_eval/tasks/code_x_glue/code-text/go.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_go
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
  num_beams: 10
  max_length: 128

--- a/lm_eval/tasks/code_x_glue/code-text/java.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_java
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
  num_beams: 10
  max_length: 128

--- a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_javascript
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
  num_beams: 10
  max_length: 128

--- a/lm_eval/tasks/code_x_glue/code-text/php.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/php.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_php
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
  num_beams: 10
  max_length: 128

--- a/lm_eval/tasks/code_x_glue/code-text/python.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/python.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_python
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
  num_beams: 10
  max_length: 128

--- a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_ruby
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
  num_beams: 10
  max_length: 128

--- a/lm_eval/tasks/coqa/default.yaml
+++ b/lm_eval/tasks/coqa/default.yaml
 task: coqa
 dataset_path: EleutherAI/coqa
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text

--- a/lm_eval/tasks/drop/default.yaml
+++ b/lm_eval/tasks/drop/default.yaml
 task: drop
 dataset_path: EleutherAI/drop
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 process_docs: !function utils.process_docs

--- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
@@ -3,7 +3,7 @@ group:
 task: gsm8k_cot
 dataset_path: gsm8k
 dataset_name: main
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_text: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\n\
 Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\n\

--- a/lm_eval/tasks/gsm8k/gsm8k.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k.yaml
@@ -3,7 +3,7 @@ group:
 task: gsm8k_yaml
 dataset_path: gsm8k
 dataset_name: main
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 fewshot_split: train
 test_split: test

--- a/lm_eval/tasks/logiqa2/logieval.yaml
+++ b/lm_eval/tasks/logiqa2/logieval.yaml
 task: logieval
 dataset_path: baber/logiqa2
 dataset_name: logieval
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 # Instructions + {content}

--- a/lm_eval/tasks/mgsm/direct/direct_yaml
+++ b/lm_eval/tasks/mgsm/direct/direct_yaml
@@ -4,7 +4,7 @@
 group: mgsm_direct
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/mgsm/en_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/en_cot/cot_yaml
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/mgsm/native_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/native_cot/cot_yaml
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""

--- a/lm_eval/tasks/minerva_math/README.md
+++ b/lm_eval/tasks/minerva_math/README.md
@@ -37,7 +37,7 @@ Eprint = {arXiv:2206.14858},
 #### Groups

 - `math_word_problems`
-- `greedy_until`
+- `generate_until`

 #### Tasks


--- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
+++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
@@ -4,7 +4,7 @@ task: minerva_math_algebra
 dataset_path: EleutherAI/hendrycks_math
 process_docs: !function utils.process_docs
 dataset_name: algebra
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 doc_to_text:  !function utils.doc_to_text

--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
@@ -2,7 +2,7 @@ group: mmlu_flan_cot_fewshot
 dataset_path: cais/mmlu
 validation_split: validation
 fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 filter_list: