Commit e1ae8a2f authored by Herbie Bradley

Merge remote-tracking branch 'origin/big-refactor' into calibration

parents 50e99bd7 30936bc7
@@ -4,7 +4,7 @@
 group: mgsm_direct
 dataset_path: juletxara/mgsm
 dataset_name: null # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""
...
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""
...
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group: mgsm_cot_native
dataset_path: juletxara/mgsm
dataset_name: null # Overridden by language-specific config.
output_type: generate_until
training_split: train
test_split: test
target_delimiter: ""
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
  do_sample: false
  temperature: 0.0
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
filter_list:
  - name: "get-answer"
    filter:
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[16+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nধাপে ধাপে উত্তর:"}}{% else %}{{"প্রশ্ন: "+question+"\nধাপে ধাপে উত্তর:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_bn_direct
+task: mgsm_bn_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[28+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nSchritt-für-Schritt-Antwort:"}}{% else %}{{"Frage: "+question+"\nSchritt-für-Schritt-Antwort:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_de_direct
+task: mgsm_de_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_en_direct
+task: mgsm_en_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[22+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{% else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_es_direct
+task: mgsm_es_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[25+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{% else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
 include: cot_yaml
-task: mgsm_fr_direct
+task: mgsm_fr_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[10+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題: "+question+"\nステップごとの答え:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_ja_direct
+task: mgsm_ja_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else %}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_ru_direct
+task: mgsm_ru_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{% else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_sw_direct
+task: mgsm_sw_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else %}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_te_direct
+task: mgsm_te_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else %}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_th_direct
+task: mgsm_th_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_num
 doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_zh_direct
+task: mgsm_zh_native_cot
# MATH
ℹ️ This is the 4-shot variant!
## Paper
Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874
Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: The few-shot prompt and the extraction of generated answers are based on [Minerva](https://arxiv.org/abs/2206.14858), and exact-match equivalence is computed using the `sympy` library. This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.
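As a rough sketch of what `sympy`-based equivalence checking can look like (illustrative only, not the harness's actual `process_results` code; `is_equiv` is a hypothetical helper):

```python
import sympy
from sympy.parsing.latex import parse_latex  # needs the lm-eval[math] extras


def is_equiv(candidate: str, target: str) -> bool:
    """Treat two LaTeX answer strings as equal if sympy can prove it."""
    if candidate == target:  # fast path: exact string match
        return True
    try:
        diff = parse_latex(candidate) - parse_latex(target)
        return sympy.simplify(diff) == 0
    except Exception:  # unparseable LaTeX: no symbolic verdict
        return False
```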
Homepage: https://github.com/hendrycks/math
## Citation
```
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
@misc{2206.14858,
Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dyer and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra},
Title = {Solving Quantitative Reasoning Problems with Language Models},
Year = {2022},
Eprint = {arXiv:2206.14858},
}
```
### Groups, Benchmarks and Tasks
#### Benchmarks
- `minerva_math`
#### Groups
- `math_word_problems`
- `generate_until`
#### Tasks
- `minerva_math_algebra`
- `minerva_math_counting_and_prob`
- `minerva_math_geometry`
- `minerva_math_intermediate_algebra`
- `minerva_math_num_theory`
- `minerva_math_prealgebra`
- `minerva_math_precalc`
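For orientation, running one of these tasks through the harness's Python API looks roughly like this (a sketch; the exact `simple_evaluate` signature may differ between harness versions, and the model choice is illustrative):

```python
import lm_eval

# Evaluate a Hugging Face model on the MATH algebra subset.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-2-7b-hf",
    tasks=["minerva_math_algebra"],
)
print(results["results"]["minerva_math_algebra"])
```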
### Checklist
The checklist is the following:
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
* In the original paper, the model is fine-tuned on the training data before evaluation. The paper does include a few-shot evaluation for GPT-3, but the few-shot context used here is sourced from [Lewkowycz et al.](https://arxiv.org/abs/2206.14858). The accuracy achieved with Llama-2 models is comparable to that reported in the paper, though not identical.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
### Variant Wishlist
- [ ] zero-shot variant
group:
  - math_word_problems
task: minerva_math_algebra
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils.process_docs
dataset_name: algebra
output_type: generate_until
training_split: train
test_split: test
doc_to_text: !function utils.doc_to_text
process_results: !function utils.process_results
doc_to_target: "{{answer}}"
generation_kwargs:
  until:
    - "Problem:"
  do_sample: false
  temperature: 0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
include: minerva_math_algebra.yaml
dataset_name: counting_and_probability
task: minerva_math_counting_and_prob

include: minerva_math_algebra.yaml
dataset_name: geometry
task: minerva_math_geometry

include: minerva_math_algebra.yaml
dataset_name: intermediate_algebra
task: minerva_math_intermediate_algebra

include: minerva_math_algebra.yaml
dataset_name: number_theory
task: minerva_math_num_theory
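Each of the stubs above inherits every key from `minerva_math_algebra.yaml` and overrides only `dataset_name` and `task`. A minimal sketch of that include-and-override pattern (a hypothetical loader, not the harness's config code; the real configs also use `!function` tags, which need a custom YAML loader):

```python
import yaml


def load_task_config(path: str) -> dict:
    """Load a task YAML, recursively merging in its `include` base."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    base = cfg.pop("include", None)
    if base is not None:
        merged = load_task_config(base)  # parent keys first...
        merged.update(cfg)               # ...child keys win on conflict
        return merged
    return cfg
```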