Commit e1ae8a2f authored by Herbie Bradley

Merge remote-tracking branch 'origin/big-refactor' into calibration

parents 50e99bd7 30936bc7
@@ -4,7 +4,7 @@
 group: mgsm_direct
 dataset_path: juletxara/mgsm
 dataset_name: null # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""
...
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""
...
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group: mgsm_cot_native
dataset_path: juletxara/mgsm
dataset_name: null # Overridden by language-specific config.
output_type: generate_until
training_split: train
test_split: test
target_delimiter: ""
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
  do_sample: false
  temperature: 0.0
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
filter_list:
  - name: "get-answer"
    filter:
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[16+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nধাপে ধাপে উত্তর:"}}{% else %}{{"প্রশ্ন: "+question+"\nধাপে ধাপে উত্তর:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_bn_direct
+task: mgsm_bn_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[28+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nSchritt-für-Schritt-Antwort:"}}{% else %}{{"Frage: "+question+"\nSchritt-für-Schritt-Antwort:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_de_direct
+task: mgsm_de_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_en_direct
+task: mgsm_en_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[22+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{% else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_es_direct
+task: mgsm_es_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[25+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{% else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
 include: cot_yaml
-task: mgsm_fr_direct
+task: mgsm_fr_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[10+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題: "+question+"\nステップごとの答え:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_ja_direct
+task: mgsm_ja_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else %}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_ru_direct
+task: mgsm_ru_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{% else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_sw_direct
+task: mgsm_sw_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else %}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_te_direct
+task: mgsm_te_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
 doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else %}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_th_direct
+task: mgsm_th_native_cot
@@ -5,4 +5,4 @@ doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_num
 doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}'
 include: cot_yaml
-task: mgsm_zh_direct
+task: mgsm_zh_native_cot
# MATH
ℹ️ This is the 4-shot variant!
## Paper
Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874
Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: The few-shot prompt and the extraction of generated answers are based on [Minerva](https://arxiv.org/abs/2206.14858), and exact-match equivalence is computed using the `sympy` library. This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.
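As a rough sketch of what `sympy`-based equivalence checking can look like (illustrative only, not the harness's actual `process_results` code; `is_equiv` is a hypothetical helper):

```python
import sympy
from sympy.parsing.latex import parse_latex  # needs the lm-eval[math] extras


def is_equiv(candidate: str, target: str) -> bool:
    """Treat two LaTeX answer strings as equal if sympy can prove it."""
    if candidate == target:  # fast path: exact string match
        return True
    try:
        diff = parse_latex(candidate) - parse_latex(target)
        return sympy.simplify(diff) == 0
    except Exception:  # unparseable LaTeX: no symbolic verdict
        return False
```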
Homepage: https://github.com/hendrycks/math
## Citation
```
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
@misc{2206.14858,
Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dyer and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra},
Title = {Solving Quantitative Reasoning Problems with Language Models},
Year = {2022},
Eprint = {arXiv:2206.14858},
}
```
### Groups, Benchmarks and Tasks
#### Benchmarks
- `minerva_math`
#### Groups
- `math_word_problems`
- `generate_until`
#### Tasks
- `minerva_math_algebra`
- `minerva_math_counting_and_prob`
- `minerva_math_geometry`
- `minerva_math_intermediate_algebra`
- `minerva_math_num_theory`
- `minerva_math_prealgebra`
- `minerva_math_precalc`
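For orientation, running one of these tasks through the harness's Python API looks roughly like this (a sketch; the exact `simple_evaluate` signature may differ between harness versions, and the model choice is illustrative):

```python
import lm_eval

# Evaluate a Hugging Face model on the MATH algebra subset.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-2-7b-hf",
    tasks=["minerva_math_algebra"],
)
print(results["results"]["minerva_math_algebra"])
```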
### Checklist
The checklist is the following:
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
* In the original paper, the model is fine-tuned on the training data before evaluation. The paper does include a few-shot evaluation for GPT-3, but the few-shot context used here is sourced from [Lewkowycz et al.](https://arxiv.org/abs/2206.14858). The accuracy achieved with Llama-2 models is comparable to that reported in the paper, though not identical.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
### Variant Wishlist
- [ ] zero-shot variant
group:
  - math_word_problems
task: minerva_math_algebra
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils.process_docs
dataset_name: algebra
output_type: generate_until
training_split: train
test_split: test
doc_to_text: !function utils.doc_to_text
process_results: !function utils.process_results
doc_to_target: "{{answer}}"
generation_kwargs:
  until:
    - "Problem:"
  do_sample: false
  temperature: 0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
include: minerva_math_algebra.yaml
dataset_name: counting_and_probability
task: minerva_math_counting_and_prob

include: minerva_math_algebra.yaml
dataset_name: geometry
task: minerva_math_geometry

include: minerva_math_algebra.yaml
dataset_name: intermediate_algebra
task: minerva_math_intermediate_algebra

include: minerva_math_algebra.yaml
dataset_name: number_theory
task: minerva_math_num_theory
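Each of the stubs above inherits every key from `minerva_math_algebra.yaml` and overrides only `dataset_name` and `task`. A minimal sketch of that include-and-override pattern (a hypothetical loader, not the harness's config code; the real configs also use `!function` tags, which need a custom YAML loader):

```python
import yaml


def load_task_config(path: str) -> dict:
    """Load a task YAML, recursively merging in its `include` base."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    base = cfg.pop("include", None)
    if base is not None:
        merged = load_task_config(base)  # parent keys first...
        merged.update(cfg)               # ...child keys win on conflict
        return merged
    return cfg
```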