Commit 3e8135ce authored by Baber

Merge branch 'main' into comma

parents 8e560c96 0c134ee9
dataset_name: lm_syneval__npi__npi_across_anim__future
include: _template_yaml
task: lm_syneval__npi__npi_across_anim__future
dataset_name: lm_syneval__npi__npi_across_anim__past
include: _template_yaml
task: lm_syneval__npi__npi_across_anim__past
dataset_name: lm_syneval__npi__npi_across_inanim__future
include: _template_yaml
task: lm_syneval__npi__npi_across_inanim__future
dataset_name: lm_syneval__npi__npi_across_inanim__past
include: _template_yaml
task: lm_syneval__npi__npi_across_inanim__past
dataset_name: lm_syneval__npi__simple_npi_anim__future
include: _template_yaml
task: lm_syneval__npi__simple_npi_anim__future
dataset_name: lm_syneval__npi__simple_npi_anim__past
include: _template_yaml
task: lm_syneval__npi__simple_npi_anim__past
dataset_name: lm_syneval__npi__simple_npi_inanim__future
include: _template_yaml
task: lm_syneval__npi__simple_npi_inanim__future
dataset_name: lm_syneval__npi__simple_npi_inanim__past
include: _template_yaml
task: lm_syneval__npi__simple_npi_inanim__past
dataset_name: lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS
include: _template_yaml
task: lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS
dataset_name: lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS
include: _template_yaml
task: lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS
dataset_name: lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS
include: _template_yaml
task: lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS
dataset_name: lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS
include: _template_yaml
task: lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS
dataset_name: lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV
include: _template_yaml
task: lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV
dataset_name: lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV
include: _template_yaml
task: lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV
dataset_name: lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV
include: _template_yaml
task: lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV
dataset_name: lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV
include: _template_yaml
task: lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV
dataset_name: lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR
include: _template_yaml
task: lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR
dataset_name: lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
include: _template_yaml
task: lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
group: lm_syneval
task:
  - group: lm_syneval__reflexives
    task:
      - group: lm_syneval__reflexives__simple_reflexives
        task:
          - lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
          - lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__reflexives__reflexive_sent_comp
        task:
          - lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS
          - lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS
          - lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS
          - lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__reflexives__reflexives_across
        task:
          - lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV
          - lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV
          - lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV
          - lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
    aggregate_metric_list:
      - metric: acc
        aggregation: mean
        weight_by_size: false
  - group: lm_syneval__agreement
    task:
      - group: lm_syneval__agreement__obj_rel_within_inanim
        task:
          - lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_sing_IS_IV
          - lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_plur_IS_IV
          - lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_sing_IS_IV
          - lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_plur_IS_IV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__vp_coord
        task:
          - lm_syneval__agreement__vp_coord__sing_MS_MV_MV
          - lm_syneval__agreement__vp_coord__plur_MS_MV_MV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__sent_comp
        task:
          - lm_syneval__agreement__sent_comp__sing_MS_MV_sing_BS
          - lm_syneval__agreement__sent_comp__sing_MS_MV_plur_BS
          - lm_syneval__agreement__sent_comp__plur_MS_MV_sing_BS
          - lm_syneval__agreement__sent_comp__plur_MS_MV_plur_BS
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__obj_rel_no_comp_within_inanim
        task:
          - lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_sing_IS_IV
          - lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_plur_IS_IV
          - lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_sing_IS_IV
          - lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_plur_IS_IV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__obj_rel_within_anim
        task:
          - lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_sing_MS_MV
          - lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_plur_MS_MV
          - lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_sing_MS_MV
          - lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_plur_MS_MV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__subj_rel
        task:
          - lm_syneval__agreement__subj_rel__sing_MS_EV_MV_sing_ES
          - lm_syneval__agreement__subj_rel__sing_MS_EV_MV_plur_ES
          - lm_syneval__agreement__subj_rel__plur_MS_EV_MV_sing_ES
          - lm_syneval__agreement__subj_rel__plur_MS_EV_MV_plur_ES
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__prep_inanim
        task:
          - lm_syneval__agreement__prep_inanim__sing_IS_IV_sing_ES
          - lm_syneval__agreement__prep_inanim__sing_IS_IV_plur_ES
          - lm_syneval__agreement__prep_inanim__plur_IS_IV_sing_ES
          - lm_syneval__agreement__prep_inanim__plur_IS_IV_plur_ES
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__long_vp_coord
        task:
          - lm_syneval__agreement__long_vp_coord__sing_MS_LMV_LMV
          - lm_syneval__agreement__long_vp_coord__plur_MS_LMV_LMV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__obj_rel_across_anim
        task:
          - lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_sing_ES_EV
          - lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_plur_ES_EV
          - lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_sing_ES_EV
          - lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_plur_ES_EV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__obj_rel_across_inanim
        task:
          - lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_sing_ES_EV
          - lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_plur_ES_EV
          - lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_sing_ES_EV
          - lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_plur_ES_EV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__obj_rel_no_comp_across_anim
        task:
          - lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_sing_ES_EV
          - lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_plur_ES_EV
          - lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_sing_ES_EV
          - lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_plur_ES_EV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__obj_rel_no_comp_across_inanim
        task:
          - lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_sing_ES_EV
          - lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_plur_ES_EV
          - lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_sing_ES_EV
          - lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_plur_ES_EV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__simple_agrmt
        task:
          - lm_syneval__agreement__simple_agrmt__sing_MS_MV
          - lm_syneval__agreement__simple_agrmt__plur_MS_MV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__prep_anim
        task:
          - lm_syneval__agreement__prep_anim__sing_MS_MV_sing_ES
          - lm_syneval__agreement__prep_anim__sing_MS_MV_plur_ES
          - lm_syneval__agreement__prep_anim__plur_MS_MV_sing_ES
          - lm_syneval__agreement__prep_anim__plur_MS_MV_plur_ES
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__agreement__obj_rel_no_comp_within_anim
        task:
          - lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_sing_MS_MV
          - lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_plur_MS_MV
          - lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_sing_MS_MV
          - lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_plur_MS_MV
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
    aggregate_metric_list:
      - metric: acc
        aggregation: mean
        weight_by_size: false
  - group: lm_syneval__npi
    task:
      - group: lm_syneval__npi__npi_across_anim
        task:
          - lm_syneval__npi__npi_across_anim__past
          - lm_syneval__npi__npi_across_anim__future
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__npi__npi_across_inanim
        task:
          - lm_syneval__npi__npi_across_inanim__past
          - lm_syneval__npi__npi_across_inanim__future
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__npi__simple_npi_anim
        task:
          - lm_syneval__npi__simple_npi_anim__past
          - lm_syneval__npi__simple_npi_anim__future
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
      - group: lm_syneval__npi__simple_npi_inanim
        task:
          - lm_syneval__npi__simple_npi_inanim__past
          - lm_syneval__npi__simple_npi_inanim__future
        aggregate_metric_list:
          - metric: acc
            aggregation: mean
            weight_by_size: false
    aggregate_metric_list:
      - metric: acc
        aggregation: mean
        weight_by_size: false
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
# MATH

ℹ️ This is the 4-shot variant! ℹ️

## Paper

Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874

Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: The few-shot prompts and the generated-answer extraction are based on [Minerva](https://arxiv.org/abs/2206.14858), and exact-match equivalence is calculated using the `sympy` library. This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.
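For intuition, the sketch below shows roughly what a `sympy`-based equivalence check does; it is an illustration only, not the harness's actual implementation, and the helper name `is_equiv` is made up here.

```python
# Minimal sketch (assumption: NOT the harness's exact code) of checking whether
# two LaTeX answer strings denote the same mathematical expression with sympy.
from sympy import simplify
from sympy.parsing.latex import parse_latex  # needs the ANTLR runtime installed


def is_equiv(pred: str, gold: str) -> bool:
    try:
        # If the symbolic difference simplifies to zero, treat the answers as
        # equal, so e.g. "\\frac{1}{2}" matches "0.5".
        return simplify(parse_latex(pred) - parse_latex(gold)) == 0
    except Exception:
        # Fall back to a plain string comparison when parsing fails.
        return pred.strip() == gold.strip()
```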
Homepage: https://github.com/hendrycks/math

## Citation

```
@article{hendrycksmath2021,
  title={Measuring Mathematical Problem Solving With the MATH Dataset},
@@ -49,13 +57,18 @@ Eprint = {arXiv:2206.14858},
The checklist is the following:

For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
  * The implementation in the original paper is one where the model is first fine-tuned on the data. They do have a few-shot evaluation for GPT-3; however, the few-shot context used here is sourced from [Lewkowycz et al.](https://arxiv.org/abs/2206.14858). The achieved accuracy on Llama-2 models is comparable to that reported in the paper, though not identical.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
@@ -65,4 +78,7 @@ If other tasks on this dataset are already supported:
- [ ] zero-shot variant

### Changelog
- version 2.0 (21-Feb-2025): added math_verify (extraction) metric. For details [see](https://huggingface.co/blog/math_verify_leaderboard)
- version 3.0 (21-Aug-2025): pass the full solution and model generation to `math_verify`'s `parse`
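As a hedged illustration of the version 3.0 entry, the snippet below applies the `math_verify` package's documented `parse`/`verify` helpers to a full gold solution and a full model generation; the example strings are invented, and the harness's actual call sites may differ.

```python
# Illustrative sketch: feed the full texts to math_verify's parse, then compare.
from math_verify import parse, verify

gold_solution = r"So the area of the triangle is $\frac{1}{2}$."  # full reference solution (made-up text)
model_generation = "Computing base times height over two, the answer is 0.5."  # full model output (made-up text)

gold = parse(gold_solution)        # extracts and parses the answer from the gold text
answer = parse(model_generation)   # extracts and parses the answer from the generation
print(verify(gold, answer))        # True when the two are mathematically equivalent (1/2 == 0.5)
```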