Merge branch 'main' into comma

3e8135ce · Baber · 8e560c96 · 0c134ee9 · 3e8135ce · 3e8135ce
Commit 3e8135ce authored Sep 16, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/lm_syneval/lm_syneval__npi__npi_across_anim__future.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__npi__npi_across_anim__future.yaml
+dataset_name: lm_syneval__npi__npi_across_anim__future
+include: _template_yaml
+task: lm_syneval__npi__npi_across_anim__future
--- a/lm_eval/tasks/lm_syneval/lm_syneval__npi__npi_across_anim__past.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__npi__npi_across_anim__past.yaml
+dataset_name: lm_syneval__npi__npi_across_anim__past
+include: _template_yaml
+task: lm_syneval__npi__npi_across_anim__past
--- a/lm_eval/tasks/lm_syneval/lm_syneval__npi__npi_across_inanim__future.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__npi__npi_across_inanim__future.yaml
+dataset_name: lm_syneval__npi__npi_across_inanim__future
+include: _template_yaml
+task: lm_syneval__npi__npi_across_inanim__future
--- a/lm_eval/tasks/lm_syneval/lm_syneval__npi__npi_across_inanim__past.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__npi__npi_across_inanim__past.yaml
+dataset_name: lm_syneval__npi__npi_across_inanim__past
+include: _template_yaml
+task: lm_syneval__npi__npi_across_inanim__past
--- a/lm_eval/tasks/lm_syneval/lm_syneval__npi__simple_npi_anim__future.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__npi__simple_npi_anim__future.yaml
+dataset_name: lm_syneval__npi__simple_npi_anim__future
+include: _template_yaml
+task: lm_syneval__npi__simple_npi_anim__future
--- a/lm_eval/tasks/lm_syneval/lm_syneval__npi__simple_npi_anim__past.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__npi__simple_npi_anim__past.yaml
+dataset_name: lm_syneval__npi__simple_npi_anim__past
+include: _template_yaml
+task: lm_syneval__npi__simple_npi_anim__past
--- a/lm_eval/tasks/lm_syneval/lm_syneval__npi__simple_npi_inanim__future.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__npi__simple_npi_inanim__future.yaml
+dataset_name: lm_syneval__npi__simple_npi_inanim__future
+include: _template_yaml
+task: lm_syneval__npi__simple_npi_inanim__future
--- a/lm_eval/tasks/lm_syneval/lm_syneval__npi__simple_npi_inanim__past.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__npi__simple_npi_inanim__past.yaml
+dataset_name: lm_syneval__npi__simple_npi_inanim__past
+include: _template_yaml
+task: lm_syneval__npi__simple_npi_inanim__past
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS.yaml
+dataset_name: lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS
+include: _template_yaml
+task: lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS.yaml
+dataset_name: lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS
+include: _template_yaml
+task: lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS.yaml
+dataset_name: lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS
+include: _template_yaml
+task: lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS.yaml
+dataset_name: lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS
+include: _template_yaml
+task: lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV.yaml
+dataset_name: lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV
+include: _template_yaml
+task: lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV.yaml
+dataset_name: lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV
+include: _template_yaml
+task: lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV.yaml
+dataset_name: lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV
+include: _template_yaml
+task: lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV.yaml
+dataset_name: lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV
+include: _template_yaml
+task: lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR.yaml
+dataset_name: lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR
+include: _template_yaml
+task: lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR
--- a/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR.yaml
+dataset_name: lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
+include: _template_yaml
+task: lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
--- a/lm_eval/tasks/lm_syneval/lm_syneval_group.yaml
+++ b/lm_eval/tasks/lm_syneval/lm_syneval_group.yaml
+group: lm_syneval
+task:
+  - group: lm_syneval__reflexives
+    task:
+      - group: lm_syneval__reflexives__simple_reflexives
+        task:
+          - lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
+          - lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__reflexives__reflexive_sent_comp
+        task:
+          - lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS
+          - lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS
+          - lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS
+          - lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__reflexives__reflexives_across
+        task:
+          - lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV
+          - lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV
+          - lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV
+          - lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+    aggregate_metric_list:
+      - metric: acc
+        aggregation: mean
+        weight_by_size: false
+  - group: lm_syneval__agreement
+    task:
+      - group: lm_syneval__agreement__obj_rel_within_inanim
+        task:
+          - lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_sing_IS_IV
+          - lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_plur_IS_IV
+          - lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_sing_IS_IV
+          - lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_plur_IS_IV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__vp_coord
+        task:
+          - lm_syneval__agreement__vp_coord__sing_MS_MV_MV
+          - lm_syneval__agreement__vp_coord__plur_MS_MV_MV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__sent_comp
+        task:
+          - lm_syneval__agreement__sent_comp__sing_MS_MV_sing_BS
+          - lm_syneval__agreement__sent_comp__sing_MS_MV_plur_BS
+          - lm_syneval__agreement__sent_comp__plur_MS_MV_sing_BS
+          - lm_syneval__agreement__sent_comp__plur_MS_MV_plur_BS
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__obj_rel_no_comp_within_inanim
+        task:
+          - lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_sing_IS_IV
+          - lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_plur_IS_IV
+          - lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_sing_IS_IV
+          - lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_plur_IS_IV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__obj_rel_within_anim
+        task:
+          - lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_sing_MS_MV
+          - lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_plur_MS_MV
+          - lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_sing_MS_MV
+          - lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_plur_MS_MV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__subj_rel
+        task:
+          - lm_syneval__agreement__subj_rel__sing_MS_EV_MV_sing_ES
+          - lm_syneval__agreement__subj_rel__sing_MS_EV_MV_plur_ES
+          - lm_syneval__agreement__subj_rel__plur_MS_EV_MV_sing_ES
+          - lm_syneval__agreement__subj_rel__plur_MS_EV_MV_plur_ES
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__prep_inanim
+        task:
+          - lm_syneval__agreement__prep_inanim__sing_IS_IV_sing_ES
+          - lm_syneval__agreement__prep_inanim__sing_IS_IV_plur_ES
+          - lm_syneval__agreement__prep_inanim__plur_IS_IV_sing_ES
+          - lm_syneval__agreement__prep_inanim__plur_IS_IV_plur_ES
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__long_vp_coord
+        task:
+          - lm_syneval__agreement__long_vp_coord__sing_MS_LMV_LMV
+          - lm_syneval__agreement__long_vp_coord__plur_MS_LMV_LMV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__obj_rel_across_anim
+        task:
+          - lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_sing_ES_EV
+          - lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_plur_ES_EV
+          - lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_sing_ES_EV
+          - lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_plur_ES_EV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__obj_rel_across_inanim
+        task:
+          - lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_sing_ES_EV
+          - lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_plur_ES_EV
+          - lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_sing_ES_EV
+          - lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_plur_ES_EV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__obj_rel_no_comp_across_anim
+        task:
+          - lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_sing_ES_EV
+          - lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_plur_ES_EV
+          - lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_sing_ES_EV
+          - lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_plur_ES_EV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__obj_rel_no_comp_across_inanim
+        task:
+          - lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_sing_ES_EV
+          - lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_plur_ES_EV
+          - lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_sing_ES_EV
+          - lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_plur_ES_EV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__simple_agrmt
+        task:
+          - lm_syneval__agreement__simple_agrmt__sing_MS_MV
+          - lm_syneval__agreement__simple_agrmt__plur_MS_MV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__prep_anim
+        task:
+          - lm_syneval__agreement__prep_anim__sing_MS_MV_sing_ES
+          - lm_syneval__agreement__prep_anim__sing_MS_MV_plur_ES
+          - lm_syneval__agreement__prep_anim__plur_MS_MV_sing_ES
+          - lm_syneval__agreement__prep_anim__plur_MS_MV_plur_ES
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__agreement__obj_rel_no_comp_within_anim
+        task:
+          - lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_sing_MS_MV
+          - lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_plur_MS_MV
+          - lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_sing_MS_MV
+          - lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_plur_MS_MV
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+    aggregate_metric_list:
+      - metric: acc
+        aggregation: mean
+        weight_by_size: false
+  - group: lm_syneval__npi
+    task:
+      - group: lm_syneval__npi__npi_across_anim
+        task:
+          - lm_syneval__npi__npi_across_anim__past
+          - lm_syneval__npi__npi_across_anim__future
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__npi__npi_across_inanim
+        task:
+          - lm_syneval__npi__npi_across_inanim__past
+          - lm_syneval__npi__npi_across_inanim__future
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__npi__simple_npi_anim
+        task:
+          - lm_syneval__npi__simple_npi_anim__past
+          - lm_syneval__npi__simple_npi_anim__future
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+      - group: lm_syneval__npi__simple_npi_inanim
+        task:
+          - lm_syneval__npi__simple_npi_inanim__past
+          - lm_syneval__npi__simple_npi_inanim__future
+        aggregate_metric_list:
+          - metric: acc
+            aggregation: mean
+            weight_by_size: false
+    aggregate_metric_list:
+      - metric: acc
+        aggregation: mean
+        weight_by_size: false
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: false
--- a/lm_eval/tasks/minerva_math/README.md
+++ b/lm_eval/tasks/minerva_math/README.md
 # MATH
+
 ℹ️ This is the 4-shot variant!
+
 ## Paper
+
 Measuring Mathematical Problem Solving With the MATH Dataset
 https://arxiv.org/abs/2103.03874

-Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
+Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of
+computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging
+competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach
+models to generate answer derivations and explanations.

-NOTE: The few-shot and the generated answer extraction is based on the [Minerva](https://arxiv.org/abs/2206.14858) and exact match equivalence is calculated using the `sympy` library. This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.
+NOTE: The few-shot prompt and the generated-answer extraction are based on the
+[Minerva](https://arxiv.org/abs/2206.14858) paper, and exact match equivalence is calculated using the `sympy` library.
+This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.

 Homepage: https://github.com/hendrycks/math

-
 ## Citation
+
 ```
 @article{hendrycksmath2021,
  title={Measuring Mathematical Problem Solving With the MATH Dataset},
@@ -49,13 +57,18 @@ Eprint = {arXiv:2206.14858},
 The checklist is the following:

 For adding novel benchmarks/datasets to the library:
-* [x] Is the task an existing benchmark in the literature?
-  * [x] Have you referenced the original paper that introduced the task?
-  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
-    * The implementation in the original paper is one where the model is first fine-tuned on the data. They do have a few-shot evaluation for GPT-3, however the few-shot context used here is sourced from [Lewkowycz et al](https://arxiv.org/abs/2206.14858). The achieved accuracy on Llama-2 models is comparable to that provided in the paper, though not identical.

+* [x] Is the task an existing benchmark in the literature?
+    * [x] Have you referenced the original paper that introduced the task?
+    * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the
+      reference implementation and documented how to run such a test?
+        * The implementation in the original paper is one where the model is first fine-tuned on the data. They do have
+          a few-shot evaluation for GPT-3; however, the few-shot context used here is sourced
+          from [Lewkowycz et al](https://arxiv.org/abs/2206.14858). The achieved accuracy on Llama-2 models is
+          comparable to that provided in the paper, though not identical.

 If other tasks on this dataset are already supported:
+
 * [x] Is the "Main" variant of this task clearly denoted?
 * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
@@ -65,4 +78,7 @@ If other tasks on this dataset are already supported:
 - [ ] zero-shot variant

 ### Changelog
-version 2.0: (21-Feb-2025); added math_verify (extraction) metric. For details [see](https://huggingface.co/blog/math_verify_leaderboard)
+
+- version 2.0 (21-Feb-2025): added the `math_verify` (extraction) metric. For details, see the
+  [math_verify leaderboard blog post](https://huggingface.co/blog/math_verify_leaderboard).
+- version 3.0 (21-Aug-2025): pass the full solution and model generation to `math_verify`'s `parse`.