Merge branch 'main' into metrics

# Conflicts: # lm_eval/models/vllm_causallms.py # pyproject.toml

Merge branch 'main' into metrics
# Conflicts: # lm_eval/models/vllm_causallms.py # pyproject.toml
c4b0c0cb · Baber · 6b20ae8c · de496b80 · c4b0c0cb · c4b0c0cb
Commit c4b0c0cb authored Sep 24, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/longbench/trec.yaml
+++ b/lm_eval/tasks/longbench/trec.yaml
@@ -5,17 +5,17 @@ task: longbench_trec
 dataset_path: THUDM/LongBench
 test_split: test
 dataset_name: trec
-doc_to_text: 'Please determine the type of the question below. Here are some examples of questions.\n\n{{context}}\n{{input}}'
+doc_to_text: "Please determine the type of the question below. Here are some examples of questions.\n\n{{context}}\n{{input}}"
 doc_to_target: '{{answers}}'
 process_results: !function metrics.get_classification_score
 generation_kwargs:
  max_gen_toks: 64
  temperature: 1
-  do_sample: True
+  do_sample: False
  until: ["\n"]
 metric_list:
  - metric: "classification_score"
    aggregation: mean
    higher_is_better: True
 metadata:
-  version: 3.0
+  version: 4.0
--- a/lm_eval/tasks/longbench/trec_e.yaml
+++ b/lm_eval/tasks/longbench/trec_e.yaml
@@ -5,17 +5,17 @@ task: longbench_trec_e
 dataset_path: THUDM/LongBench
 test_split: test
 dataset_name: trec_e
-doc_to_text: 'Please determine the type of the question below. Here are some examples of questions.\n\n{{context}}\n{{input}}'
+doc_to_text: "Please determine the type of the question below. Here are some examples of questions.\n\n{{context}}\n{{input}}"
 doc_to_target: '{{answers}}'
 process_results: !function metrics.get_classification_score
 generation_kwargs:
  max_gen_toks: 64
  temperature: 1
-  do_sample: True
+  do_sample: False
  until: ["\n"]
 metric_list:
  - metric: "classification_score"
    aggregation: mean
    higher_is_better: True
 metadata:
-  version: 3.0
+  version: 4.0
--- a/lm_eval/tasks/longbench/triviaqa.yaml
+++ b/lm_eval/tasks/longbench/triviaqa.yaml
@@ -5,17 +5,17 @@ task: longbench_triviaqa
 dataset_path: THUDM/LongBench
 test_split: test
 dataset_name: triviaqa
-doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}'
+doc_to_text: "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}"
 doc_to_target: '{{answers}}'
 process_results: !function metrics.get_qa_f1_score
 generation_kwargs:
  max_gen_toks: 32
  temperature: 1
-  do_sample: True
+  do_sample: False
  until: ["\n"]
 metric_list:
  - metric: "qa_f1_score"
    aggregation: mean
    higher_is_better: True
 metadata:
-  version: 3.0
+  version: 4.0
--- a/lm_eval/tasks/longbench/triviaqa_e.yaml
+++ b/lm_eval/tasks/longbench/triviaqa_e.yaml
@@ -5,17 +5,17 @@ task: longbench_triviaqa_e
 dataset_path: THUDM/LongBench
 test_split: test
 dataset_name: triviaqa_e
-doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}'
+doc_to_text: "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}"
 doc_to_target: '{{answers}}'
 process_results: !function metrics.get_qa_f1_score
 generation_kwargs:
  max_gen_toks: 32
  temperature: 1
-  do_sample: True
+  do_sample: False
  until: ["\n"]
 metric_list:
  - metric: "qa_f1_score"
    aggregation: mean
    higher_is_better: True
 metadata:
-  version: 3.0
+  version: 4.0
--- a/lm_eval/tasks/longbench/vcsum.yaml
+++ b/lm_eval/tasks/longbench/vcsum.yaml
@@ -5,17 +5,17 @@ task: longbench_vcsum
 dataset_path: THUDM/LongBench
 test_split: test
 dataset_name: vcsum
-doc_to_text: '下面有一段会议记录，请你阅读后，写一段总结，总结会议的内容。\n会议记录：\n{{context}}\n\n会议总结：'
+doc_to_text: "下面有一段会议记录，请你阅读后，写一段总结，总结会议的内容。\n会议记录：\n{{context}}\n\n会议总结："
 doc_to_target: '{{answers}}'
 process_results: !function metrics.get_rouge_zh_score
 generation_kwargs:
  max_gen_toks: 512
  temperature: 1
-  do_sample: True
+  do_sample: False
  until: []
 metric_list:
  - metric: "rouge_zh_score"
    aggregation: mean
    higher_is_better: True
 metadata:
-  version: 3.0
+  version: 4.0
--- a/lm_eval/tasks/minerva_math/README.md
+++ b/lm_eval/tasks/minerva_math/README.md
 # MATH
+
 ℹ️ This is the 4-shot variant!
+
 ## Paper
+
 Measuring Mathematical Problem Solving With the MATH Dataset
 https://arxiv.org/abs/2103.03874

-Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
+Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of
+computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging
+competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach
+models to generate answer derivations and explanations.

-NOTE: The few-shot and the generated answer extraction is based on the [Minerva](https://arxiv.org/abs/2206.14858) and exact match equivalence is calculated using the `sympy` library. This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.
+NOTE: The few-shot context and the generated-answer extraction are based on the
+[Minerva](https://arxiv.org/abs/2206.14858) paper, and exact-match equivalence is calculated using the `sympy` library.
+This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.

 Homepage: https://github.com/hendrycks/math

-
 ## Citation
+
 ```
 @article{hendrycksmath2021,
  title={Measuring Mathematical Problem Solving With the MATH Dataset},
@@ -49,13 +57,18 @@ Eprint = {arXiv:2206.14858},
 The checklist is the following:

 For adding novel benchmarks/datasets to the library:
-* [x] Is the task an existing benchmark in the literature?
-  * [x] Have you referenced the original paper that introduced the task?
-  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
-    * The implementation in the original paper is one where the model is first fine-tuned on the data. They do have a few-shot evaluation for GPT-3, however the few-shot context used here is sourced from [Lewkowycz et al](https://arxiv.org/abs/2206.14858). The achieved accuracy on Llama-2 models is comparable to that provided in the paper, though not identical.

+* [x] Is the task an existing benchmark in the literature?
+    * [x] Have you referenced the original paper that introduced the task?
+    * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the
+      reference implementation and documented how to run such a test?
+        * The implementation in the original paper is one where the model is first fine-tuned on the data. They do have
+          a few-shot evaluation for GPT-3, however the few-shot context used here is sourced
+          from [Lewkowycz et al](https://arxiv.org/abs/2206.14858). The achieved accuracy on Llama-2 models is
+          comparable to that provided in the paper, though not identical.

 If other tasks on this dataset are already supported:
+
 * [x] Is the "Main" variant of this task clearly denoted?
 * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
@@ -65,4 +78,7 @@ If other tasks on this dataset are already supported:
 - [ ] zero-shot variant

 ### Changelog
-version 2.0: (21-Feb-2025); added math_verify (extraction) metric. For details [see](https://huggingface.co/blog/math_verify_leaderboard)
+
+- version 2.0 (21-Feb-2025): added `math_verify` (extraction) metric. For details, see
+  the [Hugging Face blog post](https://huggingface.co/blog/math_verify_leaderboard).
+- version 3.0 (21-Aug-2025): pass the full solution and model generation to `math_verify`'s `parse`.
--- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
+++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
@@ -24,7 +24,7 @@ metric_list:
    higher_is_better: true
 num_fewshot: 4
 metadata:
-  version: 2.0
+  version: 3.0
 fewshot_config:
  sampler: first_n
  samples: !function utils.list_fewshot_samples
--- a/lm_eval/tasks/minerva_math/utils.py
+++ b/lm_eval/tasks/minerva_math/utils.py
@@ -71,7 +71,7 @@ def list_fewshot_samples() -> list[dict]:
    ]


-def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
+def process_results(doc: dict, results: list[str]) -> dict[str, int]:
    candidates = results[0]

    unnormalized_answer = get_unnormalized_answer(candidates)
@@ -83,14 +83,17 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
        retval = 0

    # math_verify
-    res = verify(parse(doc["answer"]), parse(candidates))
-    mathval = 1 if res else 0
+    _mvres = verify(
+        gold=parse(doc["solution"]),
+        target=parse(candidates),
+    )
+    mathval = 1 if _mvres else 0

-    results = {
+    res = {
        "exact_match": retval,
        "math_verify": mathval,
    }
-    return results
+    return res


 def last_boxed_only_string(string: str) -> Optional[str]:

--- a/lm_eval/tasks/mmlu-redux-spanish/README.md
+++ b/lm_eval/tasks/mmlu-redux-spanish/README.md
+# MMLU-Redux (Spanish)
+
+### Paper
+
+Title: `Are We Done with MMLU?`
+
+Abstract: `https://arxiv.org/pdf/2406.04127`
+
+`The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more, in Spanish`
+
+Homepage: `https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux-2.0`
+
+### Citation
+
+```
+BibTeX
+@misc{edinburgh2024mmlu,
+      title={Are We Done with MMLU?},
+      author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and
+      Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and
+      MohammadRezaGhasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and
+      Jean Kaddour and Emile van Krieken and Pasquale Minervini},
+      year={2025},
+      eprint={2406.04127},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+
+### Groups, Tags, and Tasks
+
+#### Groups
+
+- `stem`
+- `other`
+- `social sciences`
+- `humanities`
+
+#### Tasks
+
+- `mmlu_stem_generative_spanish`
+- `mmlu_other_generative_spanish`
+- `mmlu_social_sciences_generative_spanish`
+- `mmlu_humanities_generative_spanish`
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+
+- [x] Is the task an existing benchmark in the literature?
+  - [x] Have you referenced the original paper that introduced the task?
+  - [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+If other tasks on this dataset are already supported:
+
+- [ ] Is the "Main" variant of this task clearly denoted?
+- [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+- [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+- ver 1 (PR #2705): first implementation
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/_default_template_spanish_yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/_default_template_spanish_yaml
+dataset_path: "amias-mx/mmlu-redux-2.0-spanish"
+test_split: test
+dataset_kwargs:
+  trust_remote_code: true
+output_type: generate_until
+doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nPor favor, responde con la letra correcta (A, B, C o D) sin absolutamente nada adicional, solo la letra correcta:"
+doc_to_target: "{{['A','B','C','D'][answer]}}"
+target_delimiter: ":"
+generation_kwargs:
+  until:
+    - "</s>"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+filter_list:
+  - name: default
+    filter:
+      - function: regex
+        regex_pattern: "([ABCD])"
+      - function: take_first
+metadata:
+  version: 3.0
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/_mmlu.yaml
+group: mmlu_redux_spanish_generative
+group_alias: mmlu_redux_spanish (generative)
+task:
+  - group: stem_spanish
+    task:
+      - mmlu_stem_generative_spanish
+    aggregate_metric_list:
+      - metric: exact_match
+        weight_by_size: true
+  - group: other_spanish
+    task:
+      - mmlu_other_generative_spanish
+    aggregate_metric_list:
+      - metric: exact_match
+        weight_by_size: true
+  - group: social sciences_spanish
+    task:
+      - mmlu_social_sciences_generative_spanish
+    aggregate_metric_list:
+      - metric: exact_match
+        weight_by_size: true
+#  - group: humanities_spanish
+#    task:
+#      - mmlu_humanities_generative_spanish
+#    aggregate_metric_list:
+#      - metric: exact_match
+#        weight_by_size: true
+aggregate_metric_list:
+  - aggregation: mean
+    metric: exact_match
+    weight_by_size: true
+metadata:
+  version: 3
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_abstract_algebra.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_abstract_algebra.yaml
+"dataset_name": "abstract_algebra"
+"description":
+  "The following are multiple choice questions (with answers) about abstract\
+  \ algebra.\n\n"
+"tag": "mmlu_stem_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_abstract_algebra_generative_spanish"
+"task_alias": "abstract_algebra_spanish"
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_anatomy.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_anatomy.yaml
+"dataset_name": "anatomy"
+"description":
+  "The following are multiple choice questions (with answers) about anatomy.\n\
+  \n"
+"tag": "mmlu_stem_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_anatomy_generative_spanish"
+"task_alias": "anatomy_spanish"
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_astronomy.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_astronomy.yaml
+"dataset_name": "astronomy"
+"description":
+  "The following are multiple choice questions (with answers) about astronomy.\n\
+  \n"
+"tag": "mmlu_stem_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_astronomy_generative_spanish"
+"task_alias": "astronomy_spanish"
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_business_ethics.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_business_ethics.yaml
+"dataset_name": "business_ethics"
+"description":
+  "The following are multiple choice questions (with answers) about business\
+  \ ethics.\n\n"
+"tag": "mmlu_other_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_business_ethics_generative_spanish"
+"task_alias": "business_ethics_spanish"
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_clinical_knowledge.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_clinical_knowledge.yaml
+"dataset_name": "clinical_knowledge"
+"description":
+  "The following are multiple choice questions (with answers) about clinical\
+  \ knowledge.\n\n"
+"tag": "mmlu_other_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_clinical_knowledge_generative_spanish"
+"task_alias": "clinical_knowledge_spanish"
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_college_biology.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_college_biology.yaml
+"dataset_name": "college_biology"
+"description":
+  "The following are multiple choice questions (with answers) about college\
+  \ biology.\n\n"
+"tag": "mmlu_stem_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_college_biology_generative_spanish"
+"task_alias": "college_biology_spanish"
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_college_chemistry.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_college_chemistry.yaml
+"dataset_name": "college_chemistry"
+"description":
+  "The following are multiple choice questions (with answers) about college\
+  \ chemistry.\n\n"
+"tag": "mmlu_stem_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_college_chemistry_generative_spanish"
+"task_alias": "college_chemistry_spanish"
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_college_computer_science.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_college_computer_science.yaml
+"dataset_name": "college_computer_science"
+"description":
+  "The following are multiple choice questions (with answers) about college\
+  \ computer science.\n\n"
+"tag": "mmlu_stem_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_college_computer_science_generative_spanish"
+"task_alias": "college_computer_science_spanish"
--- a/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_college_mathematics.yaml
+++ b/lm_eval/tasks/mmlu-redux-spanish/generative/mmlu_college_mathematics.yaml
+"dataset_name": "college_mathematics"
+"description":
+  "The following are multiple choice questions (with answers) about college\
+  \ mathematics.\n\n"
+"tag": "mmlu_stem_generative_spanish"
+"include": "_default_template_spanish_yaml"
+"task": "mmlu_college_mathematics_generative_spanish"
+"task_alias": "college_mathematics_spanish"