Merge branch 'main' into mathvista

# Conflicts: # lm_eval/models/hf_vlms.py

Merge branch 'main' into mathvista
# Conflicts: # lm_eval/models/hf_vlms.py
25869601 · Baber · 56f40c53 · c1d8795d · 25869601 · 25869601
Commit 25869601 authored Oct 19, 2024 by Baber
20 changed files
--- a/lm_eval/tasks/mgsm/en_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/en_cot/cot_yaml
 # This file will be included in the generated language-specific task configs.
 # It doesn't have a yaml file extension as it is not meant to be imported directly
 # by the harness.
-group: mgsm_cot_native
+tag: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
 output_type: generate_until

--- a/lm_eval/tasks/mgsm/native_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/native_cot/cot_yaml
 # This file will be included in the generated language-specific task configs.
 # It doesn't have a yaml file extension as it is not meant to be imported directly
 # by the harness.
-group: mgsm_cot_native
+tag: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null  # Overridden by language-specific config.
 output_type: generate_until

--- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
+++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
-group:
+tag:
  - math_word_problems
 task: minerva_math_algebra
 dataset_path: EleutherAI/hendrycks_math

--- a/lm_eval/tasks/mmlu/README.md
+++ b/lm_eval/tasks/mmlu/README.md
+# MMLU
+
+### Paper
+
+Title: `Measuring Massive Multitask Language Understanding`
+
+Abstract: `https://arxiv.org/abs/2009.03300`
+
+`The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.`
+
+Homepage: `https://github.com/hendrycks/test`
+
+Note: The `Flan` variants are derived from [here](https://github.com/jasonwei20/flan-2), and as described in Appendix D.1 of [Scaling Instruction-Finetuned Language Models](https://arxiv.org/abs/2210.11416).
+
+### Citation
+
+```
+@article{hendryckstest2021,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+  year={2021}
+}
+
+@article{hendrycks2021ethics,
+  title={Aligning AI With Shared Human Values},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
+  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+  year={2021}
+}
+```
+
+### Groups, Tags, and Tasks
+
+#### Groups
+
+* `mmlu`: `Original multiple-choice MMLU benchmark`
+* `mmlu_continuation`: `MMLU but with continuation prompts`
+* `mmlu_generation`: `MMLU generation`
+
+`mmlu` is the original benchmark as implemented by Hendrycks et al., with the choices in context and the answer letters (e.g., `A`, `B`, `C`, `D`) in the continuation.
+`mmlu_continuation` is a cloze-style variant: the choices are not shown in context, and the continuation is the full text of the answer choice.
+`mmlu_generation` is a generation variant, similar to the original but the LLM is asked to generate the correct answer letter.
+
+
+#### Subgroups
+
+* `mmlu_stem`
+* `mmlu_humanities`
+* `mmlu_social_sciences`
+* `mmlu_other`
+
+Subgroup variants are prefixed with the subgroup name, e.g. `mmlu_stem_continuation`.
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
+ver 1: PR #497
+switch to original implementation
+
+ver 2: PR #2116
+add missing newline in description.
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
@@ -26,7 +26,9 @@ task:
      - metric: acc
        weight_by_size: True
 aggregate_metric_list:
-  - metric: acc
+  - aggregation: mean
+    metric: exact_match
    weight_by_size: True
+    filter_list: get-answer
 metadata:
  version: 2
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
 dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
 validation_split: validation
 test_split: test
-fewshot_split: dev
 fewshot_config:
  sampler: first_n
 output_type: generate_until
-doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
-doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+doc_to_text: "{% if choices is defined%}Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step.{% else %}Q: {{ question.strip() }}\nA:{% endif %}"
+doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer] if answer is defined else target}}"
 filter_list:
  - name: "get-answer"
    filter:
@@ -18,7 +17,7 @@ generation_kwargs:
    - "</s>"
  do_sample: false
  temperature: 0.0
-num_fewshot: 0
+num_fewshot: 4
 metric_list:
  - metric: exact_match
    aggregation: mean

--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -12,7 +12,7 @@ filter_list:
      - function: "take_first"
  - name: "flexible-extract"
    filter:
-      - function: !function utils.MultiChoiceRegexFilter
+      - function: "multi_choice_regex"
        group_select: 0
        regex_pattern: "(\\([A-Z]\\))"
        ignore_case: true

--- a/lm_eval/tasks/mmlu_pro/README.md
+++ b/lm_eval/tasks/mmlu_pro/README.md
@@ -57,3 +57,8 @@ If other tasks on this dataset are already supported:
 * [ ] Is the "Main" variant of this task clearly denoted?
 * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
+
+* (tasks, group) 2024-09-23 -- (version 1 --> version 2)
+  * Added one newline to task description(s) as per [reference implementation](https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/47b9891aacb8bd7cda29d5c5ba17b9434dd333bc/evaluate_from_local.py#L93)
--- a/lm_eval/tasks/mmlu_pro/_default_template_yaml
+++ b/lm_eval/tasks/mmlu_pro/_default_template_yaml
@@ -30,4 +30,4 @@ metric_list:
    ignore_case: true
    ignore_punctuation: true
 metadata:
-  version: 0.0
+  version: 1.0
--- a/lm_eval/tasks/mmlu_pro/_mmlu_pro.yaml
+++ b/lm_eval/tasks/mmlu_pro/_mmlu_pro.yaml
@@ -20,4 +20,4 @@ aggregate_metric_list:
    weight_by_size: true
    filter_list: custom-extract
 metadata:
-  version: 1.0
+  version: 2.0
--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml
-description: "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_biology"
 task_alias: "biology"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml
-description: "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_business"
 task_alias: "business"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml
-description: "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_chemistry"
 task_alias: "chemistry"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml
-description: "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_computer_science"
 task_alias: "computer_science"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml
-description: "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_economics"
 task_alias: "economics"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml
-description: "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_engineering"
 task_alias: "engineering"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml
-description: "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_health"
 task_alias: "health"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml
-description: "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_history"
 task_alias: "history"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml
-description: "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_law"
 task_alias: "law"

--- a/lm_eval/tasks/mmlu_pro/mmlu_pro_math.yaml
+++ b/lm_eval/tasks/mmlu_pro/mmlu_pro_math.yaml
-description: "The following are multiple choice questions (with answers) about math. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+description: "The following are multiple choice questions (with answers) about math. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
 include: "_default_template_yaml"
 task: "mmlu_pro_math"
 task_alias: "math"