Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py

Merge branch 'main' into mathvista
# Conflicts:
#	lm_eval/models/openai_completions.py
2106fbeb · Baber · 4354fe46 · 703fbffd · 2106fbeb · 2106fbeb
Commit 2106fbeb authored Jan 15, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_management.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_management.yaml
+"dataset_name": "management"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_other_tasks"
+"task": "mmlu_llama_management"
+"task_alias": "management"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_marketing.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_marketing.yaml
+"dataset_name": "marketing"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_other_tasks"
+"task": "mmlu_llama_marketing"
+"task_alias": "marketing"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_medical_genetics.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_medical_genetics.yaml
+"dataset_name": "medical_genetics"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_other_tasks"
+"task": "mmlu_llama_medical_genetics"
+"task_alias": "medical genetics"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_miscellaneous.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_miscellaneous.yaml
+"dataset_name": "miscellaneous"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_other_tasks"
+"task": "mmlu_llama_miscellaneous"
+"task_alias": "miscellaneous"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_moral_disputes.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_moral_disputes.yaml
+"dataset_name": "moral_disputes"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_humanities_tasks"
+"task": "mmlu_llama_moral_disputes"
+"task_alias": "moral disputes"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_moral_scenarios.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_moral_scenarios.yaml
+"dataset_name": "moral_scenarios"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_humanities_tasks"
+"task": "mmlu_llama_moral_scenarios"
+"task_alias": "moral scenarios"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_nutrition.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_nutrition.yaml
+"dataset_name": "nutrition"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_other_tasks"
+"task": "mmlu_llama_nutrition"
+"task_alias": "nutrition"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_philosophy.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_philosophy.yaml
+"dataset_name": "philosophy"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_humanities_tasks"
+"task": "mmlu_llama_philosophy"
+"task_alias": "philosophy"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_prehistory.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_prehistory.yaml
+"dataset_name": "prehistory"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_humanities_tasks"
+"task": "mmlu_llama_prehistory"
+"task_alias": "prehistory"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_professional_accounting.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_professional_accounting.yaml
+"dataset_name": "professional_accounting"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_other_tasks"
+"task": "mmlu_llama_professional_accounting"
+"task_alias": "professional accounting"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_professional_law.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_professional_law.yaml
+"dataset_name": "professional_law"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_humanities_tasks"
+"task": "mmlu_llama_professional_law"
+"task_alias": "professional law"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_professional_medicine.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_professional_medicine.yaml
+"dataset_name": "professional_medicine"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_other_tasks"
+"task": "mmlu_llama_professional_medicine"
+"task_alias": "professional medicine"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_professional_psychology.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_professional_psychology.yaml
+"dataset_name": "professional_psychology"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_social_sciences_tasks"
+"task": "mmlu_llama_professional_psychology"
+"task_alias": "professional psychology"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_public_relations.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_public_relations.yaml
+"dataset_name": "public_relations"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_social_sciences_tasks"
+"task": "mmlu_llama_public_relations"
+"task_alias": "public relations"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_security_studies.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_security_studies.yaml
+"dataset_name": "security_studies"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_social_sciences_tasks"
+"task": "mmlu_llama_security_studies"
+"task_alias": "security studies"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_sociology.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_sociology.yaml
+"dataset_name": "sociology"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_social_sciences_tasks"
+"task": "mmlu_llama_sociology"
+"task_alias": "sociology"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_us_foreign_policy.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_us_foreign_policy.yaml
+"dataset_name": "us_foreign_policy"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_social_sciences_tasks"
+"task": "mmlu_llama_us_foreign_policy"
+"task_alias": "us foreign policy"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_virology.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_virology.yaml
+"dataset_name": "virology"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_other_tasks"
+"task": "mmlu_llama_virology"
+"task_alias": "virology"
--- a/lm_eval/tasks/llama3/instruct/mmlu/mmlu_world_religions.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu/mmlu_world_religions.yaml
+"dataset_name": "world_religions"
+"include": "_continuation_template_yaml"
+"tag": "mmlu_llama_humanities_tasks"
+"task": "mmlu_llama_world_religions"
+"task_alias": "world religions"
--- a/lm_eval/tasks/llama3/instruct/mmlu_pro/_default_template_yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_pro/_default_template_yaml
+dataset_path: TIGER-Lab/MMLU-Pro
+output_type: generate_until
+test_split: test
+fewshot_split: validation
+fewshot_config:
+    sampler: first_n
+    doc_to_target: !function utils.fewshot_to_text
+doc_to_text: "{% set letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' %}Given the following question and candidate answers, choose the best answer.\nQuestion: {{question.strip()}}\n{% for choice in options %}{{letters[loop.index0]}}. {{choice}}\n{% endfor %}\nYour response should end with \"The best answer is [the_answer_letter].\" where the [the_answer_letter] is a letter from the provided choices.\n\nLet's think step by step."
+doc_to_target: answer
+num_fewshot: 5
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+      - "\\$"
+      - "\\.$"
+generation_kwargs:
+  until:
+    - "."
+  max_gen_toks: 1024
+filter_list:
+  - name: strict_match
+    filter:
+        - function: "regex"
+          regex_pattern: "[tT]he best answer is ([A-Z])"
+          group_select: -1
+        - function: take_first
+metadata:
+  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true