Commit 2106fbeb authored by Baber
Browse files

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
# Task mmlu_llama_management (tag: mmlu_llama_other_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: management
include: _continuation_template_yaml
tag: mmlu_llama_other_tasks
task: mmlu_llama_management
task_alias: management
# Task mmlu_llama_marketing (tag: mmlu_llama_other_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: marketing
include: _continuation_template_yaml
tag: mmlu_llama_other_tasks
task: mmlu_llama_marketing
task_alias: marketing
# Task mmlu_llama_medical_genetics (tag: mmlu_llama_other_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: medical_genetics
include: _continuation_template_yaml
tag: mmlu_llama_other_tasks
task: mmlu_llama_medical_genetics
task_alias: medical genetics
# Task mmlu_llama_miscellaneous (tag: mmlu_llama_other_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: miscellaneous
include: _continuation_template_yaml
tag: mmlu_llama_other_tasks
task: mmlu_llama_miscellaneous
task_alias: miscellaneous
# Task mmlu_llama_moral_disputes (tag: mmlu_llama_humanities_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: moral_disputes
include: _continuation_template_yaml
tag: mmlu_llama_humanities_tasks
task: mmlu_llama_moral_disputes
task_alias: moral disputes
# Task mmlu_llama_moral_scenarios (tag: mmlu_llama_humanities_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: moral_scenarios
include: _continuation_template_yaml
tag: mmlu_llama_humanities_tasks
task: mmlu_llama_moral_scenarios
task_alias: moral scenarios
# Task mmlu_llama_nutrition (tag: mmlu_llama_other_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: nutrition
include: _continuation_template_yaml
tag: mmlu_llama_other_tasks
task: mmlu_llama_nutrition
task_alias: nutrition
# Task mmlu_llama_philosophy (tag: mmlu_llama_humanities_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: philosophy
include: _continuation_template_yaml
tag: mmlu_llama_humanities_tasks
task: mmlu_llama_philosophy
task_alias: philosophy
# Task mmlu_llama_prehistory (tag: mmlu_llama_humanities_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: prehistory
include: _continuation_template_yaml
tag: mmlu_llama_humanities_tasks
task: mmlu_llama_prehistory
task_alias: prehistory
# Task mmlu_llama_professional_accounting (tag: mmlu_llama_other_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: professional_accounting
include: _continuation_template_yaml
tag: mmlu_llama_other_tasks
task: mmlu_llama_professional_accounting
task_alias: professional accounting
# Task mmlu_llama_professional_law (tag: mmlu_llama_humanities_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: professional_law
include: _continuation_template_yaml
tag: mmlu_llama_humanities_tasks
task: mmlu_llama_professional_law
task_alias: professional law
# Task mmlu_llama_professional_medicine (tag: mmlu_llama_other_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: professional_medicine
include: _continuation_template_yaml
tag: mmlu_llama_other_tasks
task: mmlu_llama_professional_medicine
task_alias: professional medicine
# Task mmlu_llama_professional_psychology (tag: mmlu_llama_social_sciences_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: professional_psychology
include: _continuation_template_yaml
tag: mmlu_llama_social_sciences_tasks
task: mmlu_llama_professional_psychology
task_alias: professional psychology
# Task mmlu_llama_public_relations (tag: mmlu_llama_social_sciences_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: public_relations
include: _continuation_template_yaml
tag: mmlu_llama_social_sciences_tasks
task: mmlu_llama_public_relations
task_alias: public relations
# Task mmlu_llama_security_studies (tag: mmlu_llama_social_sciences_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: security_studies
include: _continuation_template_yaml
tag: mmlu_llama_social_sciences_tasks
task: mmlu_llama_security_studies
task_alias: security studies
# Task mmlu_llama_sociology (tag: mmlu_llama_social_sciences_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: sociology
include: _continuation_template_yaml
tag: mmlu_llama_social_sciences_tasks
task: mmlu_llama_sociology
task_alias: sociology
# Task mmlu_llama_us_foreign_policy (tag: mmlu_llama_social_sciences_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: us_foreign_policy
include: _continuation_template_yaml
tag: mmlu_llama_social_sciences_tasks
task: mmlu_llama_us_foreign_policy
task_alias: us foreign policy
# Task mmlu_llama_virology (tag: mmlu_llama_other_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: virology
include: _continuation_template_yaml
tag: mmlu_llama_other_tasks
task: mmlu_llama_virology
task_alias: virology
# Task mmlu_llama_world_religions (tag: mmlu_llama_humanities_tasks).
# Remaining settings presumably come from the included template -- verify.
dataset_name: world_religions
include: _continuation_template_yaml
tag: mmlu_llama_humanities_tasks
task: mmlu_llama_world_religions
task_alias: world religions
# Generation-style task template for the TIGER-Lab/MMLU-Pro dataset.
# The model is prompted to reason and finish with
# "The best answer is <letter>."; the letter is pulled out by the filter
# below and scored with exact_match against the `answer` field.
dataset_path: TIGER-Lab/MMLU-Pro
output_type: generate_until
test_split: test
fewshot_split: validation

# Few-shot settings. `doc_to_target` here applies to the few-shot examples
# only; it must stay nested so it does not collide with the top-level
# `doc_to_target` further down.
fewshot_config:
  sampler: first_n
  doc_to_target: !function utils.fewshot_to_text

# Jinja prompt: question, then one lettered line per entry in `options`.
doc_to_text: "{% set letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' %}Given the following question and candidate answers, choose the best answer.\nQuestion: {{question.strip()}}\n{% for choice in options %}{{letters[loop.index0]}}. {{choice}}\n{% endfor %}\nYour response should end with \"The best answer is [the_answer_letter].\" where the [the_answer_letter] is a letter from the provided choices.\n\nLet's think step by step."
doc_to_target: answer
num_fewshot: 5

# The normalisation options below belong to the exact_match entry.
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      # Regex `\$`: a literal dollar sign.
      - "\\$"
      # Regex `\.$`: a period at the end of the string.
      - "\\.$"

generation_kwargs:
  # NOTE(review): "." as a stop sequence looks aggressive for a
  # step-by-step prompt -- confirm this is intended.
  until:
    - "."
  max_gen_toks: 1024

# Filter steps are listed in order: regex extraction, then take_first.
filter_list:
  - name: strict_match
    filter:
      - function: "regex"
        regex_pattern: "[tT]he best answer is ([A-Z])"
        # Presumably selects the last regex match -- confirm against the
        # regex filter implementation.
        group_select: -1
      - function: take_first

metadata:
  version: 1.0
dataset_kwargs:
  trust_remote_code: true
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment