Commit cd441ab1 authored by Yu Shi Jie
Browse files

mmlu-pro: update yaml content in line with mmlu

parent 5bae76d6
dataset_path: sjyuxyz/MMLU-Pro-with-subset
output_type: multiple_choice
test_split: test
fewshot_split: dev
fewshot_config:
......@@ -8,3 +9,5 @@ doc_to_choice: "{{options}}"
doc_to_target: "{{answer_index}}"
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
group: mmlu_pro_continuation
group_alias: mmlu-pro (continuation)
task:
- mmlu_pro_continuation_stem
- mmlu_pro_continuation_other
- mmlu_pro_continuation_social_sciences
- mmlu_pro_continuation_humanities
- group: stem
task:
- mmlu_pro_continuation_stem
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: other
task:
- mmlu_pro_continuation_other
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: social sciences
task:
- mmlu_pro_continuation_social_sciences
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: humanities
task:
- mmlu_pro_continuation_humanities
aggregate_metric_list:
- metric: acc
weight_by_size: True
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 1
"dataset_name": "biology"
"description": "The following are questions (with answers) about biology.\n\
\n"
"group": "mmlu_continuation_stem"
"tag": "mmlu_pro_continuation_stem"
"include": "_continuation_template_yaml"
"task": "mmlu_continuation_biology"
"task": "mmlu_pro_continuation_biology"
"dataset_name": "math"
"description": "The following are questions (with answers) about math.\n\
\n"
"group": "mmlu_continuation_stem"
"tag": "mmlu_pro_continuation_stem"
"include": "_continuation_template_yaml"
"task": "mmlu_continuation_chemistry"
\ No newline at end of file
"task": "mmlu_pro_continuation_chemistry"
\ No newline at end of file
"dataset_name": "psychology"
"description": "The following are questions (with answers) about psychology.\n\
\n"
"group": "mmlu_continuation_social_sciences"
"tag": "mmlu_pro_continuation_social_sciences"
"include": "_continuation_template_yaml"
"task": "mmlu_continuation_psychology"
"task": "mmlu_pro_continuation_psychology"
group: mmlu_pro_flan_cot_fewshot
group_alias: mmlu-pro (flan style, fewshot cot)
task:
- mmlu_pro_flan_cot_fewshot_stem
- mmlu_pro_flan_cot_fewshot_other
- mmlu_pro_flan_cot_fewshot_social_sciences
- mmlu_pro_flan_cot_fewshot_humanities
- group: stem
task:
- mmlu_pro_flan_cot_fewshot_stem
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: other
task:
- mmlu_pro_flan_cot_fewshot_other
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: social sciences
task:
- mmlu_pro_flan_cot_fewshot_social_sciences
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: humanities
task:
- mmlu_pro_flan_cot_fewshot_humanities
aggregate_metric_list:
- metric: acc
weight_by_size: True
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 1
......@@ -27,5 +27,5 @@ metric_list:
ignore_punctuation: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -28,6 +28,6 @@ fewshot_config:
- question: "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?
(A) Multiply 5 by 5 to find 25 teams. (B) Divide 30 by 5 to find 6 teams. (C) Add 5 to 30 to find 35 teams. (D) Subtract 30 from 5 to find -25 teams. (E) Divide 5 by 30 to find 0.1667 teams. (F) Add 5 to 30 then divide by 2 to find 17.5 teams. (G) N/A (H) N/A (I) N/A (J) N/A"
target: "Let's think step by step. We want to find the number of teams. We know that there are 5 players/team, and 30 players. Thus to get the number of teams we divide players by players/team, so 30 players / 5 players/team = 6 teams. The answer is (B)."
group: mmlu_pro_flan_cot_fewshot_stem
tag: mmlu_pro_flan_cot_fewshot_stem
include: _mmlu_pro_flan_cot_fewshot_template_yaml
task: mmlu_pro_flan_cot_fewshot_math
......@@ -18,6 +18,6 @@ fewshot_config:
- question: "What place is named in the title of the 1979 live album by rock legends Cheap Trick?
(A) Brooklyn (B) Beijing (C) Budapest (D) Boston (E) Bhutan (F) Barcelona (G) Britain (H) Brisbane (I) Bruges (J) Budokan"
target: "Let's think step by step. We refer to Wikipedia for help. Nippon Budokan is an indoor arena in Tokyo, Japan renowned for hosting rock music concerts including Cheap Trick in 1978. 'Cheap Trick at Budokan' became the name of their album. The answer is (J)."
group: mmlu_pro_flan_cot_fewshot_other
tag: mmlu_pro_flan_cot_fewshot_other
include: _mmlu_pro_flan_cot_fewshot_template_yaml
task: mmlu_pro_flan_cot_fewshot_miscellaneous
......@@ -24,6 +24,6 @@ fewshot_config:
- question: "What is the sign of the covenant for Jewish males?
(A) Fasting on Yom Kippur (B) Lighting Shabbat candles (C) The rainbow (D) Circumcision (E) The Torah (F) Bar mitzvah (G) Keeping kosher (H) Wearing a kippah (I) A son (J) The Star of David"
target: "Let's think step by step. We refer to Wikipedia articles on world religions for help. In Judaism, the most distinctive sign of the covenant is circumcision (brit milah). The answer is (D)."
group: mmlu_pro_flan_cot_fewshot_humanities
tag: mmlu_pro_flan_cot_fewshot_humanities
include: _mmlu_pro_flan_cot_fewshot_template_yaml
task: mmlu_pro_flan_cot_fewshot_philosophy
......@@ -18,6 +18,6 @@ fewshot_config:
- question: "A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven?
(A) 240 W (B) 120 W (C) 10 W (D) 480 W (E) 360 W (F) 200 W (G) 30 W (H) 150 W (I) 60 W (J) 300 W"
target: "Let's think step by step. Rate of energy usage is known as power; in an dissipative electrical circuit, power is given by voltage times current. So in our case, the power is 120 V times 2 amps, or 240 W. The answer is (A)."
group: mmlu_pro_flan_cot_fewshot_stem
tag: mmlu_pro_flan_cot_fewshot_stem
include: _mmlu_pro_flan_cot_fewshot_template_yaml
task: mmlu_pro_flan_cot_fewshot_physics
......@@ -18,6 +18,6 @@ fewshot_config:
- question: "In terms of Hofstede’s (1980) five cultural dimensions, the United States scores at the top of the scale on:
(A) individualism and long-term orientation. (B) individualism and power distance. (C) uncertainty avoidance. (D) long-term orientation. (E) individualism. (F) individualism and masculinity. (G) long-term orientation and uncertainty avoidance. (H) power distance. (I) power distance and masculinity. (J) N/A"
target: "Let's think step by step. We refer to Wikipedia articles on psychology for help. The US scores highest on individualism among the five cultural dimensions. The answer is (E)."
group: mmlu_pro_flan_cot_fewshot_social_sciences
tag: mmlu_pro_flan_cot_fewshot_social_sciences
include: _mmlu_pro_flan_cot_fewshot_template_yaml
task: mmlu_pro_flan_cot_fewshot_psychology
group: mmlu_pro_flan_cot_zeroshot
group_alias: mmlu-pro (flan style, zeroshot cot)
task:
- mmlu_pro_flan_cot_zeroshot_stem
- mmlu_pro_flan_cot_zeroshot_other
- mmlu_pro_flan_cot_zeroshot_social_sciences
- mmlu_pro_flan_cot_zeroshot_humanities
- group: stem
task:
- mmlu_pro_flan_cot_zeroshot_stem
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: other
task:
- mmlu_pro_flan_cot_zeroshot_other
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: social sciences
task:
- mmlu_pro_flan_cot_zeroshot_social_sciences
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: humanities
task:
- mmlu_pro_flan_cot_zeroshot_humanities
aggregate_metric_list:
- metric: acc
weight_by_size: True
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 1
......@@ -33,4 +33,6 @@ metric_list:
ignore_case: true
ignore_punctuation: true
metadata:
version: 2.0
version: 1.0
dataset_kwargs:
trust_remote_code: true
\ No newline at end of file
"dataset_name": "biology"
"description": "The following are multiple choice questions (with answers) about biology.\n\
\n"
"group": "mmlu_pro_flan_cot_zeroshot_stem"
"tag": "mmlu_pro_flan_cot_zeroshot_stem"
"include": "_mmlu_pro_flan_cot_zeroshot_template_yaml"
"task": "mmlu_pro_flan_cot_zeroshot_biology"
"dataset_name": "business"
"description": "The following are multiple choice questions (with answers) about business.\n\
\n"
"group": "mmlu_pro_flan_cot_zeroshot_other"
"tag": "mmlu_pro_flan_cot_zeroshot_other"
"include": "_mmlu_pro_flan_cot_zeroshot_template_yaml"
"task": "mmlu_pro_flan_cot_zeroshot_business"
"dataset_name": "math"
"description": "The following are multiple choice questions (with answers) about math.\n\
\n"
"group": "mmlu_pro_flan_cot_zeroshot_stem"
"tag": "mmlu_pro_flan_cot_zeroshot_stem"
"include": "_mmlu_pro_flan_cot_zeroshot_template_yaml"
"task": "mmlu_pro_flan_cot_zeroshot_math"
"dataset_name": "psychology"
"description": "The following are multiple choice questions (with answers) about psychology.\n\
\n"
"group": "mmlu_pro_flan_cot_zeroshot_social_sciences"
"tag": "mmlu_pro_flan_cot_zeroshot_social_sciences"
"include": "_mmlu_pro_flan_cot_zeroshot_template_yaml"
"task": "mmlu_pro_flan_cot_zeroshot_psychology"
......@@ -16,3 +16,5 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
group: mmlu_pro_generative
group_alias: mmlu-pro (generative)
task:
- mmlu_pro_stem_generative
- mmlu_pro_other_generative
- mmlu_pro_social_sciences_generative
- mmlu_pro_humanities_generative
- group: stem
task:
- mmlu_pro_stem_generative
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: other
task:
- mmlu_pro_other_generative
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: social sciences
task:
- mmlu_pro_social_sciences_generative
aggregate_metric_list:
- metric: acc
weight_by_size: True
- group: humanities
task:
- mmlu_pro_humanities_generative
aggregate_metric_list:
- metric: acc
weight_by_size: True
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 1
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment