mmlu-pro: update yaml content in line with mmlu

cd441ab1 · Yu Shi Jie · 5bae76d6 · cd441ab1 · cd441ab1 · cd441ab1
Commit cd441ab1 authored Jul 26, 2024 by Yu Shi Jie
20 changed files
--- a/lm_eval/tasks/mmlu_pro/continuation/_continuation_template_yaml
+++ b/lm_eval/tasks/mmlu_pro/continuation/_continuation_template_yaml
 dataset_path: sjyuxyz/MMLU-Pro-with-subset
+output_type: multiple_choice
 test_split: test
 fewshot_split: dev
 fewshot_config:
@@ -8,3 +9,5 @@ doc_to_choice: "{{options}}"
 doc_to_target: "{{answer_index}}"
 metadata:
  version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/mmlu_pro/continuation/_mmlu_pro.yaml
+++ b/lm_eval/tasks/mmlu_pro/continuation/_mmlu_pro.yaml
 group: mmlu_pro_continuation
+group_alias: mmlu-pro (continuation)
 task:
-  - mmlu_pro_continuation_stem
-  - mmlu_pro_continuation_other
-  - mmlu_pro_continuation_social_sciences
-  - mmlu_pro_continuation_humanities
+  - group: stem
+    task:
+      - mmlu_pro_continuation_stem
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: other
+    task:
+      - mmlu_pro_continuation_other
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: social sciences
+    task:
+      - mmlu_pro_continuation_social_sciences
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: humanities
+    task:
+      - mmlu_pro_continuation_humanities
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 1
--- a/lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_biology.yaml
+++ b/lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_biology.yaml
 "dataset_name": "biology"
 "description": "The following are questions (with answers) about biology.\n\
  \n"
-"group": "mmlu_continuation_stem"
+"tag": "mmlu_pro_continuation_stem"
 "include": "_continuation_template_yaml"
-"task": "mmlu_continuation_biology"
+"task": "mmlu_pro_continuation_biology"
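--- a/lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_chemistry.yaml
+++ b/lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_chemistry.yaml
-"dataset_name": "math"
-"description": "The following are questions (with answers) about math.\n\
-  \n"
+"dataset_name": "chemistry"
+"description": "The following are questions (with answers) about chemistry.\n\
+  \n"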
-"group": "mmlu_continuation_stem"
+"tag": "mmlu_pro_continuation_stem"
 "include": "_continuation_template_yaml"
-"task": "mmlu_continuation_chemistry"
\ No newline at end of file
+"task": "mmlu_pro_continuation_chemistry"
\ No newline at end of file
--- a/lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_psychology.yaml
+++ b/lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_psychology.yaml
 "dataset_name": "psychology"
 "description": "The following are questions (with answers) about psychology.\n\
  \n"
-"group": "mmlu_continuation_social_sciences"
+"tag": "mmlu_pro_continuation_social_sciences"
 "include": "_continuation_template_yaml"
-"task": "mmlu_continuation_psychology"
+"task": "mmlu_pro_continuation_psychology"
--- a/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro.yaml
 group: mmlu_pro_flan_cot_fewshot
+group_alias: mmlu-pro (flan style, fewshot cot)
 task:
-  - mmlu_pro_flan_cot_fewshot_stem
-  - mmlu_pro_flan_cot_fewshot_other
-  - mmlu_pro_flan_cot_fewshot_social_sciences
-  - mmlu_pro_flan_cot_fewshot_humanities
+  - group: stem
+    task:
+      - mmlu_pro_flan_cot_fewshot_stem
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: other
+    task:
+      - mmlu_pro_flan_cot_fewshot_other
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: social sciences
+    task:
+      - mmlu_pro_flan_cot_fewshot_social_sciences
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: humanities
+    task:
+      - mmlu_pro_flan_cot_fewshot_humanities
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 1
--- a/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro_flan_cot_fewshot_template_yaml
@@ -27,5 +27,5 @@ metric_list:
    ignore_punctuation: true
 metadata:
  version: 1.0
-
-
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_math.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_math.yaml
@@ -28,6 +28,6 @@ fewshot_config:
  - question: "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?
      (A) Multiply 5 by 5 to find 25 teams. (B) Divide 30 by 5 to find 6 teams. (C) Add 5 to 30 to find 35 teams. (D) Subtract 30 from 5 to find -25 teams. (E) Divide 5 by 30 to find 0.1667 teams. (F) Add 5 to 30 then divide by 2 to find 17.5 teams. (G) N/A (H) N/A (I) N/A (J) N/A"
    target: "Let's think step by step. We want to find the number of teams. We know that there are 5 players/team, and 30 players. Thus to get the number of teams we divide players by players/team, so 30 players / 5 players/team = 6 teams. The answer is (B)."
-group: mmlu_pro_flan_cot_fewshot_stem
+tag: mmlu_pro_flan_cot_fewshot_stem
 include: _mmlu_pro_flan_cot_fewshot_template_yaml
 task: mmlu_pro_flan_cot_fewshot_math
--- a/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_miscellaneous.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_miscellaneous.yaml
@@ -18,6 +18,6 @@ fewshot_config:
  - question: "What place is named in the title of the 1979 live album by rock legends Cheap Trick?
      (A) Brooklyn (B) Beijing (C) Budapest (D) Boston (E) Bhutan (F) Barcelona (G) Britain (H) Brisbane (I) Bruges (J) Budokan"
    target: "Let's think step by step. We refer to Wikipedia for help. Nippon Budokan is an indoor arena in Tokyo, Japan renowned for hosting rock music concerts including Cheap Trick in 1978. 'Cheap Trick at Budokan' became the name of their album. The answer is (J)."
-group: mmlu_pro_flan_cot_fewshot_other
+tag: mmlu_pro_flan_cot_fewshot_other
 include: _mmlu_pro_flan_cot_fewshot_template_yaml
 task: mmlu_pro_flan_cot_fewshot_miscellaneous
--- a/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_philosophy.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_philosophy.yaml
@@ -24,6 +24,6 @@ fewshot_config:
  - question: "What is the sign of the covenant for Jewish males?
      (A) Fasting on Yom Kippur (B) Lighting Shabbat candles (C) The rainbow (D) Circumcision (E) The Torah (F) Bar mitzvah (G) Keeping kosher (H) Wearing a kippah (I) A son (J) The Star of David"
    target: "Let's think step by step. We refer to Wikipedia articles on world religions for help. In Judaism, the most distinctive sign of the covenant is circumcision (brit milah). The answer is (D)."
-group: mmlu_pro_flan_cot_fewshot_humanities
+tag: mmlu_pro_flan_cot_fewshot_humanities
 include: _mmlu_pro_flan_cot_fewshot_template_yaml
 task: mmlu_pro_flan_cot_fewshot_philosophy
--- a/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_physics.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_physics.yaml
@@ -18,6 +18,6 @@ fewshot_config:
  - question: "A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven?
      (A) 240 W (B) 120 W (C) 10 W (D) 480 W (E) 360 W (F) 200 W (G) 30 W (H) 150 W (I) 60 W (J) 300 W"
    target: "Let's think step by step. Rate of energy usage is known as power; in an dissipative electrical circuit, power is given by voltage times current. So in our case, the power is 120 V times 2 amps, or 240 W. The answer is (A)."
-group: mmlu_pro_flan_cot_fewshot_stem
+tag: mmlu_pro_flan_cot_fewshot_stem
 include: _mmlu_pro_flan_cot_fewshot_template_yaml
 task: mmlu_pro_flan_cot_fewshot_physics
--- a/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_psychology.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_psychology.yaml
@@ -18,6 +18,6 @@ fewshot_config:
  - question: "In terms of Hofstede’s (1980) five cultural dimensions, the United States scores at the top of the scale on:
      (A) individualism and long-term orientation. (B) individualism and power distance. (C) uncertainty avoidance. (D) long-term orientation. (E) individualism. (F) individualism and masculinity. (G) long-term orientation and uncertainty avoidance. (H) power distance. (I) power distance and masculinity. (J) N/A"
    target: "Let's think step by step. We refer to Wikipedia articles on psychology for help. The US scores highest on individualism among the five cultural dimensions. The answer is (E)."
-group: mmlu_pro_flan_cot_fewshot_social_sciences
+tag: mmlu_pro_flan_cot_fewshot_social_sciences
 include: _mmlu_pro_flan_cot_fewshot_template_yaml
 task: mmlu_pro_flan_cot_fewshot_psychology
--- a/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro.yaml
 group: mmlu_pro_flan_cot_zeroshot
+group_alias: mmlu-pro (flan style, zeroshot cot)
 task:
-  - mmlu_pro_flan_cot_zeroshot_stem
-  - mmlu_pro_flan_cot_zeroshot_other
-  - mmlu_pro_flan_cot_zeroshot_social_sciences
-  - mmlu_pro_flan_cot_zeroshot_humanities
+  - group: stem
+    task:
+      - mmlu_pro_flan_cot_zeroshot_stem
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: other
+    task:
+      - mmlu_pro_flan_cot_zeroshot_other
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: social sciences
+    task:
+      - mmlu_pro_flan_cot_zeroshot_social_sciences
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: humanities
+    task:
+      - mmlu_pro_flan_cot_zeroshot_humanities
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 1
--- a/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro_flan_cot_zeroshot_template_yaml
@@ -33,4 +33,6 @@ metric_list:
    ignore_case: true
    ignore_punctuation: true
 metadata:
-  version: 2.0
+  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
\ No newline at end of file
--- a/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_biology.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_biology.yaml
 "dataset_name": "biology"
 "description": "The following are multiple choice questions (with answers) about biology.\n\
  \n"
-"group": "mmlu_pro_flan_cot_zeroshot_stem"
+"tag": "mmlu_pro_flan_cot_zeroshot_stem"
 "include": "_mmlu_pro_flan_cot_zeroshot_template_yaml"
 "task": "mmlu_pro_flan_cot_zeroshot_biology"
--- a/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_business.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_business.yaml
 "dataset_name": "business"
 "description": "The following are multiple choice questions (with answers) about business.\n\
  \n"
-"group": "mmlu_pro_flan_cot_zeroshot_other"
+"tag": "mmlu_pro_flan_cot_zeroshot_other"
 "include": "_mmlu_pro_flan_cot_zeroshot_template_yaml"
 "task": "mmlu_pro_flan_cot_zeroshot_business"
--- a/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_math.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_math.yaml
 "dataset_name": "math"
 "description": "The following are multiple choice questions (with answers) about math.\n\
  \n"
-"group": "mmlu_pro_flan_cot_zeroshot_stem"
+"tag": "mmlu_pro_flan_cot_zeroshot_stem"
 "include": "_mmlu_pro_flan_cot_zeroshot_template_yaml"
 "task": "mmlu_pro_flan_cot_zeroshot_math"
--- a/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_psychology.yaml
+++ b/lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_psychology.yaml
 "dataset_name": "psychology"
 "description": "The following are multiple choice questions (with answers) about psychology.\n\
  \n"
-"group": "mmlu_pro_flan_cot_zeroshot_social_sciences"
+"tag": "mmlu_pro_flan_cot_zeroshot_social_sciences"
 "include": "_mmlu_pro_flan_cot_zeroshot_template_yaml"
 "task": "mmlu_pro_flan_cot_zeroshot_psychology"
--- a/lm_eval/tasks/mmlu_pro/generative/_default_template_yaml
+++ b/lm_eval/tasks/mmlu_pro/generative/_default_template_yaml
@@ -16,3 +16,5 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/mmlu_pro/generative/_mmlu_pro.yaml
+++ b/lm_eval/tasks/mmlu_pro/generative/_mmlu_pro.yaml
 group: mmlu_pro_generative
+group_alias: mmlu-pro (generative)
 task:
-  - mmlu_pro_stem_generative
-  - mmlu_pro_other_generative
-  - mmlu_pro_social_sciences_generative
-  - mmlu_pro_humanities_generative
+  - group: stem
+    task:
+      - mmlu_pro_stem_generative
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: other
+    task:
+      - mmlu_pro_other_generative
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: social sciences
+    task:
+      - mmlu_pro_social_sciences_generative
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+  - group: humanities
+    task:
+      - mmlu_pro_humanities_generative
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+metadata:
+  version: 1