separate category for `global_mmlu` (#2652)

* separate category
* set version 0.0
* apply precommit

separate category for `global_mmlu` (#2652)
* separate category
* set version 0.0
* apply precommit
5c006ed4 · Minho Ryu · GitHub · 370e2f9e · 5c006ed4 · 5c006ed4
Unverified Commit 5c006ed4 authored Jan 25, 2025 by Minho Ryu Committed by GitHub Jan 24, 2025
20 changed files
--- a/lm_eval/tasks/global_mmlu/default/id/_id_template_yaml
+++ b/lm_eval/tasks/global_mmlu/default/id/_id_template_yaml
+# Shared settings pulled in via `include: _id_template_yaml` by the
+# global_mmlu_id_* per-category task configs in this directory.
+dataset_path: CohereForAI/Global-MMLU-Lite
+dataset_name: id
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: default
+output_type: multiple_choice
+# Prompt layout: the stripped question, the four options labelled A-D on
+# separate lines, then a trailing "Answer:".
+doc_to_text: "{{question.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_business.yaml
+++ b/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_business.yaml
+# Generated by _generate_configs.py
+include: _id_template_yaml
+process_docs: !function utils.process_business
+task: global_mmlu_id_business
--- a/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_humanities.yaml
+++ b/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_humanities.yaml
+# Generated by _generate_configs.py
+include: _id_template_yaml
+process_docs: !function utils.process_humanities
+task: global_mmlu_id_humanities
--- a/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_medical.yaml
+++ b/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_medical.yaml
+# Generated by _generate_configs.py
+include: _id_template_yaml
+process_docs: !function utils.process_medical
+task: global_mmlu_id_medical
--- a/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_other.yaml
+++ b/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_other.yaml
+# Generated by _generate_configs.py
+include: _id_template_yaml
+process_docs: !function utils.process_other
+task: global_mmlu_id_other
--- a/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_social_sciences.yaml
+++ b/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_social_sciences.yaml
+# Generated by _generate_configs.py
+include: _id_template_yaml
+process_docs: !function utils.process_social_sciences
+task: global_mmlu_id_social_sciences
--- a/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_stem.yaml
+++ b/lm_eval/tasks/global_mmlu/default/id/global_mmlu_id_stem.yaml
+# Generated by _generate_configs.py
+include: _id_template_yaml
+process_docs: !function utils.process_stem
+task: global_mmlu_id_stem
--- a/lm_eval/tasks/global_mmlu/default/id/utils.py
+++ b/lm_eval/tasks/global_mmlu/default/id/utils.py
+from functools import partial
+
+
+# Values compared against each document's "subject_category" field; one
+# process_<category> filter is generated per entry below.
+CATEGORIES = ["Business", "Humanities", "Medical", "Other", "STEM", "Social Sciences"]
+
+
+def process_docs(dataset, category):
+    """Filter ``dataset`` to the entries whose "subject_category" equals ``category``.
+
+    ``dataset`` must provide a ``filter(predicate)`` method; the result of
+    that call is returned unchanged.
+    """
+    return dataset.filter(lambda x: x["subject_category"] == category)
+
+
+# Map "process_<category>" -> process_docs pre-bound to that category. The
+# name is the category lower-cased with spaces replaced by underscores,
+# e.g. "Social Sciences" -> "process_social_sciences".
+process_functions = {
+    f"process_{category.lower().replace(' ', '_')}": partial(
+        process_docs, category=category
+    )
+    for category in CATEGORIES
+}
+
+# Publish each generated filter as a module-level name; the task YAMLs refer
+# to them as `utils.process_business`, `utils.process_stem`, and so on.
+globals().update(process_functions)
--- a/lm_eval/tasks/global_mmlu/default/it/_global_mmlu_it.yaml
+++ b/lm_eval/tasks/global_mmlu/default/it/_global_mmlu_it.yaml
+# Group config bundling the six global_mmlu_it_* per-category tasks and
+# reporting an aggregated `acc` across them.
+group: global_mmlu_it
+task:
+  - global_mmlu_it_business
+  - global_mmlu_it_humanities
+  - global_mmlu_it_medical
+  - global_mmlu_it_other
+  - global_mmlu_it_stem
+  - global_mmlu_it_social_sciences
+aggregate_metric_list:
+  - metric: acc
+    # NOTE(review): presumably weights each subtask by its document count
+    # rather than averaging subtasks equally -- confirm against the
+    # lm-eval group config documentation.
+    weight_by_size: True
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/global_mmlu/default/it/_it_template_yaml
+++ b/lm_eval/tasks/global_mmlu/default/it/_it_template_yaml
+# Shared settings pulled in via `include: _it_template_yaml` by the
+# global_mmlu_it_* per-category task configs in this directory.
+dataset_path: CohereForAI/Global-MMLU-Lite
+dataset_name: it
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: default
+output_type: multiple_choice
+# Prompt layout: the stripped question, the four options labelled A-D on
+# separate lines, then a trailing "Answer:".
+doc_to_text: "{{question.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_business.yaml
+++ b/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_business.yaml
+# Generated by _generate_configs.py
+include: _it_template_yaml
+process_docs: !function utils.process_business
+task: global_mmlu_it_business
--- a/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_humanities.yaml
+++ b/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_humanities.yaml
+# Generated by _generate_configs.py
+include: _it_template_yaml
+process_docs: !function utils.process_humanities
+task: global_mmlu_it_humanities
--- a/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_medical.yaml
+++ b/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_medical.yaml
+# Generated by _generate_configs.py
+include: _it_template_yaml
+process_docs: !function utils.process_medical
+task: global_mmlu_it_medical
--- a/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_other.yaml
+++ b/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_other.yaml
+# Generated by _generate_configs.py
+include: _it_template_yaml
+process_docs: !function utils.process_other
+task: global_mmlu_it_other
--- a/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_social_sciences.yaml
+++ b/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_social_sciences.yaml
+# Generated by _generate_configs.py
+include: _it_template_yaml
+process_docs: !function utils.process_social_sciences
+task: global_mmlu_it_social_sciences
--- a/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_stem.yaml
+++ b/lm_eval/tasks/global_mmlu/default/it/global_mmlu_it_stem.yaml
+# Generated by _generate_configs.py
+include: _it_template_yaml
+process_docs: !function utils.process_stem
+task: global_mmlu_it_stem
--- a/lm_eval/tasks/global_mmlu/default/it/utils.py
+++ b/lm_eval/tasks/global_mmlu/default/it/utils.py
+from functools import partial
+
+
+# Values compared against each document's "subject_category" field; one
+# process_<category> filter is generated per entry below.
+CATEGORIES = ["Business", "Humanities", "Medical", "Other", "STEM", "Social Sciences"]
+
+
+def process_docs(dataset, category):
+    """Filter ``dataset`` to the entries whose "subject_category" equals ``category``.
+
+    ``dataset`` must provide a ``filter(predicate)`` method; the result of
+    that call is returned unchanged.
+    """
+    return dataset.filter(lambda x: x["subject_category"] == category)
+
+
+# Map "process_<category>" -> process_docs pre-bound to that category. The
+# name is the category lower-cased with spaces replaced by underscores,
+# e.g. "Social Sciences" -> "process_social_sciences".
+process_functions = {
+    f"process_{category.lower().replace(' ', '_')}": partial(
+        process_docs, category=category
+    )
+    for category in CATEGORIES
+}
+
+# Publish each generated filter as a module-level name; the task YAMLs refer
+# to them as `utils.process_business`, `utils.process_stem`, and so on.
+globals().update(process_functions)
--- a/lm_eval/tasks/global_mmlu/default/ja/_global_mmlu_ja.yaml
+++ b/lm_eval/tasks/global_mmlu/default/ja/_global_mmlu_ja.yaml
+# Group config bundling the six global_mmlu_ja_* per-category tasks and
+# reporting an aggregated `acc` across them.
+group: global_mmlu_ja
+task:
+  - global_mmlu_ja_business
+  - global_mmlu_ja_humanities
+  - global_mmlu_ja_medical
+  - global_mmlu_ja_other
+  - global_mmlu_ja_stem
+  - global_mmlu_ja_social_sciences
+aggregate_metric_list:
+  - metric: acc
+    # NOTE(review): presumably weights each subtask by its document count
+    # rather than averaging subtasks equally -- confirm against the
+    # lm-eval group config documentation.
+    weight_by_size: True
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/global_mmlu/default/ja/_ja_template_yaml
+++ b/lm_eval/tasks/global_mmlu/default/ja/_ja_template_yaml
+# Shared settings pulled in via `include: _ja_template_yaml` by the
+# global_mmlu_ja_* task configs (e.g. global_mmlu_ja_business).
+dataset_path: CohereForAI/Global-MMLU-Lite
+dataset_name: ja
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: default
+output_type: multiple_choice
+# Prompt layout: the stripped question, the four options labelled A-D on
+# separate lines, then a trailing "Answer:".
+doc_to_text: "{{question.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/global_mmlu/default/ja/global_mmlu_ja_business.yaml
+++ b/lm_eval/tasks/global_mmlu/default/ja/global_mmlu_ja_business.yaml
+# Generated by _generate_configs.py
+include: _ja_template_yaml
+process_docs: !function utils.process_business
+task: global_mmlu_ja_business