Adds MMLU CoT, gsm8k and arc_challenge for llama instruct (#2829)

* llama-style MMLU CoT
* Refactor MMLU CoT template YAML to simplify 'until' structure
* Add GSM8K task configuration for LLaMA3 with few-shot examples
* Fix missing newline at end of MMLU CoT YAML file
* Add ARC-Challenge task configuration and processing utility
* Add additional MMLU and ARC-Challenge task variants to README
* Update README with notes on arc_challenge_llama dataset preprocessing

Adds MMLU CoT, gsm8k and arc_challenge for llama instruct (#2829)
* llama-style MMLU CoT
* Refactor MMLU CoT template YAML to simplify 'until' structure
* Add GSM8K task configuration for LLaMA3 with few-shot examples
* Fix missing newline at end of MMLU CoT YAML file
* Add ARC-Challenge task configuration and processing utility
* Add additional MMLU and ARC-Challenge task variants to README
* Update README with notes on arc_challenge_llama dataset preprocessing
3816796e · Alexandre Marques · GitHub · 1514ac1e · 3816796e · 3816796e
Unverified Commit 3816796e authored Mar 30, 2025 by Alexandre Marques Committed by GitHub Mar 30, 2025
20 changed files
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_high_school_us_history.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_high_school_us_history.yaml
+"dataset_name": "high_school_us_history"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_high_school_us_history"
+"task_alias": "high school us history"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_high_school_world_history.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_high_school_world_history.yaml
+"dataset_name": "high_school_world_history"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_high_school_world_history"
+"task_alias": "high school world history"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_human_aging.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_human_aging.yaml
+"dataset_name": "human_aging"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_other_tasks"
+"task": "mmlu_cot_llama_human_aging"
+"task_alias": "human aging"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_human_sexuality.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_human_sexuality.yaml
+"dataset_name": "human_sexuality"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_social_sciences_tasks"
+"task": "mmlu_cot_llama_human_sexuality"
+"task_alias": "human sexuality"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_international_law.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_international_law.yaml
+"dataset_name": "international_law"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_international_law"
+"task_alias": "international law"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_jurisprudence.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_jurisprudence.yaml
+"dataset_name": "jurisprudence"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_jurisprudence"
+"task_alias": "jurisprudence"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_logical_fallacies.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_logical_fallacies.yaml
+"dataset_name": "logical_fallacies"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_logical_fallacies"
+"task_alias": "logical fallacies"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_machine_learning.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_machine_learning.yaml
+"dataset_name": "machine_learning"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_stem_tasks"
+"task": "mmlu_cot_llama_machine_learning"
+"task_alias": "machine learning"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_management.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_management.yaml
+"dataset_name": "management"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_other_tasks"
+"task": "mmlu_cot_llama_management"
+"task_alias": "management"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_marketing.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_marketing.yaml
+"dataset_name": "marketing"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_other_tasks"
+"task": "mmlu_cot_llama_marketing"
+"task_alias": "marketing"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_medical_genetics.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_medical_genetics.yaml
+"dataset_name": "medical_genetics"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_other_tasks"
+"task": "mmlu_cot_llama_medical_genetics"
+"task_alias": "medical genetics"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_miscellaneous.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_miscellaneous.yaml
+"dataset_name": "miscellaneous"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_other_tasks"
+"task": "mmlu_cot_llama_miscellaneous"
+"task_alias": "miscellaneous"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_moral_disputes.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_moral_disputes.yaml
+"dataset_name": "moral_disputes"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_moral_disputes"
+"task_alias": "moral disputes"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_moral_scenarios.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_moral_scenarios.yaml
+"dataset_name": "moral_scenarios"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_moral_scenarios"
+"task_alias": "moral scenarios"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_nutrition.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_nutrition.yaml
+"dataset_name": "nutrition"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_other_tasks"
+"task": "mmlu_cot_llama_nutrition"
+"task_alias": "nutrition"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_philosophy.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_philosophy.yaml
+"dataset_name": "philosophy"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_philosophy"
+"task_alias": "philosophy"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_prehistory.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_prehistory.yaml
+"dataset_name": "prehistory"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_prehistory"
+"task_alias": "prehistory"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_professional_accounting.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_professional_accounting.yaml
+"dataset_name": "professional_accounting"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_other_tasks"
+"task": "mmlu_cot_llama_professional_accounting"
+"task_alias": "professional accounting"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_professional_law.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_professional_law.yaml
+"dataset_name": "professional_law"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_humanities_tasks"
+"task": "mmlu_cot_llama_professional_law"
+"task_alias": "professional law"
--- a/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_professional_medicine.yaml
+++ b/lm_eval/tasks/llama3/instruct/mmlu_cot/mmlu_professional_medicine.yaml
+"dataset_name": "professional_medicine"
+"description": ""
+"include": "_mmlu_cot_llama_template_yaml"
+"tag": "mmlu_cot_llama_other_tasks"
+"task": "mmlu_cot_llama_professional_medicine"
+"task_alias": "professional medicine"