Bump version to v0.4.4 ; Fixes to TMMLUplus (#2280)

543617fe · Hailey Schoelkopf · GitHub · 7a1614eb · 543617fe · 543617fe
Unverified commit 543617fe, authored Sep 05, 2024 by Hailey Schoelkopf; committed by GitHub on Sep 05, 2024.
20 changed files
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -299,13 +299,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
        )
-    if (
-        args.num_fewshot is None or args.num_fewshot == 0
-    ) and args.fewshot_as_multiturn:
-        raise ValueError(
-            "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
-        )
    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -489,10 +489,12 @@ class TaskManager:
                            if attr in config:
                                if attr == "group" and print_info:
                                    self.logger.info(
-                                        "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. "
+                                        "`group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. "
-                                        "`tag` will be used to allow to call a collection of tasks just like `group`. "
+                                        "The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. "
-                                        "`group` will be removed in order to not cause confusion with the new ConfigurableGroup "
+                                        "`group`s which aggregate across subtasks must be only defined in a separate group config file, "
-                                        "which will be the official way to create groups with addition of group-wide configurations."
+                                        "which will be the official way to create groups that support cross-task aggregation as in `mmlu`. "
+                                        "Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs "
+                                        "for more information."
                                    )
                                    print_info = False
                                    # attr = "tag"

--- a/lm_eval/tasks/tmmluplus/default/tmmluplus.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus.yaml
@@ -4,3 +4,10 @@ task:
 - tmmluplus_social_sciences
 - tmmluplus_humanities
 - tmmluplus_STEM
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+  - metric: acc_norm
+    weight_by_size: True
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/tmmluplus/default/_tmmluplus_STEM.yaml
+++ b/lm_eval/tasks/tmmluplus/default/_tmmluplus_STEM.yaml
+group: tmmluplus_STEM
+task:
+- tmmluplus_STEM_tasks
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+  - metric: acc_norm
+    weight_by_size: True
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/tmmluplus/default/_tmmluplus_humanities.yaml
+++ b/lm_eval/tasks/tmmluplus/default/_tmmluplus_humanities.yaml
+group: tmmluplus_humanities
+task:
+- tmmluplus_humanities_tasks
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+  - metric: acc_norm
+    weight_by_size: True
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/tmmluplus/default/_tmmluplus_other.yaml
+++ b/lm_eval/tasks/tmmluplus/default/_tmmluplus_other.yaml
+group: tmmluplus_other
+task:
+- tmmluplus_other_tasks
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+  - metric: acc_norm
+    weight_by_size: True
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/tmmluplus/default/_tmmluplus_social_sciences.yaml
+++ b/lm_eval/tasks/tmmluplus/default/_tmmluplus_social_sciences.yaml
+group: tmmluplus_social_sciences
+task:
+- tmmluplus_social_sciences_tasks
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+  - metric: acc_norm
+    weight_by_size: True
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/tmmluplus/default/_default_template_yaml
+++ b/lm_eval/tasks/tmmluplus/default/_tmmluplus_template_yaml
@@ -16,4 +16,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: 1.0
+  version: 2.0
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_accounting.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_accounting.yaml
 "dataset_name": "accounting"
 "description": "以下為會計學的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_other"
+"tag": "tmmluplus_other_tasks"
-"group_alias": "other"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_accounting"
 "task_alias": "accounting"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_administrative_law.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_administrative_law.yaml
 "dataset_name": "administrative_law"
 "description": "以下為行政法的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_humanities"
+"tag": "tmmluplus_humanities_tasks"
-"group_alias": "humanities"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_administrative_law"
 "task_alias": "administrative law"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_advance_chemistry.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_advance_chemistry.yaml
 "dataset_name": "advance_chemistry"
 "description": "以下為化學的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_STEM"
+"tag": "tmmluplus_STEM_tasks"
-"group_alias": "STEM"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_advance_chemistry"
 "task_alias": "advance chemistry"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_agriculture.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_agriculture.yaml
 "dataset_name": "agriculture"
 "description": "以下為農業的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_other"
+"tag": "tmmluplus_other_tasks"
-"group_alias": "other"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_agriculture"
 "task_alias": "agriculture"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_anti_money_laundering.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_anti_money_laundering.yaml
 "dataset_name": "anti_money_laundering"
 "description": "以下為洗錢防制的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_humanities"
+"tag": "tmmluplus_humanities_tasks"
-"group_alias": "humanities"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_anti_money_laundering"
 "task_alias": "anti money laundering"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_auditing.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_auditing.yaml
 "dataset_name": "auditing"
 "description": "以下為審計學的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_other"
+"tag": "tmmluplus_other_tasks"
-"group_alias": "other"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_auditing"
 "task_alias": "auditing"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_basic_medical_science.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_basic_medical_science.yaml
 "dataset_name": "basic_medical_science"
 "description": "以下為基礎醫學的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_STEM"
+"tag": "tmmluplus_STEM_tasks"
-"group_alias": "STEM"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_basic_medical_science"
 "task_alias": "basic medical science"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_business_management.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_business_management.yaml
 "dataset_name": "business_management"
 "description": "以下為企業管理的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_other"
+"tag": "tmmluplus_other_tasks"
-"group_alias": "other"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_business_management"
 "task_alias": "business management"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_chinese_language_and_literature.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_chinese_language_and_literature.yaml
 "dataset_name": "chinese_language_and_literature"
 "description": "以下為國文的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_social_sciences"
+"tag": "tmmluplus_social_sciences_tasks"
-"group_alias": "social sciences"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_chinese_language_and_literature"
 "task_alias": "chinese language and literature"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_clinical_psychology.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_clinical_psychology.yaml
 "dataset_name": "clinical_psychology"
 "description": "以下為臨床心理學的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_social_sciences"
+"tag": "tmmluplus_social_sciences_tasks"
-"group_alias": "social sciences"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_clinical_psychology"
 "task_alias": "clinical psychology"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_computer_science.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_computer_science.yaml
 "dataset_name": "computer_science"
 "description": "以下為資訊工程的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_STEM"
+"tag": "tmmluplus_STEM_tasks"
-"group_alias": "STEM"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_computer_science"
 "task_alias": "computer science"
--- a/lm_eval/tasks/tmmluplus/default/tmmluplus_culinary_skills.yaml
+++ b/lm_eval/tasks/tmmluplus/default/tmmluplus_culinary_skills.yaml
 "dataset_name": "culinary_skills"
 "description": "以下為餐旅的單選題，請提供正確答案的選項。\n\n"
-"group": "tmmluplus_other"
+"tag": "tmmluplus_other_tasks"
-"group_alias": "other"
+"include": "_tmmluplus_template_yaml"
-"include": "_default_template_yaml"
 "task": "tmmluplus_culinary_skills"
 "task_alias": "culinary skills"