Unverified commit 543617fe, authored by Hailey Schoelkopf, committed by GitHub
Browse files

Bump version to v0.4.4 ; Fixes to TMMLUplus (#2280)

parent 7a1614eb
......@@ -299,13 +299,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
"When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
)
if (
args.num_fewshot is None or args.num_fewshot == 0
) and args.fewshot_as_multiturn:
raise ValueError(
"If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
......
......@@ -489,10 +489,12 @@ class TaskManager:
if attr in config:
if attr == "group" and print_info:
self.logger.info(
"`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. "
"`tag` will be used to allow to call a collection of tasks just like `group`. "
"`group` will be removed in order to not cause confusion with the new ConfigurableGroup "
"which will be the official way to create groups with addition of group-wide configurations."
"`group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. "
"The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. "
"`group`s which aggregate across subtasks must be only defined in a separate group config file, "
"which will be the official way to create groups that support cross-task aggregation as in `mmlu`. "
"Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs "
"for more information."
)
print_info = False
# attr = "tag"
......
......@@ -4,3 +4,10 @@ task:
- tmmluplus_social_sciences
- tmmluplus_humanities
- tmmluplus_STEM
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
group: tmmluplus_STEM
task:
- tmmluplus_STEM_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
group: tmmluplus_humanities
task:
- tmmluplus_humanities_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
group: tmmluplus_other
task:
- tmmluplus_other_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
group: tmmluplus_social_sciences
task:
- tmmluplus_social_sciences_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
......@@ -16,4 +16,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
version: 2.0
"dataset_name": "accounting"
"description": "以下為會計學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other"
"group_alias": "other"
"include": "_default_template_yaml"
"tag": "tmmluplus_other_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_accounting"
"task_alias": "accounting"
"dataset_name": "administrative_law"
"description": "以下為行政法的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_humanities"
"group_alias": "humanities"
"include": "_default_template_yaml"
"tag": "tmmluplus_humanities_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_administrative_law"
"task_alias": "administrative law"
"dataset_name": "advance_chemistry"
"description": "以下為化學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_STEM"
"group_alias": "STEM"
"include": "_default_template_yaml"
"tag": "tmmluplus_STEM_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_advance_chemistry"
"task_alias": "advance chemistry"
"dataset_name": "agriculture"
"description": "以下為農業的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other"
"group_alias": "other"
"include": "_default_template_yaml"
"tag": "tmmluplus_other_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_agriculture"
"task_alias": "agriculture"
"dataset_name": "anti_money_laundering"
"description": "以下為洗錢防制的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_humanities"
"group_alias": "humanities"
"include": "_default_template_yaml"
"tag": "tmmluplus_humanities_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_anti_money_laundering"
"task_alias": "anti money laundering"
"dataset_name": "auditing"
"description": "以下為審計學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other"
"group_alias": "other"
"include": "_default_template_yaml"
"tag": "tmmluplus_other_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_auditing"
"task_alias": "auditing"
"dataset_name": "basic_medical_science"
"description": "以下為基礎醫學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_STEM"
"group_alias": "STEM"
"include": "_default_template_yaml"
"tag": "tmmluplus_STEM_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_basic_medical_science"
"task_alias": "basic medical science"
"dataset_name": "business_management"
"description": "以下為企業管理的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other"
"group_alias": "other"
"include": "_default_template_yaml"
"tag": "tmmluplus_other_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_business_management"
"task_alias": "business management"
"dataset_name": "chinese_language_and_literature"
"description": "以下為國文的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_social_sciences"
"group_alias": "social sciences"
"include": "_default_template_yaml"
"tag": "tmmluplus_social_sciences_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_chinese_language_and_literature"
"task_alias": "chinese language and literature"
"dataset_name": "clinical_psychology"
"description": "以下為臨床心理學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_social_sciences"
"group_alias": "social sciences"
"include": "_default_template_yaml"
"tag": "tmmluplus_social_sciences_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_clinical_psychology"
"task_alias": "clinical psychology"
"dataset_name": "computer_science"
"description": "以下為資訊工程的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_STEM"
"group_alias": "STEM"
"include": "_default_template_yaml"
"tag": "tmmluplus_STEM_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_computer_science"
"task_alias": "computer science"
"dataset_name": "culinary_skills"
"description": "以下為餐旅的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other"
"group_alias": "other"
"include": "_default_template_yaml"
"tag": "tmmluplus_other_tasks"
"include": "_tmmluplus_template_yaml"
"task": "tmmluplus_culinary_skills"
"task_alias": "culinary skills"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment