Unverified commit 543617fe, authored by Hailey Schoelkopf and committed by GitHub
Browse files

Bump version to v0.4.4 ; Fixes to TMMLUplus (#2280)

parent 7a1614eb
...@@ -299,13 +299,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -299,13 +299,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
"When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)." "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
) )
if (
args.num_fewshot is None or args.num_fewshot == 0
) and args.fewshot_as_multiturn:
raise ValueError(
"If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
)
if args.include_path is not None: if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}") eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path) task_manager = TaskManager(args.verbosity, include_path=args.include_path)
......
...@@ -489,10 +489,12 @@ class TaskManager: ...@@ -489,10 +489,12 @@ class TaskManager:
if attr in config: if attr in config:
if attr == "group" and print_info: if attr == "group" and print_info:
self.logger.info( self.logger.info(
"`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. " "`group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. "
"`tag` will be used to allow to call a collection of tasks just like `group`. " "The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. "
"`group` will be removed in order to not cause confusion with the new ConfigurableGroup " "`group`s which aggregate across subtasks must be only defined in a separate group config file, "
"which will be the official way to create groups with addition of group-wide configurations." "which will be the official way to create groups that support cross-task aggregation as in `mmlu`. "
"Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs "
"for more information."
) )
print_info = False print_info = False
# attr = "tag" # attr = "tag"
......
...@@ -4,3 +4,10 @@ task: ...@@ -4,3 +4,10 @@ task:
- tmmluplus_social_sciences - tmmluplus_social_sciences
- tmmluplus_humanities - tmmluplus_humanities
- tmmluplus_STEM - tmmluplus_STEM
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
group: tmmluplus_STEM
task:
- tmmluplus_STEM_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
group: tmmluplus_humanities
task:
- tmmluplus_humanities_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
group: tmmluplus_other
task:
- tmmluplus_other_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
group: tmmluplus_social_sciences
task:
- tmmluplus_social_sciences_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: acc_norm
weight_by_size: True
metadata:
version: 2.0
...@@ -16,4 +16,4 @@ metric_list: ...@@ -16,4 +16,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 2.0
"dataset_name": "accounting" "dataset_name": "accounting"
"description": "以下為會計學的單選題,請提供正確答案的選項。\n\n" "description": "以下為會計學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other" "tag": "tmmluplus_other_tasks"
"group_alias": "other" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_accounting" "task": "tmmluplus_accounting"
"task_alias": "accounting" "task_alias": "accounting"
"dataset_name": "administrative_law" "dataset_name": "administrative_law"
"description": "以下為行政法的單選題,請提供正確答案的選項。\n\n" "description": "以下為行政法的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_humanities" "tag": "tmmluplus_humanities_tasks"
"group_alias": "humanities" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_administrative_law" "task": "tmmluplus_administrative_law"
"task_alias": "administrative law" "task_alias": "administrative law"
"dataset_name": "advance_chemistry" "dataset_name": "advance_chemistry"
"description": "以下為化學的單選題,請提供正確答案的選項。\n\n" "description": "以下為化學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_STEM" "tag": "tmmluplus_STEM_tasks"
"group_alias": "STEM" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_advance_chemistry" "task": "tmmluplus_advance_chemistry"
"task_alias": "advance chemistry" "task_alias": "advance chemistry"
"dataset_name": "agriculture" "dataset_name": "agriculture"
"description": "以下為農業的單選題,請提供正確答案的選項。\n\n" "description": "以下為農業的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other" "tag": "tmmluplus_other_tasks"
"group_alias": "other" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_agriculture" "task": "tmmluplus_agriculture"
"task_alias": "agriculture" "task_alias": "agriculture"
"dataset_name": "anti_money_laundering" "dataset_name": "anti_money_laundering"
"description": "以下為洗錢防制的單選題,請提供正確答案的選項。\n\n" "description": "以下為洗錢防制的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_humanities" "tag": "tmmluplus_humanities_tasks"
"group_alias": "humanities" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_anti_money_laundering" "task": "tmmluplus_anti_money_laundering"
"task_alias": "anti money laundering" "task_alias": "anti money laundering"
"dataset_name": "auditing" "dataset_name": "auditing"
"description": "以下為審計學的單選題,請提供正確答案的選項。\n\n" "description": "以下為審計學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other" "tag": "tmmluplus_other_tasks"
"group_alias": "other" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_auditing" "task": "tmmluplus_auditing"
"task_alias": "auditing" "task_alias": "auditing"
"dataset_name": "basic_medical_science" "dataset_name": "basic_medical_science"
"description": "以下為基礎醫學的單選題,請提供正確答案的選項。\n\n" "description": "以下為基礎醫學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_STEM" "tag": "tmmluplus_STEM_tasks"
"group_alias": "STEM" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_basic_medical_science" "task": "tmmluplus_basic_medical_science"
"task_alias": "basic medical science" "task_alias": "basic medical science"
"dataset_name": "business_management" "dataset_name": "business_management"
"description": "以下為企業管理的單選題,請提供正確答案的選項。\n\n" "description": "以下為企業管理的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other" "tag": "tmmluplus_other_tasks"
"group_alias": "other" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_business_management" "task": "tmmluplus_business_management"
"task_alias": "business management" "task_alias": "business management"
"dataset_name": "chinese_language_and_literature" "dataset_name": "chinese_language_and_literature"
"description": "以下為國文的單選題,請提供正確答案的選項。\n\n" "description": "以下為國文的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_social_sciences" "tag": "tmmluplus_social_sciences_tasks"
"group_alias": "social sciences" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_chinese_language_and_literature" "task": "tmmluplus_chinese_language_and_literature"
"task_alias": "chinese language and literature" "task_alias": "chinese language and literature"
"dataset_name": "clinical_psychology" "dataset_name": "clinical_psychology"
"description": "以下為臨床心理學的單選題,請提供正確答案的選項。\n\n" "description": "以下為臨床心理學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_social_sciences" "tag": "tmmluplus_social_sciences_tasks"
"group_alias": "social sciences" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_clinical_psychology" "task": "tmmluplus_clinical_psychology"
"task_alias": "clinical psychology" "task_alias": "clinical psychology"
"dataset_name": "computer_science" "dataset_name": "computer_science"
"description": "以下為資訊工程的單選題,請提供正確答案的選項。\n\n" "description": "以下為資訊工程的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_STEM" "tag": "tmmluplus_STEM_tasks"
"group_alias": "STEM" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_computer_science" "task": "tmmluplus_computer_science"
"task_alias": "computer science" "task_alias": "computer science"
"dataset_name": "culinary_skills" "dataset_name": "culinary_skills"
"description": "以下為餐旅的單選題,請提供正確答案的選項。\n\n" "description": "以下為餐旅的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other" "tag": "tmmluplus_other_tasks"
"group_alias": "other" "include": "_tmmluplus_template_yaml"
"include": "_default_template_yaml"
"task": "tmmluplus_culinary_skills" "task": "tmmluplus_culinary_skills"
"task_alias": "culinary skills" "task_alias": "culinary skills"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment