Unverified commit b2c090cc, authored by Minho Ryu and committed by GitHub
Browse files

aggregate by group (total and categories) (#2643)

parent ed9c6fc8
# Per-subject task config for the `math` subset, layered on the included base config.
dataset_name: math
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_math
tag: kmmlu_direct_hard_stem_tasks
# Per-subject task config for the `mechanical_engineering` subset, layered on the included base config.
dataset_name: mechanical_engineering
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_mechanical_engineering
tag: kmmlu_direct_hard_stem_tasks
# Per-subject task config for the `nondestructive_testing` subset, layered on the included base config.
dataset_name: nondestructive_testing
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_nondestructive_testing
tag: kmmlu_direct_hard_applied_science_tasks
# Per-subject task config for the `patent` subset, layered on the included base config.
dataset_name: patent
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_patent
tag: kmmlu_direct_hard_other_tasks
# Per-subject task config for the `political_science_and_sociology` subset, layered on the included base config.
dataset_name: political_science_and_sociology
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_political_science_and_sociology
tag: kmmlu_direct_hard_humss_tasks
# Per-subject task config for the `psychology` subset, layered on the included base config.
dataset_name: psychology
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_psychology
tag: kmmlu_direct_hard_humss_tasks
# Per-subject task config for the `public_safety` subset, layered on the included base config.
dataset_name: public_safety
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_public_safety
tag: kmmlu_direct_hard_other_tasks
# Per-subject task config for the `railway_and_automotive_engineering` subset, layered on the included base config.
dataset_name: railway_and_automotive_engineering
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_railway_and_automotive_engineering
tag: kmmlu_direct_hard_applied_science_tasks
# Per-subject task config for the `real_estate` subset, layered on the included base config.
dataset_name: real_estate
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_real_estate
tag: kmmlu_direct_hard_other_tasks
# Per-subject task config for the `refrigerating_machinery` subset, layered on the included base config.
dataset_name: refrigerating_machinery
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_refrigerating_machinery
tag: kmmlu_direct_hard_other_tasks
# Per-subject task config for the `social_welfare` subset, layered on the included base config.
dataset_name: social_welfare
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_social_welfare
tag: kmmlu_direct_hard_humss_tasks
# Per-subject task config for the `taxation` subset, layered on the included base config.
dataset_name: taxation
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_taxation
tag: kmmlu_direct_hard_humss_tasks
# Per-subject task config for the `telecommunications_and_wireless_technology` subset, layered on the included base config.
dataset_name: telecommunications_and_wireless_technology
include: _direct_hard_kmmlu_yaml
# Exactly one `task` key: duplicate keys are invalid YAML and parsers silently
# keep only the last one, so the value that was already taking effect is kept.
task: kmmlu_direct_hard_telecommunications_and_wireless_technology
tag: kmmlu_direct_hard_applied_science_tasks
tag:
- kmmlu
- kmmlu_hard
dataset_path: HAERAE-HUB/KMMLU-HARD
output_type: multiple_choice
test_split: test
......@@ -12,8 +9,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 2.0
# Top-level group: rolls up the four category groups listed under `task`.
group: kmmlu_hard
task:
  - kmmlu_hard_stem
  - kmmlu_hard_other
  - kmmlu_hard_applied_science
  - kmmlu_hard_humss
# `weight_by_size` belongs to the `acc` entry, so it is nested under that list
# item rather than left as a stray top-level key.
aggregate_metric_list:
  - metric: acc
    weight_by_size: true
# `version` is a field of `metadata`; without the indent `metadata` parses as null.
metadata:
  version: 2.0
# Category group; its single `task` entry is presumably a tag that selects the
# member tasks (TODO confirm against the per-subject configs).
group: kmmlu_hard_applied_science
task:
  - kmmlu_hard_applied_science_tasks
# `weight_by_size` belongs to the `acc` entry, so it is nested under that list
# item rather than left as a stray top-level key.
aggregate_metric_list:
  - metric: acc
    weight_by_size: true
# `version` is a field of `metadata`; without the indent `metadata` parses as null.
metadata:
  version: 2.0
# Category group; its single `task` entry is presumably a tag that selects the
# member tasks (TODO confirm against the per-subject configs).
group: kmmlu_hard_humss
task:
  - kmmlu_hard_humss_tasks
# `weight_by_size` belongs to the `acc` entry, so it is nested under that list
# item rather than left as a stray top-level key.
aggregate_metric_list:
  - metric: acc
    weight_by_size: true
# `version` is a field of `metadata`; without the indent `metadata` parses as null.
metadata:
  version: 2.0
# Category group; its single `task` entry is presumably a tag that selects the
# member tasks (TODO confirm against the per-subject configs).
group: kmmlu_hard_other
task:
  - kmmlu_hard_other_tasks
# `weight_by_size` belongs to the `acc` entry, so it is nested under that list
# item rather than left as a stray top-level key.
aggregate_metric_list:
  - metric: acc
    weight_by_size: true
# `version` is a field of `metadata`; without the indent `metadata` parses as null.
metadata:
  version: 2.0
# Category group; its single `task` entry is presumably a tag that selects the
# member tasks (TODO confirm against the per-subject configs).
group: kmmlu_hard_stem
task:
  - kmmlu_hard_stem_tasks
# `weight_by_size` belongs to the `acc` entry, so it is nested under that list
# item rather than left as a stray top-level key.
aggregate_metric_list:
  - metric: acc
    weight_by_size: true
# `version` is a field of `metadata`; without the indent `metadata` parses as null.
metadata:
  version: 2.0
# Task identity first: the task name and the tag it is filed under.
task: kmmlu_hard_accounting
tag: kmmlu_hard_humss_tasks
# Data selection: the `accounting` subset, layered on the included base config.
dataset_name: accounting
include: _hard_kmmlu_yaml
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment