"vscode:/vscode.git/clone" did not exist on "029617179f5e829f2692b9a07be930ee8f7dd0c2"
Unverified Commit b2c090cc authored by Minho Ryu's avatar Minho Ryu Committed by GitHub
Browse files

aggregate by group (total and categories) (#2643)

parent ed9c6fc8
dataset_name: math dataset_name: math
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_math task: kmmlu_direct_hard_math
tag: kmmlu_direct_hard_stem_tasks
dataset_name: mechanical_engineering dataset_name: mechanical_engineering
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_mechanical_engineering task: kmmlu_direct_hard_mechanical_engineering
tag: kmmlu_direct_hard_stem_tasks
dataset_name: nondestructive_testing dataset_name: nondestructive_testing
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_nondestructive_testing task: kmmlu_direct_hard_nondestructive_testing
tag: kmmlu_direct_hard_applied_science_tasks
dataset_name: patent dataset_name: patent
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_patent task: kmmlu_direct_hard_patent
tag: kmmlu_direct_hard_other_tasks
dataset_name: political_science_and_sociology dataset_name: political_science_and_sociology
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_political_science_and_sociology task: kmmlu_direct_hard_political_science_and_sociology
tag: kmmlu_direct_hard_humss_tasks
dataset_name: psychology dataset_name: psychology
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_psychology task: kmmlu_direct_hard_psychology
tag: kmmlu_direct_hard_humss_tasks
dataset_name: public_safety dataset_name: public_safety
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_public_safety task: kmmlu_direct_hard_public_safety
tag: kmmlu_direct_hard_other_tasks
dataset_name: railway_and_automotive_engineering dataset_name: railway_and_automotive_engineering
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_railway_and_automotive_engineering task: kmmlu_direct_hard_railway_and_automotive_engineering
tag: kmmlu_direct_hard_applied_science_tasks
dataset_name: real_estate dataset_name: real_estate
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_real_estate task: kmmlu_direct_hard_real_estate
tag: kmmlu_direct_hard_other_tasks
dataset_name: refrigerating_machinery dataset_name: refrigerating_machinery
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_refrigerating_machinery task: kmmlu_direct_hard_refrigerating_machinery
tag: kmmlu_direct_hard_other_tasks
dataset_name: social_welfare dataset_name: social_welfare
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_social_welfare task: kmmlu_direct_hard_social_welfare
tag: kmmlu_direct_hard_humss_tasks
dataset_name: taxation dataset_name: taxation
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_taxation task: kmmlu_direct_hard_taxation
tag: kmmlu_direct_hard_humss_tasks
dataset_name: telecommunications_and_wireless_technology dataset_name: telecommunications_and_wireless_technology
include: _direct_hard_kmmlu_yaml include: _direct_hard_kmmlu_yaml
task: kmmlu_hard_direct_telecommunications_and_wireless_technology task: kmmlu_direct_hard_telecommunications_and_wireless_technology
tag: kmmlu_direct_hard_applied_science_tasks
tag:
- kmmlu
- kmmlu_hard
dataset_path: HAERAE-HUB/KMMLU-HARD dataset_path: HAERAE-HUB/KMMLU-HARD
output_type: multiple_choice output_type: multiple_choice
test_split: test test_split: test
...@@ -12,8 +9,5 @@ metric_list: ...@@ -12,8 +9,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata: metadata:
version: 2.0 version: 2.0
group: kmmlu_hard
task:
- kmmlu_hard_stem
- kmmlu_hard_other
- kmmlu_hard_applied_science
- kmmlu_hard_humss
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 2.0
group: kmmlu_hard_applied_science
task:
- kmmlu_hard_applied_science_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 2.0
group: kmmlu_hard_humss
task:
- kmmlu_hard_humss_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 2.0
group: kmmlu_hard_other
task:
- kmmlu_hard_other_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 2.0
group: kmmlu_hard_stem
task:
- kmmlu_hard_stem_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 2.0
dataset_name: accounting dataset_name: accounting
include: _hard_kmmlu_yaml include: _hard_kmmlu_yaml
task: kmmlu_hard_accounting task: kmmlu_hard_accounting
tag: kmmlu_hard_humss_tasks
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment