Commit ded890f3 (unverified), authored by Jinho Heo, committed by GitHub
Browse files

Add kmmlu multiple-choice(accuracy) task (#2849)

parent febd19d8
......@@ -32,6 +32,7 @@ Homepage: https://huggingface.co/datasets/HAERAE-HUB/KMMLU
#### Tasks
The following tasks evaluate subjects in the KMMLU dataset:
- `kmmlu_{subject_english}`
- `kmmlu_direct_{subject_english}`
The following tasks evaluate subjects in the KMMLU-Hard dataset:
......
---
# Shared defaults for the KMMLU multiple-choice (accuracy) tasks.
# NOTE(review): presumably this is the `_default_kmmlu_yaml` file that the
# per-subject configs pull in through `include` — confirm against the repo.
dataset_path: HAERAE-HUB/KMMLU
output_type: multiple_choice
test_split: test
fewshot_split: dev
# Prompt layout: the stripped question, the four lettered options, then the
# Korean answer cue ("정답:" = "Answer:").
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:"
doc_to_choice: ["A", "B", "C", "D"]
# Target is `answer` minus one, i.e. an index into doc_to_choice.
# Assumes the dataset stores `answer` as a 1-based integer — TODO confirm.
doc_to_target: "{{answer-1}}"
metric_list:
  # `aggregation` and `higher_is_better` are properties of this metric entry,
  # so they must be nested under it rather than sit at the top level.
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 2.0
---
# Group config: reports `acc` aggregated over everything tagged
# `kmmlu_applied_science_tasks`.
group: kmmlu_applied_science
task:
  - kmmlu_applied_science_tasks
aggregate_metric_list:
  # `weight_by_size` belongs to this metric entry, not to the top level.
  # NOTE(review): presumably weights each subtask by its sample count — confirm.
  - metric: acc
    weight_by_size: true
metadata:
  version: 2.0
---
# Top-level group config: reports `acc` aggregated over the four KMMLU
# category groups listed under `task`.
group: kmmlu
task:
  - kmmlu_stem
  - kmmlu_other
  - kmmlu_applied_science
  - kmmlu_humss
aggregate_metric_list:
  # `weight_by_size` belongs to this metric entry, not to the top level.
  # NOTE(review): presumably weights each subtask by its sample count — confirm.
  - metric: acc
    weight_by_size: true
metadata:
  version: 2.0
---
# Group config: reports `acc` aggregated over everything tagged
# `kmmlu_humss_tasks`.
group: kmmlu_humss
task:
  - kmmlu_humss_tasks
aggregate_metric_list:
  # `weight_by_size` belongs to this metric entry, not to the top level.
  # NOTE(review): presumably weights each subtask by its sample count — confirm.
  - metric: acc
    weight_by_size: true
metadata:
  version: 2.0
---
# Group config: reports `acc` aggregated over everything tagged
# `kmmlu_other_tasks`.
group: kmmlu_other
task:
  - kmmlu_other_tasks
aggregate_metric_list:
  # `weight_by_size` belongs to this metric entry, not to the top level.
  # NOTE(review): presumably weights each subtask by its sample count — confirm.
  - metric: acc
    weight_by_size: true
metadata:
  version: 2.0
---
# Group config: reports `acc` aggregated over everything tagged
# `kmmlu_stem_tasks`.
group: kmmlu_stem
task:
  - kmmlu_stem_tasks
aggregate_metric_list:
  # `weight_by_size` belongs to this metric entry, not to the top level.
  # NOTE(review): presumably weights each subtask by its sample count — confirm.
  - metric: acc
    weight_by_size: true
metadata:
  version: 2.0
---
# Subject task: KMMLU `Accounting` subset on top of the shared defaults,
# tagged into `kmmlu_humss_tasks`.
dataset_name: Accounting
include: _default_kmmlu_yaml
task: kmmlu_accounting
tag: kmmlu_humss_tasks
---
# Subject task: KMMLU `Agricultural-Sciences` subset on top of the shared
# defaults, tagged into `kmmlu_other_tasks`.
dataset_name: Agricultural-Sciences
include: _default_kmmlu_yaml
task: kmmlu_agricultural_sciences
tag: kmmlu_other_tasks
---
# Subject task: KMMLU `Aviation-Engineering-and-Maintenance` subset on top of
# the shared defaults, tagged into `kmmlu_applied_science_tasks`.
dataset_name: Aviation-Engineering-and-Maintenance
include: _default_kmmlu_yaml
task: kmmlu_aviation_engineering_and_maintenance
tag: kmmlu_applied_science_tasks
---
# Subject task: KMMLU `Biology` subset on top of the shared defaults,
# tagged into `kmmlu_stem_tasks`.
dataset_name: Biology
include: _default_kmmlu_yaml
task: kmmlu_biology
tag: kmmlu_stem_tasks
---
# Subject task: KMMLU `Chemical-Engineering` subset on top of the shared
# defaults, tagged into `kmmlu_stem_tasks`.
dataset_name: Chemical-Engineering
include: _default_kmmlu_yaml
task: kmmlu_chemical_engineering
tag: kmmlu_stem_tasks
---
# Subject task: KMMLU `Chemistry` subset on top of the shared defaults,
# tagged into `kmmlu_stem_tasks`.
dataset_name: Chemistry
include: _default_kmmlu_yaml
task: kmmlu_chemistry
tag: kmmlu_stem_tasks
---
# Subject task: KMMLU `Civil-Engineering` subset on top of the shared
# defaults, tagged into `kmmlu_stem_tasks`.
dataset_name: Civil-Engineering
include: _default_kmmlu_yaml
task: kmmlu_civil_engineering
tag: kmmlu_stem_tasks
---
# Subject task: KMMLU `Computer-Science` subset on top of the shared
# defaults, tagged into `kmmlu_stem_tasks`.
dataset_name: Computer-Science
include: _default_kmmlu_yaml
task: kmmlu_computer_science
tag: kmmlu_stem_tasks
---
# Subject task: KMMLU `Construction` subset on top of the shared defaults,
# tagged into `kmmlu_other_tasks`.
dataset_name: Construction
include: _default_kmmlu_yaml
task: kmmlu_construction
tag: kmmlu_other_tasks
---
# Subject task: KMMLU `Criminal-Law` subset on top of the shared defaults,
# tagged into `kmmlu_humss_tasks`.
dataset_name: Criminal-Law
include: _default_kmmlu_yaml
task: kmmlu_criminal_law
tag: kmmlu_humss_tasks
---
# Subject task: KMMLU `Ecology` subset on top of the shared defaults,
# tagged into `kmmlu_stem_tasks`.
dataset_name: Ecology
include: _default_kmmlu_yaml
task: kmmlu_ecology
tag: kmmlu_stem_tasks
---
# Subject task: KMMLU `Economics` subset on top of the shared defaults,
# tagged into `kmmlu_humss_tasks`.
dataset_name: Economics
include: _default_kmmlu_yaml
task: kmmlu_economics
tag: kmmlu_humss_tasks
---
# Subject task: KMMLU `Education` subset on top of the shared defaults,
# tagged into `kmmlu_humss_tasks`.
dataset_name: Education
include: _default_kmmlu_yaml
task: kmmlu_education
tag: kmmlu_humss_tasks
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment