Commit 02e841ce authored by lintangsutawika
Browse files

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

parents 90ad5db7 e74ec966
...@@ -3,7 +3,7 @@ from tqdm import tqdm ...@@ -3,7 +3,7 @@ from tqdm import tqdm
def main() -> None: def main() -> None:
subset = ["extended", "diamond", "experts", "main"] subset = ["extended", "diamond", "main"]
for task in tqdm(subset): for task in tqdm(subset):
file_name = f"gpqa_{task}_n_shot.yaml" file_name = f"gpqa_{task}_n_shot.yaml"
......
...@@ -3,7 +3,7 @@ from tqdm import tqdm ...@@ -3,7 +3,7 @@ from tqdm import tqdm
def main() -> None: def main() -> None:
subset = ["extended", "diamond", "experts", "main"] subset = ["extended", "diamond", "main"]
setting = "zeroshot" setting = "zeroshot"
for task in tqdm(subset): for task in tqdm(subset):
file_name = f"gpqa_{task}_{setting}.yaml" file_name = f"gpqa_{task}_{setting}.yaml"
......
...@@ -12,6 +12,7 @@ generation_kwargs: ...@@ -12,6 +12,7 @@ generation_kwargs:
temperature: 0.0 temperature: 0.0
max_gen_toks: 1280 max_gen_toks: 1280
process_results: !function utils.process_results process_results: !function utils.process_results
num_fewshot: 0
metric_list: metric_list:
- metric: prompt_level_strict_acc - metric: prompt_level_strict_acc
aggregation: mean aggregation: mean
......
...@@ -3,6 +3,7 @@ group: ...@@ -3,6 +3,7 @@ group:
- kmmlu_hard_cot - kmmlu_hard_cot
dataset_path: HAERAE-HUB/KMMLU-HARD dataset_path: HAERAE-HUB/KMMLU-HARD
output_type: generate_until output_type: generate_until
validation_split: dev # not meant to be used, only here to silence warnings
test_split: test test_split: test
doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}" doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}"
metric_list: metric_list:
...@@ -17,6 +18,7 @@ generation_kwargs: ...@@ -17,6 +18,7 @@ generation_kwargs:
do_sample: false do_sample: false
max_gen_toks: 2048 max_gen_toks: 2048
temperature: 0.0 temperature: 0.0
num_fewshot: 0
filter_list: filter_list:
- name: "get-answer" - name: "get-answer"
filter: filter:
...@@ -26,3 +28,4 @@ filter_list: ...@@ -26,3 +28,4 @@ filter_list:
- function: "take_first" - function: "take_first"
metadata: metadata:
version: 2.0 version: 2.0
num_fewshot: 5
...@@ -21,3 +21,5 @@ metric_list: ...@@ -21,3 +21,5 @@ metric_list:
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -21,3 +21,5 @@ metric_list: ...@@ -21,3 +21,5 @@ metric_list:
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
# KorMedMCQA
### Paper
Title: `KorMedMCQA: Multi-Choice Question Answering Benchmark for Korean Healthcare Professional Licensing Examinations`
Abstract: `We introduce KorMedMCQA, the first Korean multiple-choice question answering (MCQA) benchmark derived from Korean healthcare professional licensing examinations, covering from the year 2012 to year 2023. This dataset consists of a selection of questions from the license examinations for doctors, nurses, and pharmacists, featuring a diverse array of subjects. We conduct baseline experiments on various large language models, including proprietary/open-source, multilingual/Korean-additional pretrained, and clinical context pretrained models, highlighting the potential for further enhancements. We make our data publicly available on HuggingFace and provide an evaluation script via LM-Harness, inviting further exploration and advancement in Korean healthcare environments.`
Paper: https://arxiv.org/abs/2403.01469
Homepage: https://huggingface.co/datasets/sean0042/KorMedMCQA
### Citation
```
@article{kweon2024kormedmcqa,
title={KorMedMCQA: Multi-Choice Question Answering Benchmark for Korean Healthcare Professional Licensing Examinations},
author={Sunjun Kweon and Byungjin Choi and Minkyu Kim and Rae Woong Park and Edward Choi},
journal={arXiv preprint arXiv:2403.01469},
year={2024}
}
```
### Groups and Tasks
* `kormedmcqa`: Runs `kormedmcqa_doctor`, `kormedmcqa_nurse`, and `kormedmcqa_pharm`.
#### Tasks
* `kormedmcqa_doctor`: `Official Korean Doctor Examination`
* `kormedmcqa_nurse`: `Official Korean Nurse Examination`
* `kormedmcqa_pharm`: `Official Korean Pharmacist Examination`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# KorMedMCQA task config for the `doctor` subset of sean0042/KorMedMCQA.
# Belongs to the `kormedmcqa` group.
group: kormedmcqa
task: kormedmcqa_doctor
dataset_path: sean0042/KorMedMCQA
dataset_name: doctor
test_split: test
# Few-shot examples are taken from the `dev` split.
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: generate_until
# Prompt: the question, five lettered options (A-E), then the answer cue "정답:".
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답:"
# `answer` is used as a 1-based index into the option letters.
doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - " "
generation_kwargs:
  # Stop sequences for generation.
  until:
    - "Q:"
    - "\n\n"
    - "</s>"
    - "."
  do_sample: false
  temperature: 0.0
# KorMedMCQA task config for the `nurse` subset of sean0042/KorMedMCQA.
# Belongs to the `kormedmcqa` group.
group: kormedmcqa
task: kormedmcqa_nurse
dataset_path: sean0042/KorMedMCQA
dataset_name: nurse
test_split: test
# Few-shot examples are taken from the `dev` split.
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: generate_until
# Prompt: the question, five lettered options (A-E), then the answer cue "정답:".
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답:"
# `answer` is used as a 1-based index into the option letters.
doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - " "
generation_kwargs:
  # Stop sequences for generation.
  until:
    - "Q:"
    - "\n\n"
    - "</s>"
    - "."
  do_sample: false
  temperature: 0.0
# KorMedMCQA task config for the `pharm` subset of sean0042/KorMedMCQA.
# Belongs to the `kormedmcqa` group.
group: kormedmcqa
task: kormedmcqa_pharm
dataset_path: sean0042/KorMedMCQA
dataset_name: pharm
test_split: test
# Few-shot examples are taken from the `dev` split.
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: generate_until
# Prompt: the question, five lettered options (A-E), then the answer cue "정답:".
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답:"
# `answer` is used as a 1-based index into the option letters.
doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - " "
generation_kwargs:
  # Stop sequences for generation.
  until:
    - "Q:"
    - "\n\n"
    - "</s>"
    - "."
  do_sample: false
  temperature: 0.0
...@@ -18,3 +18,5 @@ metric_list: ...@@ -18,3 +18,5 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -19,3 +19,5 @@ metric_list: ...@@ -19,3 +19,5 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -25,3 +25,5 @@ filter_list: ...@@ -25,3 +25,5 @@ filter_list:
- function: "take_first" - function: "take_first"
metadata: metadata:
version: 0.0 version: 0.0
dataset_kwargs:
trust_remote_code: true
...@@ -23,3 +23,5 @@ num_fewshot: 0 ...@@ -23,3 +23,5 @@ num_fewshot: 0
metadata: metadata:
version: 1.0 version: 1.0
num_fewshot: 4 num_fewshot: 4
dataset_kwargs:
trust_remote_code: true
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
- metric: acc - metric: acc
metadata: metadata:
version: 0.0 version: 0.0
dataset_kwargs:
trust_remote_code: true
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
- metric: acc - metric: acc
metadata: metadata:
version: 0.0 version: 0.0
dataset_kwargs:
trust_remote_code: true
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
- metric: acc - metric: acc
metadata: metadata:
version: 0.0 version: 0.0
dataset_kwargs:
trust_remote_code: true
...@@ -23,3 +23,5 @@ metric_list: ...@@ -23,3 +23,5 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -37,6 +37,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: ...@@ -37,6 +37,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
"query": QA_PROMPT + "\n\nQ: " + preprocess(doc["question"]) + "\nA:", "query": QA_PROMPT + "\n\nQ: " + preprocess(doc["question"]) + "\nA:",
"mc1_choices": doc["mc1_targets_choices"], "mc1_choices": doc["mc1_targets_choices"],
"mc2_choices": doc["mc2_targets_choices"], "mc2_choices": doc["mc2_targets_choices"],
"mc2_targets": {"labels": doc["mc2_targets_labels"]},
"gold": " ", "gold": " ",
} }
return out_doc return out_doc
......
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment