Commit 0d1ef037 authored by lintangsutawika's avatar lintangsutawika
Browse files

solved merge conflict

parents aa44be3f ada4a31d
"dataset_name": "war_Latn" "fewshot_split": "war_Latn"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "belebele_war_Latn" "task": "belebele_war_Latn"
"test_split": "war_Latn"
"dataset_name": "wol_Latn" "fewshot_split": "wol_Latn"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "belebele_wol_Latn" "task": "belebele_wol_Latn"
"test_split": "wol_Latn"
"dataset_name": "xho_Latn" "fewshot_split": "xho_Latn"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "belebele_xho_Latn" "task": "belebele_xho_Latn"
"test_split": "xho_Latn"
"dataset_name": "yor_Latn" "fewshot_split": "yor_Latn"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "belebele_yor_Latn" "task": "belebele_yor_Latn"
"test_split": "yor_Latn"
"dataset_name": "zho_Hans" "fewshot_split": "zho_Hans"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "belebele_zho_Hans" "task": "belebele_zho_Hans"
"test_split": "zho_Hans"
"dataset_name": "zho_Hant" "fewshot_split": "zho_Hant"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "belebele_zho_Hant" "task": "belebele_zho_Hant"
"test_split": "zho_Hant"
"dataset_name": "zsm_Latn" "fewshot_split": "zsm_Latn"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "belebele_zsm_Latn" "task": "belebele_zsm_Latn"
"test_split": "zsm_Latn"
"dataset_name": "zul_Latn" "fewshot_split": "zul_Latn"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "belebele_zul_Latn" "task": "belebele_zul_Latn"
"test_split": "zul_Latn"
...@@ -17,3 +17,5 @@ filter_list: ...@@ -17,3 +17,5 @@ filter_list:
- function: "regex" - function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first" - function: "take_first"
metadata:
version: 1.0
...@@ -9,3 +9,5 @@ generation_kwargs: ...@@ -9,3 +9,5 @@ generation_kwargs:
- "</s>" - "</s>"
do_sample: false do_sample: false
temperature: 0.0 temperature: 0.0
metadata:
version: 1.0
# MultiMedQA (multiple-choice subset)
### Paper
Title: Large Language Models Encode Clinical Knowledge
Abstract: https://arxiv.org/abs/2212.13138
A benchmark combining four existing multiple-choice question answering datasets spanning professional medical exams and research queries.
### Citation
```
@Article{Singhal2023,
author={Singhal, Karan and Azizi, Shekoofeh and Tu, Tao and Mahdavi, S. Sara and Wei, Jason and Chung, Hyung Won and Scales, Nathan and Tanwani, Ajay and Cole-Lewis, Heather and Pfohl, Stephen and Payne, Perry and Seneviratne, Martin and Gamble, Paul and Kelly, Chris and Babiker, Abubakr and Sch{\"a}rli, Nathanael and Chowdhery, Aakanksha and Mansfield, Philip and Demner-Fushman, Dina and Ag{\"u}era y Arcas, Blaise and Webster, Dale and Corrado, Greg S. and Matias, Yossi and Chou, Katherine and Gottweis, Juraj and Tomasev, Nenad and Liu, Yun and Rajkomar, Alvin and Barral, Joelle and Semturs, Christopher and Karthikesalingam, Alan and Natarajan, Vivek},
title={Large language models encode clinical knowledge},
journal={Nature},
year={2023},
month={Aug},
day={01},
volume={620},
number={7972},
pages={172-180},
issn={1476-4687},
doi={10.1038/s41586-023-06291-2},
url={https://doi.org/10.1038/s41586-023-06291-2}
}
```
### Tasks
* [PubMedQA](https://pubmedqa.github.io/) - 1,000 expert-labeled Q&A pairs where a question and a corresponding PubMed abstract are given as context and a yes/maybe/no answer must be produced. Unlike the rest of the tasks in this suite, PubMedQA is a closed-domain Q&A task.
* [MedQA](https://github.com/jind11/MedQA) - United States Medical Licensing Examination (USMLE) questions with 4 or 5 possible answers. Typically, only the 4-option questions are used.
* [MedMCQA](https://medmcqa.github.io/) - 4-option multiple choice questions from Indian medical entrance examinations, >191k total questions.
* [MMLU](https://arxiv.org/abs/2009.03300) - 4-option multiple choice exam questions from a variety of domains. The following 6 domains are utilized here:
    * Anatomy
    * Clinical Knowledge
    * College Medicine
    * Medical Genetics
    * Professional Medicine
    * College Biology
Note that MultiMedQA also includes some short-form and long-form Q&A tasks (LiveQA, MedicationQA, HealthSearchQA). These tasks are usually evaluated by experts rather than automatically, and are therefore not included here.
group: multimedqa
task:
- pubmedqa
- medmcqa
- medqa_4options
- task: mmlu_anatomy
task_alias: "anatomy (mmlu)"
group_alias: null
- task: mmlu_clinical_knowledge
task_alias: "clinical_knowledge (mmlu)"
group_alias: null
- task: mmlu_college_medicine
task_alias: "college_medicine (mmlu)"
group_alias: null
- task: mmlu_medical_genetics
task_alias: "medical_genetics (mmlu)"
group_alias: null
- task: mmlu_professional_medicine
task_alias: "professional_medicine (mmlu)"
group_alias: null
- task: mmlu_college_biology
task_alias: "college_biology (mmlu)"
group_alias: null
...@@ -173,7 +173,6 @@ all_subtasks = [ ...@@ -173,7 +173,6 @@ all_subtasks = [
def main() -> None: def main() -> None:
for path, task_type in zip( for path, task_type in zip(
["multiple_choice", "generate_until"], ["multiple_choice", "generate_until"],
["multiple_choice_template_yaml", "generate_until_template_yaml"], ["multiple_choice_template_yaml", "generate_until_template_yaml"],
......
...@@ -15,4 +15,4 @@ metric_list: ...@@ -15,4 +15,4 @@ metric_list:
higher_is_better: true higher_is_better: true
ignore_punctuation: true ignore_punctuation: true
metadata: metadata:
- version: 0.0 version: 1.0
# Generated by utils.py
dataset_name: causal_judgment_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_causal_judgement_multiple_choice
...@@ -12,4 +12,4 @@ metric_list: ...@@ -12,4 +12,4 @@ metric_list:
- metric: acc - metric: acc
# TODO: brier score and other metrics # TODO: brier score and other metrics
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -11,4 +11,4 @@ doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" ...@@ -11,4 +11,4 @@ doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -73,7 +73,6 @@ all_subtasks = [ ...@@ -73,7 +73,6 @@ all_subtasks = [
def main() -> None: def main() -> None:
for task in all_subtasks: for task in all_subtasks:
file_name = f"{task}.yaml" file_name = f"{task}.yaml"
try: try:
with open(f"{file_name}", "w") as f: with open(f"{file_name}", "w") as f:
......
...@@ -16,4 +16,4 @@ metric_list: ...@@ -16,4 +16,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -75,7 +75,6 @@ def parse_args(): ...@@ -75,7 +75,6 @@ def parse_args():
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs. # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
...@@ -93,7 +92,9 @@ if __name__ == "__main__": ...@@ -93,7 +92,9 @@ if __name__ == "__main__":
if args.cot_prompt_path is not None: if args.cot_prompt_path is not None:
description = cot_file[subject_eng] description = cot_file[subject_eng]
else: else:
description = f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n" description = (
f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n"
)
yaml_dict = { yaml_dict = {
"include": base_yaml_name, "include": base_yaml_name,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment