Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

02e841ce · lintangsutawika · 90ad5db7 · e74ec966 · 02e841ce · 02e841ce
Commit 02e841ce authored Mar 14, 2024 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/gpqa/n_shot/_generate_configs.py
+++ b/lm_eval/tasks/gpqa/n_shot/_generate_configs.py
@@ -3,7 +3,7 @@ from tqdm import tqdm
 def main() -> None:
-    subset = ["extended", "diamond", "experts", "main"]
+    subset = ["extended", "diamond", "main"]
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_n_shot.yaml"

--- a/lm_eval/tasks/gpqa/zeroshot/_generate_configs.py
+++ b/lm_eval/tasks/gpqa/zeroshot/_generate_configs.py
@@ -3,7 +3,7 @@ from tqdm import tqdm
 def main() -> None:
-    subset = ["extended", "diamond", "experts", "main"]
+    subset = ["extended", "diamond", "main"]
    setting = "zeroshot"
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_{setting}.yaml"

--- a/lm_eval/tasks/ifeval/ifeval.yaml
+++ b/lm_eval/tasks/ifeval/ifeval.yaml
@@ -12,6 +12,7 @@ generation_kwargs:
  temperature: 0.0
  max_gen_toks: 1280
 process_results: !function utils.process_results
+num_fewshot: 0
 metric_list:
  - metric: prompt_level_strict_acc
    aggregation: mean

--- a/lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml
+++ b/lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml
@@ -3,6 +3,7 @@ group:
    - kmmlu_hard_cot
 dataset_path: HAERAE-HUB/KMMLU-HARD
 output_type: generate_until
+validation_split: dev # not meant to be used, only here to silence warnings
 test_split: test
 doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}"
 metric_list:
@@ -17,6 +18,7 @@ generation_kwargs:
  do_sample: false
  max_gen_toks: 2048
  temperature: 0.0
+num_fewshot: 0
 filter_list:
  - name: "get-answer"
    filter:
@@ -26,3 +28,4 @@ filter_list:
      - function: "take_first"
 metadata:
  version: 2.0
+  num_fewshot: 5
--- a/lm_eval/tasks/kobest/kobest_sentineg.yaml
+++ b/lm_eval/tasks/kobest/kobest_sentineg.yaml
@@ -21,3 +21,5 @@ metric_list:
    higher_is_better: True
 metadata:
  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/kobest/kobest_wic.yaml
+++ b/lm_eval/tasks/kobest/kobest_wic.yaml
@@ -21,3 +21,5 @@ metric_list:
    higher_is_better: True
 metadata:
  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/kormedmcqa/README.md
+++ b/lm_eval/tasks/kormedmcqa/README.md
+# KorMedMCQA
+### Paper
+Title: `KorMedMCQA: Multi-Choice Question Answering Benchmark for Korean Healthcare Professional Licensing Examinations`
+Abstract: `We introduce KorMedMCQA, the first Korean multiple-choice question answering (MCQA) benchmark derived from Korean healthcare professional licensing examinations, covering from the year 2012 to year 2023. This dataset consists of a selection of questions from the license examinations for doctors, nurses, and pharmacists, featuring a diverse array of subjects. We conduct baseline experiments on various large language models, including proprietary/open-source, multilingual/Korean-additional pretrained, and clinical context pretrained models, highlighting the potential for further enhancements. We make our data publicly available on HuggingFace and provide an evaluation script via LM-Harness, inviting further exploration and advancement in Korean healthcare environments.`
+Paper: https://arxiv.org/abs/2403.01469
+Homepage: https://huggingface.co/datasets/sean0042/KorMedMCQA
+### Citation
+```
+@article{kweon2024kormedmcqa,
+      title={KorMedMCQA: Multi-Choice Question Answering Benchmark for Korean Healthcare Professional Licensing Examinations},
+      author={Sunjun Kweon and Byungjin Choi and Minkyu Kim and Rae Woong Park and Edward Choi},
+      journal={arXiv preprint arXiv:2403.01469},
+      year={2024}
+}
+```
+### Groups and Tasks
+* `kormedmcqa`: Runs `kormedmcqa_doctor`, `kormedmcqa_nurse`, and `kormedmcqa_pharm`.
+#### Tasks
+* `kormedmcqa_doctor`: `Official Korean Doctor Examination`
+* `kormedmcqa_nurse`: `Official Korean Nurse Examination`
+* `kormedmcqa_pharm`: `Official Korean Pharmacist Examination`
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml
+++ b/lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml
+group: kormedmcqa
+task : kormedmcqa_doctor
+dataset_path : sean0042/KorMedMCQA
+dataset_name : doctor
+test_split : test
+fewshot_split : dev
+fewshot_config:
+  sampler: first_n
+output_type: generate_until
+doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답："
+doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+          - " "
+generation_kwargs:
+  until:
+    - "Q:"
+    - "\n\n"
+    - "</s>"
+    - "."
+  do_sample: false
+  temperature: 0.0
--- a/lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml
+++ b/lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml
+group: kormedmcqa
+task : kormedmcqa_nurse
+dataset_path : sean0042/KorMedMCQA
+dataset_name : nurse
+test_split : test
+fewshot_split : dev
+fewshot_config:
+  sampler: first_n
+output_type: generate_until
+doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답："
+doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+          - " "
+generation_kwargs:
+  until:
+    - "Q:"
+    - "\n\n"
+    - "</s>"
+    - "."
+  do_sample: false
+  temperature: 0.0
--- a/lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml
+++ b/lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml
+group: kormedmcqa
+task : kormedmcqa_pharm
+dataset_path : sean0042/KorMedMCQA
+dataset_name : pharm
+test_split : test
+fewshot_split : dev
+fewshot_config:
+  sampler: first_n
+output_type: generate_until
+doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답："
+doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+          - " "
+generation_kwargs:
+  until:
+    - "Q:"
+    - "\n\n"
+    - "</s>"
+    - "."
+  do_sample: false
+  temperature: 0.0
--- a/lm_eval/tasks/lambada/lambada_openai.yaml
+++ b/lm_eval/tasks/lambada/lambada_openai.yaml
@@ -18,3 +18,5 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/logiqa/logiqa.yaml
+++ b/lm_eval/tasks/logiqa/logiqa.yaml
@@ -19,3 +19,5 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/logiqa2/logieval.yaml
+++ b/lm_eval/tasks/logiqa2/logieval.yaml
@@ -25,3 +25,5 @@ filter_list:
      - function: "take_first"
 metadata:
  version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
+++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
@@ -23,3 +23,5 @@ num_fewshot: 0
 metadata:
  version: 1.0
  num_fewshot: 4
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
@@ -12,3 +12,5 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
@@ -12,3 +12,5 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
@@ -12,3 +12,5 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/mutual/mutual.yaml
+++ b/lm_eval/tasks/mutual/mutual.yaml
@@ -23,3 +23,5 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py
@@ -37,6 +37,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
            "query": QA_PROMPT + "\n\nQ: " + preprocess(doc["question"]) + "\nA:",
            "mc1_choices": doc["mc1_targets_choices"],
            "mc2_choices": doc["mc2_targets_choices"],
+            "mc2_targets": {"labels": doc["mc2_targets_labels"]},
            "gold": " ",
        }
        return out_doc

--- a/lm_eval/tasks/race/race.yaml
+++ b/lm_eval/tasks/race/race.yaml
@@ -12,3 +12,5 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
+dataset_kwargs:
+  trust_remote_code: true