add `llama_gpqa`

83b1c564 · Baber · 4288b53e · 83b1c564 · 83b1c564
Commit 83b1c564 authored Jan 29, 2025 by Baber
Showing with 74 additions and 0 deletions

lm_eval/tasks/llama3/instruct/gpqa/gpqa.yaml +35 -0

lm_eval/tasks/llama3/instruct/gpqa/utils.py +39 -0

No files found.
--- a/lm_eval/tasks/llama3/instruct/gpqa/gpqa.yaml
+++ b/lm_eval/tasks/llama3/instruct/gpqa/gpqa.yaml
+# Zero-shot, generation-based GPQA task: the model is shown a four-option
+# prompt and the answer letter extracted from its output is scored by
+# exact match against the gold letter.
+task: llama_gpqa
+# `gpqa_main` configuration of the `Idavidrein/gpqa` dataset.
+dataset_path: Idavidrein/gpqa
+dataset_name: gpqa_main
+output_type: generate_until
+# The `train` split is the one used for evaluation here.
+test_split: train
+# `{{Question}}` is read from the document; `{{choice1}}`..`{{choice4}}` are
+# not raw dataset columns -- they are the shuffled options produced by
+# `utils.process_docs`.
+doc_to_text: "Given the following question and four candidate answers (A, B, C and D), choose the best answer.\nQuestion: {{Question}}\nA. {{choice1}}\nB. {{choice2}}\nC. {{choice3}}\nD. {{choice4}}\nYour response should end with \"The best answer is [the_answer_letter]\" where the [the_answer_letter] is one of A, B, C or D."
+process_docs: !function utils.process_docs
+# `answer` is the gold letter (A-D) computed by `utils.process_docs`.
+doc_to_target: answer
+# NOTE(review): presumably pre-fills the start of the model's response so
+# that generation continues right after this phrase -- confirm against the
+# harness documentation.
+gen_prefix: "The best answer is"
+generation_kwargs:
+  # Stop generating at the first newline.
+  until:
+    - "\n"
+  max_gen_toks: 96
+  # Sampling disabled (temperature 0).
+  do_sample: false
+  temperature: 0
+filter_list:
+  - name: exact_match
+    filter:
+      # Pulls a single capital letter out of the generated text.
+      # NOTE(review): `group_select: 0` presumably picks the first match, and
+      # the pattern accepts any letter A-Z, not only A-D -- confirm intended.
+      - function: multi_choice_regex
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: ([A-Z])
+      - function: remove_whitespace
+      - function: take_first
+metric_list:
+  - metric: exact_match
+    ignore_punctuation: true
+    aggregation: mean
+    higher_is_better: true
+num_fewshot: 0
+metadata:
+  version: 1.0
+# NOTE(review): presumably forwarded to the dataset loader; enabling
+# `trust_remote_code` allows dataset-supplied loading code to run -- confirm
+# this is still required.
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/llama3/instruct/gpqa/utils.py
+++ b/lm_eval/tasks/llama3/instruct/gpqa/utils.py
+import random
+import re
+import datasets
+def preprocess(text):
+    """Clean up an answer string before it is inserted into the prompt.
+
+    Steps, in order: strip surrounding whitespace, turn every " [title]"
+    marker into ". ", delete every bracketed span ("[...]", shortest match),
+    then replace double spaces with single spaces.
+
+    Args:
+        text: The raw answer text, or ``None``.
+
+    Returns:
+        The cleaned string. A single space (" ") is returned when ``text``
+        is ``None``.
+    """
+    if text is None:
+        return " "
+    text = text.strip()
+    text = text.replace(" [title]", ". ")
+    # Raw string literal: "\[" inside a normal string is an invalid escape
+    # sequence (a warning on current Python versions, an error in the
+    # future). The compiled pattern is unchanged.
+    text = re.sub(r"\[.*?\]", "", text)
+    # Single pass only: a run of three or more spaces is shortened, not
+    # fully collapsed to one space.
+    text = text.replace("  ", " ")
+    return text
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Attach shuffled answer options and the gold answer letter to each doc.
+
+    For every document the three incorrect answers and the correct answer are
+    cleaned with ``preprocess``, shuffled with the module-level ``random``
+    generator, and exposed as ``choice1``..``choice4`` plus a ``choices``
+    list. ``answer`` is the letter ("A".."D") at which the correct answer
+    ended up after shuffling.
+
+    Args:
+        dataset: Dataset whose rows provide the columns "Incorrect Answer 1",
+            "Incorrect Answer 2", "Incorrect Answer 3" and "Correct Answer".
+
+    Returns:
+        The result of ``dataset.map`` applied with the per-document
+        transformation.
+    """
+    # Order matters: the correct answer is listed last before shuffling.
+    answer_columns = (
+        "Incorrect Answer 1",
+        "Incorrect Answer 2",
+        "Incorrect Answer 3",
+        "Correct Answer",
+    )
+
+    def _process_doc(doc):
+        options = [preprocess(doc[column]) for column in answer_columns]
+        # Remember the cleaned correct answer before its position changes.
+        gold = options[-1]
+        random.shuffle(options)
+        # First position whose text equals the cleaned correct answer.
+        gold_position = options.index(gold)
+        first, second, third, fourth = options
+        return {
+            "choice1": first,
+            "choice2": second,
+            "choice3": third,
+            "choice4": fourth,
+            "choices": [first, second, third, fourth],
+            "answer": chr(ord("A") + gold_position),
+        }
+
+    return dataset.map(_process_doc)