Add a new task GPQA (the part CoT and generative) (#1482)

* Add new tasks of GPQA * Add README * Remove unused functions * Remove unused functions * Linters * Add flexible match * update * Remove duplicate function * Linter * update * Update lm_eval/filters/extraction.py Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * register multi_choice_regex * Update * run precommit --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>

Add a new task GPQA (the part CoT and generative) (#1482)
* Add new tasks of GPQA * Add README * Remove unused functions * Remove unused functions * Linters * Add flexible match * update * Remove duplicate function * Linter * update * Update lm_eval/filters/extraction.py Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * register multi_choice_regex * Update * run precommit --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
01108aca · Uanu · GitHub · 8a875e9a · 01108aca · 01108aca
Unverified Commit 01108aca authored Mar 06, 2024 by Uanu Committed by GitHub Mar 05, 2024
3 changed files
--- a/lm_eval/tasks/gpqa/generative/utils.py
+++ b/lm_eval/tasks/gpqa/generative/utils.py
+import random
+import re
+
+import datasets
+
+
+def preprocess(text):
+    """Normalize a raw answer string before it is used as a choice.
+
+    Returns a single space when ``text`` is ``None``. Otherwise strips
+    surrounding whitespace, replaces " [title]" with ". ", removes every
+    bracketed ``[...]`` span, and replaces double spaces with single ones.
+    """
+    if text is None:
+        return " "
+    text = text.strip()
+    text = text.replace(" [title]", ". ")
+    # Non-greedy match, so each "[...]" span is removed separately.
+    text = re.sub("\\[.*?\\]", "", text)
+    # Single pass only: a run of three or more spaces is shortened,
+    # not reduced all the way to one space.
+    text = text.replace("  ", " ")
+    return text
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Apply ``_process_doc`` to every document via ``dataset.map``.
+
+    For each doc, the mapped function reads "Incorrect Answer 1" through
+    "Incorrect Answer 3" and "Correct Answer", and returns "choice1" through
+    "choice4", "choices" (the same four strings as a list), and "answer"
+    (the correct choice's letter, formatted "(A)" through "(D)").
+    """
+    def _process_doc(doc):
+        # All four answers go through preprocess() before shuffling.
+        choices = [
+            preprocess(doc["Incorrect Answer 1"]),
+            preprocess(doc["Incorrect Answer 2"]),
+            preprocess(doc["Incorrect Answer 3"]),
+            preprocess(doc["Correct Answer"]),
+        ]
+
+        # In-place shuffle using the module-level `random` state; no seed
+        # is set in this function.
+        # NOTE(review): confirm whether callers seed `random` for
+        # reproducible choice order.
+        random.shuffle(choices)
+        # list.index returns the first match: if an incorrect answer
+        # preprocesses to the same text as the correct one, the earlier
+        # position is reported.
+        correct_answer_index = choices.index(preprocess(doc["Correct Answer"]))
+
+        out_doc = {
+            "choice1": choices[0],
+            "choice2": choices[1],
+            "choice3": choices[2],
+            "choice4": choices[3],
+            "choices": [choices[0], choices[1], choices[2], choices[3]],
+            # chr(65) is "A", so indices 0..3 become "(A)".."(D)".
+            "answer": f"({chr(65 + correct_answer_index)})",
+        }
+        return out_doc
+
+    return dataset.map(_process_doc)
--- a/lm_eval/tasks/gpqa/n_shot/_generate_configs.py
+++ b/lm_eval/tasks/gpqa/n_shot/_generate_configs.py
@@ -3,7 +3,7 @@ from tqdm import tqdm


 def main() -> None:
-    subset = ["extended", "diamond", "experts", "main"]
+    subset = ["extended", "diamond", "main"]

    for task in tqdm(subset):
        file_name = f"gpqa_{task}_n_shot.yaml"

--- a/lm_eval/tasks/gpqa/zeroshot/_generate_configs.py
+++ b/lm_eval/tasks/gpqa/zeroshot/_generate_configs.py
@@ -3,7 +3,7 @@ from tqdm import tqdm


 def main() -> None:
-    subset = ["extended", "diamond", "experts", "main"]
+    subset = ["extended", "diamond", "main"]
    setting = "zeroshot"
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_{setting}.yaml"