Commit 90ad5db7 authored by lintangsutawika's avatar lintangsutawika
Browse files

merged main

parents f692caa9 b177c82c
"dataset_name": "security_studies"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_security_studies"
"dataset_name": "sociology"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_sociology"
"dataset_name": "us_foreign_policy"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_us_foreign_policy"
"dataset_name": "virology"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_virology"
"dataset_name": "world_religions"
"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_world_religions"
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import re
import yaml
import requests
import argparse
import datasets
import requests
import yaml
from tqdm import tqdm
from lm_eval import utils
......
import collections
import re
import sys
import unicodedata
from lm_eval.filters.extraction import RegexFilter, Filter
from lm_eval.filters.extraction import Filter, RegexFilter
class ExtendedRegexFilter(RegexFilter):
punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith('P'))
punct_tbl = dict.fromkeys(
i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
)
def __init__(
self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
super().__init__(regex_pattern, group_select, fallback)
self.ignore_case = ignore_case
......@@ -47,8 +52,13 @@ class ExtendedRegexFilter(RegexFilter):
class MapRegexFilter(ExtendedRegexFilter):
def __init__(
self, regex_pattern_to_value: dict = {}, group_select=0, fallback: str = "[invalid]",
ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
self,
regex_pattern_to_value: dict = {},
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
"""
regex_pattern_to_value: Match the regex pattern and change the result into the value
......@@ -57,8 +67,17 @@ class MapRegexFilter(ExtendedRegexFilter):
ignore_punctuation: Remove the punctuation before matching with the given regex
regexes_to_ignore: Remove these regexes before matching with the given regex
"""
super().__init__('|'.join(list(regex_pattern_to_value.keys())), group_select, fallback, ignore_case, ignore_punctuation, regexes_to_ignore)
self.regex_to_value = {re.compile(r): v for r, v in regex_pattern_to_value.items()}
super().__init__(
"|".join(list(regex_pattern_to_value.keys())),
group_select,
fallback,
ignore_case,
ignore_punctuation,
regexes_to_ignore,
)
self.regex_to_value = {
re.compile(r): v for r, v in regex_pattern_to_value.items()
}
def apply(self, resps, docs):
filtered_resps = []
......@@ -66,10 +85,15 @@ class MapRegexFilter(ExtendedRegexFilter):
for r in resps:
filtered = []
for resp in r:
whole_match_considering_group_select = self.find_match(self.regex, self.filter_ignores(resp))
whole_match_considering_group_select = self.find_match(
self.regex, self.filter_ignores(resp)
)
if whole_match_considering_group_select:
for regex, mapped_value in self.regex_to_value.items():
match = self.find_match(regex, self.filter_ignores(whole_match_considering_group_select))
match = self.find_match(
regex,
self.filter_ignores(whole_match_considering_group_select),
)
if match:
match = mapped_value
break
......@@ -91,9 +115,11 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
filtered_resps = []
import regex
from word2number import w2n
# https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
english_number_regex = regex.compile(
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))")
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
)
for r in resps:
filtered = []
......@@ -118,21 +144,22 @@ class WordSortFilter(Filter):
filtered_resps = []
for r, doc in zip(resps, docs):
words = doc['input'].split("List:")[1].strip().split()
regex = re.compile('|'.join([f"\\b{w}\\b" for w in words]))
words = doc["input"].split("List:")[1].strip().split()
regex = re.compile("|".join([f"\\b{w}\\b" for w in words]))
filtered = []
for resp in r:
match = regex.findall(resp)
match.reverse()
ordered_words = reversed(collections.OrderedDict(zip(match, [None] * len(match))))
filtered.append(' '.join(ordered_words))
ordered_words = reversed(
collections.OrderedDict(zip(match, [None] * len(match)))
)
filtered.append(" ".join(ordered_words))
filtered_resps.append(filtered)
return filtered_resps
class MultiChoiceRegexFilter(ExtendedRegexFilter):
def __init__(self, *args, **kwargs):
"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
......@@ -156,13 +183,13 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
for r, doc in zip(resps, docs):
fallback_regexes = []
choice_to_alpha = {}
next_alpha = 'A'
next_alpha = "A"
without_paren_fallback_regexes = []
without_paren_to_target = {}
multiple_choices_regex = re.compile(r"\([A-Z]\)([^\n^(]*)")
match = multiple_choices_regex.findall(doc['input'])
match = multiple_choices_regex.findall(doc["input"])
for m in match:
m = self.filter_ignores(m.strip())
fallback_regexes.append(f"{re.escape(m)}")
......@@ -172,17 +199,23 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
without_paren_to_target[next_alpha] = f"({next_alpha})"
next_alpha = chr(ord(next_alpha) + 1)
fallback_regex = re.compile('|'.join(fallback_regexes))
without_paren_fallback_regex = '|'.join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})")
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
)
filtered = []
for resp in r:
match = self.find_match(self.regex, resp)
if not match:
match = self.find_match(fallback_regex, self.filter_ignores(resp), choice_to_alpha)
match = self.find_match(
fallback_regex, self.filter_ignores(resp), choice_to_alpha
)
if not match:
match = self.find_match(without_paren_fallback_regex, resp, without_paren_to_target)
match = self.find_match(
without_paren_fallback_regex, resp, without_paren_to_target
)
if not match:
match = self.fallback
filtered.append(match)
......
import collections
import re
import sys
import unicodedata
from lm_eval.filters.extraction import RegexFilter, Filter
from lm_eval.filters.extraction import Filter, RegexFilter
class ExtendedRegexFilter(RegexFilter):
punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith('P'))
punct_tbl = dict.fromkeys(
i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
)
def __init__(
self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
super().__init__(regex_pattern, group_select, fallback)
self.ignore_case = ignore_case
......@@ -47,8 +52,13 @@ class ExtendedRegexFilter(RegexFilter):
class MapRegexFilter(ExtendedRegexFilter):
def __init__(
self, regex_pattern_to_value: dict = {}, group_select=0, fallback: str = "[invalid]",
ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
self,
regex_pattern_to_value: dict = {},
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
"""
regex_pattern_to_value: Match the regex pattern and change the result into the value
......@@ -57,8 +67,17 @@ class MapRegexFilter(ExtendedRegexFilter):
ignore_punctuation: Remove the punctuation before matching with the given regex
regexes_to_ignore: Remove these regexes before matching with the given regex
"""
super().__init__('|'.join(list(regex_pattern_to_value.keys())), group_select, fallback, ignore_case, ignore_punctuation, regexes_to_ignore)
self.regex_to_value = {re.compile(r): v for r, v in regex_pattern_to_value.items()}
super().__init__(
"|".join(list(regex_pattern_to_value.keys())),
group_select,
fallback,
ignore_case,
ignore_punctuation,
regexes_to_ignore,
)
self.regex_to_value = {
re.compile(r): v for r, v in regex_pattern_to_value.items()
}
def apply(self, resps, docs):
filtered_resps = []
......@@ -66,10 +85,15 @@ class MapRegexFilter(ExtendedRegexFilter):
for r in resps:
filtered = []
for resp in r:
whole_match_considering_group_select = self.find_match(self.regex, self.filter_ignores(resp))
whole_match_considering_group_select = self.find_match(
self.regex, self.filter_ignores(resp)
)
if whole_match_considering_group_select:
for regex, mapped_value in self.regex_to_value.items():
match = self.find_match(regex, self.filter_ignores(whole_match_considering_group_select))
match = self.find_match(
regex,
self.filter_ignores(whole_match_considering_group_select),
)
if match:
match = mapped_value
break
......@@ -91,9 +115,11 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
filtered_resps = []
import regex
from word2number import w2n
# https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
english_number_regex = regex.compile(
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))")
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
)
for r in resps:
filtered = []
......@@ -118,21 +144,22 @@ class WordSortFilter(Filter):
filtered_resps = []
for r, doc in zip(resps, docs):
words = doc['input'].split("List:")[1].strip().split()
regex = re.compile('|'.join([f"\\b{w}\\b" for w in words]))
words = doc["input"].split("List:")[1].strip().split()
regex = re.compile("|".join([f"\\b{w}\\b" for w in words]))
filtered = []
for resp in r:
match = regex.findall(resp)
match.reverse()
ordered_words = reversed(collections.OrderedDict(zip(match, [None] * len(match))))
filtered.append(' '.join(ordered_words))
ordered_words = reversed(
collections.OrderedDict(zip(match, [None] * len(match)))
)
filtered.append(" ".join(ordered_words))
filtered_resps.append(filtered)
return filtered_resps
class MultiChoiceRegexFilter(ExtendedRegexFilter):
def __init__(self, *args, **kwargs):
"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
......@@ -156,13 +183,13 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
for r, doc in zip(resps, docs):
fallback_regexes = []
choice_to_alpha = {}
next_alpha = 'A'
next_alpha = "A"
without_paren_fallback_regexes = []
without_paren_to_target = {}
multiple_choices_regex = re.compile(r"\([A-Z]\)([^\n^(]*)")
match = multiple_choices_regex.findall(doc['input'])
match = multiple_choices_regex.findall(doc["input"])
for m in match:
m = self.filter_ignores(m.strip())
fallback_regexes.append(f"{re.escape(m)}")
......@@ -172,17 +199,23 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
without_paren_to_target[next_alpha] = f"({next_alpha})"
next_alpha = chr(ord(next_alpha) + 1)
fallback_regex = re.compile('|'.join(fallback_regexes))
without_paren_fallback_regex = '|'.join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})")
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
)
filtered = []
for resp in r:
match = self.find_match(self.regex, resp)
if not match:
match = self.find_match(fallback_regex, self.filter_ignores(resp), choice_to_alpha)
match = self.find_match(
fallback_regex, self.filter_ignores(resp), choice_to_alpha
)
if not match:
match = self.find_match(without_paren_fallback_regex, resp, without_paren_to_target)
match = self.find_match(
without_paren_fallback_regex, resp, without_paren_to_target
)
if not match:
match = self.fallback
filtered.append(match)
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import requests
import os
import requests
import yaml
from tqdm import tqdm
from lm_eval.utils import logging
API_URL = "https://datasets-server.huggingface.co/splits?dataset=facebook/belebele"
......@@ -39,6 +40,7 @@ if __name__ == "__main__":
def query():
    """Return the split list for the belebele dataset from the HF datasets-server."""
    # Single round-trip: the JSON payload's "splits" key holds one entry
    # per language split of facebook/belebele.
    return requests.get(API_URL).json()["splits"]
print(query())
languages = [split["split"] for split in query()]
......@@ -49,7 +51,7 @@ if __name__ == "__main__":
if args.task_prefix != ""
else f"belebele_{lang}",
"test_split": lang,
"fewshot_split":lang,
"fewshot_split": lang,
}
file_save_path = args.save_prefix_path + f"_{lang}.yaml"
......
import os
import yaml
all_subtasks = [
"abstract_narrative_understanding",
"anachronisms",
......
......@@ -8,10 +8,9 @@ Requires the installation of
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
and is included so that the bigbench dependency can be avoided.
"""
from tqdm import tqdm
import datasets
import bigbench.api.util as bb_utils
import datasets
from tqdm import tqdm
all_task_names = bb_utils.get_all_json_task_names()
......
import yaml
all_subtasks = [
"adjunct_island",
"anaphor_gender_agreement",
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBJECTS = {
"computer_network": "计算机网络",
"operating_system": "操作系统",
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBJECTS = {
"agronomy": "农学",
"anatomy": "解剖学",
......
#!/usr/bin/python
import math
import re
import sys
import math
import xml.sax.saxutils
from typing import Any, Dict, List, Optional, Pattern, Tuple, Union
from typing import List, Pattern, Tuple, Union, Dict, Any, Optional
"""
This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
......@@ -60,7 +60,7 @@ def normalize(s):
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
if nonorm:
return s.split()
if type(s) is not str:
if not isinstance(s, str):
s = " ".join(s)
# language-independent part:
for pattern, replace in normalize1:
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBSETS = ["WR", "GR", "RCS", "RCSS", "RCH", "LI"]
......
......@@ -4,6 +4,7 @@ import string
import numpy as np
from scipy.optimize import linear_sum_assignment
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
......
# GPQA
### Paper
Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark
Abstract: https://arxiv.org/abs/2311.12022
We present GPQA, a challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. We ensure that the questions are high-quality and extremely difficult: experts who have or are pursuing PhDs in the corresponding domains reach 65% accuracy (74% when discounting clear mistakes the experts identified in retrospect), while highly skilled non-expert validators only reach 34% accuracy, despite spending on average over 30 minutes with unrestricted access to the web (i.e., the questions are “Google-proof”). The questions are also difficult for state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving 39% accuracy. If we are to use future AI systems to help us answer very hard questions—for example, when developing new scientific knowledge—we need to develop *scalable oversight* methods that enable humans to supervise their outputs, which may be difficult even if the supervisors are themselves skilled and knowledgeable. The difficulty of GPQA both for skilled non-experts and frontier AI systems should enable realistic scalable oversight experiments, which we hope can help devise ways for human experts to reliably get truthful information from AI systems that surpass human capabilities.
Homepage: `https://github.com/idavidrein/gpqa/tree/main`
### Citation
```
@misc{rein2023gpqa,
title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
year={2023},
eprint={2311.12022},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
```
This dataset is gated, so you will have to accept the terms of use at https://huggingface.co/datasets/Idavidrein/gpqa and login via `huggingface-cli login` using your HF Hub token before running this task.
### Groups and Tasks
#### Groups
* `gpqa`
#### Tasks
* `gpqa_{main, diamond, extended}_zeroshot`
* `gpqa_{main, diamond, extended}_n_shot`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import yaml
from tqdm import tqdm
def main() -> None:
    """Generate one n-shot task YAML per GPQA subset.

    Writes ``gpqa_{subset}_n_shot.yaml`` into the current directory for each
    subset, each file including the shared ``_gpqa_n_shot_yaml`` template.
    """
    subset = ["extended", "diamond", "experts", "main"]
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_n_shot.yaml"
        # NOTE(review): the original wrapped this in `try/except FileExistsError`,
        # but mode "w" creates or truncates the file and can never raise
        # FileExistsError, so that handler was dead code. Using `with` also
        # guarantees the file is closed even if yaml.dump fails.
        with open(file_name, "w") as f:
            f.write("# Generated by _generate_configs.py\n")
            yaml.dump(
                {
                    "include": "_gpqa_n_shot_yaml",
                    "task": f"gpqa_{task}_n_shot",
                    "dataset_name": f"gpqa_{task}",
                },
                f,
            )
if __name__ == "__main__":
main()
dataset_path: Idavidrein/gpqa
group: gpqa
output_type: multiple_choice
process_docs: !function utils.process_docs
training_split: train
# Because huggingface dataset only has train split
validation_split: train
test_split: null
description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:"
doc_to_target: answer
doc_to_choice: ["(A)", "(B)", "(C)", "(D)"]
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment