Commit 90ad5db7 authored by lintangsutawika's avatar lintangsutawika
Browse files

merged main

parents f692caa9 b177c82c
"dataset_name": "security_studies"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_security_studies"
"dataset_name": "sociology"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_sociology"
"dataset_name": "us_foreign_policy"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_us_foreign_policy"
"dataset_name": "virology"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_virology"
"dataset_name": "world_religions"
"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_world_religions"
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import re
import yaml
import requests
import argparse
import datasets
import requests
import yaml
from tqdm import tqdm
from lm_eval import utils
......
import collections
import re
import sys
import unicodedata
from lm_eval.filters.extraction import RegexFilter, Filter
from lm_eval.filters.extraction import Filter, RegexFilter
class ExtendedRegexFilter(RegexFilter):
punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith('P'))
punct_tbl = dict.fromkeys(
i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
)
def __init__(
self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
super().__init__(regex_pattern, group_select, fallback)
self.ignore_case = ignore_case
......@@ -47,8 +52,13 @@ class ExtendedRegexFilter(RegexFilter):
class MapRegexFilter(ExtendedRegexFilter):
def __init__(
self, regex_pattern_to_value: dict = {}, group_select=0, fallback: str = "[invalid]",
ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
self,
regex_pattern_to_value: dict = {},
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
"""
regex_pattern_to_value: Match the regex pattern and change the result into the value
......@@ -57,8 +67,17 @@ class MapRegexFilter(ExtendedRegexFilter):
ignore_punctuation: Remove the punctuation before matching with the given regex
regexes_to_ignore: Remove these regexes before matching with the given regex
"""
super().__init__('|'.join(list(regex_pattern_to_value.keys())), group_select, fallback, ignore_case, ignore_punctuation, regexes_to_ignore)
self.regex_to_value = {re.compile(r): v for r, v in regex_pattern_to_value.items()}
super().__init__(
"|".join(list(regex_pattern_to_value.keys())),
group_select,
fallback,
ignore_case,
ignore_punctuation,
regexes_to_ignore,
)
self.regex_to_value = {
re.compile(r): v for r, v in regex_pattern_to_value.items()
}
def apply(self, resps, docs):
filtered_resps = []
......@@ -66,10 +85,15 @@ class MapRegexFilter(ExtendedRegexFilter):
for r in resps:
filtered = []
for resp in r:
whole_match_considering_group_select = self.find_match(self.regex, self.filter_ignores(resp))
whole_match_considering_group_select = self.find_match(
self.regex, self.filter_ignores(resp)
)
if whole_match_considering_group_select:
for regex, mapped_value in self.regex_to_value.items():
match = self.find_match(regex, self.filter_ignores(whole_match_considering_group_select))
match = self.find_match(
regex,
self.filter_ignores(whole_match_considering_group_select),
)
if match:
match = mapped_value
break
......@@ -91,9 +115,11 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
filtered_resps = []
import regex
from word2number import w2n
# https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
english_number_regex = regex.compile(
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))")
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
)
for r in resps:
filtered = []
......@@ -118,21 +144,22 @@ class WordSortFilter(Filter):
filtered_resps = []
for r, doc in zip(resps, docs):
words = doc['input'].split("List:")[1].strip().split()
regex = re.compile('|'.join([f"\\b{w}\\b" for w in words]))
words = doc["input"].split("List:")[1].strip().split()
regex = re.compile("|".join([f"\\b{w}\\b" for w in words]))
filtered = []
for resp in r:
match = regex.findall(resp)
match.reverse()
ordered_words = reversed(collections.OrderedDict(zip(match, [None] * len(match))))
filtered.append(' '.join(ordered_words))
ordered_words = reversed(
collections.OrderedDict(zip(match, [None] * len(match)))
)
filtered.append(" ".join(ordered_words))
filtered_resps.append(filtered)
return filtered_resps
class MultiChoiceRegexFilter(ExtendedRegexFilter):
def __init__(self, *args, **kwargs):
"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
......@@ -156,13 +183,13 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
for r, doc in zip(resps, docs):
fallback_regexes = []
choice_to_alpha = {}
next_alpha = 'A'
next_alpha = "A"
without_paren_fallback_regexes = []
without_paren_to_target = {}
multiple_choices_regex = re.compile(r"\([A-Z]\)([^\n^(]*)")
match = multiple_choices_regex.findall(doc['input'])
match = multiple_choices_regex.findall(doc["input"])
for m in match:
m = self.filter_ignores(m.strip())
fallback_regexes.append(f"{re.escape(m)}")
......@@ -172,17 +199,23 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
without_paren_to_target[next_alpha] = f"({next_alpha})"
next_alpha = chr(ord(next_alpha) + 1)
fallback_regex = re.compile('|'.join(fallback_regexes))
without_paren_fallback_regex = '|'.join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})")
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
)
filtered = []
for resp in r:
match = self.find_match(self.regex, resp)
if not match:
match = self.find_match(fallback_regex, self.filter_ignores(resp), choice_to_alpha)
match = self.find_match(
fallback_regex, self.filter_ignores(resp), choice_to_alpha
)
if not match:
match = self.find_match(without_paren_fallback_regex, resp, without_paren_to_target)
match = self.find_match(
without_paren_fallback_regex, resp, without_paren_to_target
)
if not match:
match = self.fallback
filtered.append(match)
......
import collections
import re
import sys
import unicodedata
from lm_eval.filters.extraction import RegexFilter, Filter
from lm_eval.filters.extraction import Filter, RegexFilter
class ExtendedRegexFilter(RegexFilter):
punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith('P'))
punct_tbl = dict.fromkeys(
i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
)
def __init__(
self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
super().__init__(regex_pattern, group_select, fallback)
self.ignore_case = ignore_case
......@@ -47,8 +52,13 @@ class ExtendedRegexFilter(RegexFilter):
class MapRegexFilter(ExtendedRegexFilter):
def __init__(
self, regex_pattern_to_value: dict = {}, group_select=0, fallback: str = "[invalid]",
ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
self,
regex_pattern_to_value: dict = {},
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
"""
regex_pattern_to_value: Match the regex pattern and change the result into the value
......@@ -57,8 +67,17 @@ class MapRegexFilter(ExtendedRegexFilter):
ignore_punctuation: Remove the punctuation before matching with the given regex
regexes_to_ignore: Remove these regexes before matching with the given regex
"""
super().__init__('|'.join(list(regex_pattern_to_value.keys())), group_select, fallback, ignore_case, ignore_punctuation, regexes_to_ignore)
self.regex_to_value = {re.compile(r): v for r, v in regex_pattern_to_value.items()}
super().__init__(
"|".join(list(regex_pattern_to_value.keys())),
group_select,
fallback,
ignore_case,
ignore_punctuation,
regexes_to_ignore,
)
self.regex_to_value = {
re.compile(r): v for r, v in regex_pattern_to_value.items()
}
def apply(self, resps, docs):
filtered_resps = []
......@@ -66,10 +85,15 @@ class MapRegexFilter(ExtendedRegexFilter):
for r in resps:
filtered = []
for resp in r:
whole_match_considering_group_select = self.find_match(self.regex, self.filter_ignores(resp))
whole_match_considering_group_select = self.find_match(
self.regex, self.filter_ignores(resp)
)
if whole_match_considering_group_select:
for regex, mapped_value in self.regex_to_value.items():
match = self.find_match(regex, self.filter_ignores(whole_match_considering_group_select))
match = self.find_match(
regex,
self.filter_ignores(whole_match_considering_group_select),
)
if match:
match = mapped_value
break
......@@ -91,9 +115,11 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
filtered_resps = []
import regex
from word2number import w2n
# https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
english_number_regex = regex.compile(
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))")
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
)
for r in resps:
filtered = []
......@@ -118,21 +144,22 @@ class WordSortFilter(Filter):
filtered_resps = []
for r, doc in zip(resps, docs):
words = doc['input'].split("List:")[1].strip().split()
regex = re.compile('|'.join([f"\\b{w}\\b" for w in words]))
words = doc["input"].split("List:")[1].strip().split()
regex = re.compile("|".join([f"\\b{w}\\b" for w in words]))
filtered = []
for resp in r:
match = regex.findall(resp)
match.reverse()
ordered_words = reversed(collections.OrderedDict(zip(match, [None] * len(match))))
filtered.append(' '.join(ordered_words))
ordered_words = reversed(
collections.OrderedDict(zip(match, [None] * len(match)))
)
filtered.append(" ".join(ordered_words))
filtered_resps.append(filtered)
return filtered_resps
class MultiChoiceRegexFilter(ExtendedRegexFilter):
def __init__(self, *args, **kwargs):
"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
......@@ -156,13 +183,13 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
for r, doc in zip(resps, docs):
fallback_regexes = []
choice_to_alpha = {}
next_alpha = 'A'
next_alpha = "A"
without_paren_fallback_regexes = []
without_paren_to_target = {}
multiple_choices_regex = re.compile(r"\([A-Z]\)([^\n^(]*)")
match = multiple_choices_regex.findall(doc['input'])
match = multiple_choices_regex.findall(doc["input"])
for m in match:
m = self.filter_ignores(m.strip())
fallback_regexes.append(f"{re.escape(m)}")
......@@ -172,17 +199,23 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
without_paren_to_target[next_alpha] = f"({next_alpha})"
next_alpha = chr(ord(next_alpha) + 1)
fallback_regex = re.compile('|'.join(fallback_regexes))
without_paren_fallback_regex = '|'.join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})")
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
)
filtered = []
for resp in r:
match = self.find_match(self.regex, resp)
if not match:
match = self.find_match(fallback_regex, self.filter_ignores(resp), choice_to_alpha)
match = self.find_match(
fallback_regex, self.filter_ignores(resp), choice_to_alpha
)
if not match:
match = self.find_match(without_paren_fallback_regex, resp, without_paren_to_target)
match = self.find_match(
without_paren_fallback_regex, resp, without_paren_to_target
)
if not match:
match = self.fallback
filtered.append(match)
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import requests
import os
import requests
import yaml
from tqdm import tqdm
from lm_eval.utils import logging
API_URL = "https://datasets-server.huggingface.co/splits?dataset=facebook/belebele"
......@@ -39,6 +40,7 @@ if __name__ == "__main__":
def query():
    """Return the split list for the belebele dataset from the HF datasets-server."""
    # Single round-trip: the JSON payload's "splits" key holds one entry
    # per language split of facebook/belebele.
    return requests.get(API_URL).json()["splits"]
print(query())
languages = [split["split"] for split in query()]
......@@ -49,7 +51,7 @@ if __name__ == "__main__":
if args.task_prefix != ""
else f"belebele_{lang}",
"test_split": lang,
"fewshot_split":lang,
"fewshot_split": lang,
}
file_save_path = args.save_prefix_path + f"_{lang}.yaml"
......
import os
import yaml
all_subtasks = [
"abstract_narrative_understanding",
"anachronisms",
......
......@@ -8,10 +8,9 @@ Requires the installation of
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
and is included so that the bigbench dependency can be avoided.
"""
from tqdm import tqdm
import datasets
import bigbench.api.util as bb_utils
import datasets
from tqdm import tqdm
all_task_names = bb_utils.get_all_json_task_names()
......
import yaml
all_subtasks = [
"adjunct_island",
"anaphor_gender_agreement",
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBJECTS = {
"computer_network": "计算机网络",
"operating_system": "操作系统",
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBJECTS = {
"agronomy": "农学",
"anatomy": "解剖学",
......
#!/usr/bin/python
import math
import re
import sys
import math
import xml.sax.saxutils
from typing import Any, Dict, List, Optional, Pattern, Tuple, Union
from typing import List, Pattern, Tuple, Union, Dict, Any, Optional
"""
This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
......@@ -60,7 +60,7 @@ def normalize(s):
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
if nonorm:
return s.split()
if type(s) is not str:
if not isinstance(s, str):
s = " ".join(s)
# language-independent part:
for pattern, replace in normalize1:
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBSETS = ["WR", "GR", "RCS", "RCSS", "RCH", "LI"]
......
......@@ -4,6 +4,7 @@ import string
import numpy as np
from scipy.optimize import linear_sum_assignment
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
......
# GPQA
### Paper
Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark
Abstract: https://arxiv.org/abs/2311.12022
We present GPQA, a challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. We ensure that the questions are high-quality and extremely difficult: experts who have or are pursuing PhDs in the corresponding domains reach 65% accuracy (74% when discounting clear mistakes the experts identified in retrospect), while highly skilled non-expert validators only reach 34% accuracy, despite spending on average over 30 minutes with unrestricted access to the web (i.e., the questions are “Google-proof”). The questions are also difficult for state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving 39% accuracy. If we are to use future AI systems to help us answer very hard questions—for example, when developing new scientific knowledge—we need to develop *scalable oversight* methods that enable humans to supervise their outputs, which may be difficult even if the supervisors are themselves skilled and knowledgeable. The difficulty of GPQA both for skilled non-experts and frontier AI systems should enable realistic scalable oversight experiments, which we hope can help devise ways for human experts to reliably get truthful information from AI systems that surpass human capabilities.
Homepage: `https://github.com/idavidrein/gpqa/tree/main`
### Citation
```
@misc{rein2023gpqa,
title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
year={2023},
eprint={2311.12022},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
```
This dataset is gated, so you will have to accept the terms of use at https://huggingface.co/datasets/Idavidrein/gpqa and login via `huggingface-cli login` using your HF Hub token before running this task.
### Groups and Tasks
#### Groups
* `gpqa`
#### Tasks
* `gpqa_{main, diamond, extended}_zeroshot`
* `gpqa_{main, diamond, extended}_n_shot`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import yaml
from tqdm import tqdm
def main() -> None:
    """Generate one n-shot task YAML per GPQA subset.

    Writes ``gpqa_{subset}_n_shot.yaml`` into the current directory for each
    subset, each file including the shared ``_gpqa_n_shot_yaml`` template.
    """
    subset = ["extended", "diamond", "experts", "main"]
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_n_shot.yaml"
        # NOTE(review): the original wrapped this in `try/except FileExistsError`,
        # but mode "w" creates or truncates the file and can never raise
        # FileExistsError, so that handler was dead code. Using `with` also
        # guarantees the file is closed even if yaml.dump fails.
        with open(file_name, "w") as f:
            f.write("# Generated by _generate_configs.py\n")
            yaml.dump(
                {
                    "include": "_gpqa_n_shot_yaml",
                    "task": f"gpqa_{task}_n_shot",
                    "dataset_name": f"gpqa_{task}",
                },
                f,
            )
if __name__ == "__main__":
main()
dataset_path: Idavidrein/gpqa
group: gpqa
output_type: multiple_choice
process_docs: !function utils.process_docs
training_split: train
# Because huggingface dataset only has train split
validation_split: train
test_split: null
description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:"
doc_to_target: answer
doc_to_choice: ["(A)", "(B)", "(C)", "(D)"]
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment