Unverified commit 9822b06e, authored by Lintang Sutawika and committed by GitHub

Merge branch 'main' into weight_by_size

parents 51f27158 b177c82c
group: flan_held_in
task:
  - include: flan/yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: boolq
    use_prompt: flan/prompt_templates/boolq.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: rte
    use_prompt: flan/prompt_templates/rte.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r1
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r1
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r2
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r2
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r3
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r3
  - include: flan/yaml_templates/held_in_template_yaml
    task: arc_easy
    dataset_path: ai2_arc
    dataset_name: ARC-Easy
    use_prompt: flan/prompt_templates/arc.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    task: arc_challenge
    dataset_path: ai2_arc
    dataset_name: ARC-Challenge
    use_prompt: flan/prompt_templates/arc.yaml:*
    validation_split: validation
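A group defined this way can be selected by name just like an individual task. The snippet below is a minimal sketch (not part of this commit) of running the `flan_held_in` group through the harness's Python API; the checkpoint name is only a placeholder.

```python
# Sketch only: evaluate the flan_held_in group via the Python API.
# Assumes lm-eval is installed; the model checkpoint is a placeholder.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=google/flan-t5-small",
    tasks=["flan_held_in"],
    batch_size=8,
)
print(results["results"])
```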
 group: flan_held_out
 task:
   # BBH
-  - bbh_flan_zeroshot
-  - bbh_flan_fewshot
-  - bbh_flan_cot_fewshot
-  - bbh_flan_cot_zeroshot
+  - bbh_zeroshot
+  - bbh_fewshot
+  - bbh_cot_fewshot
+  - bbh_cot_zeroshot
   # MMLU
   - mmlu
   - mmlu_flan_n_shot_generative
...
group: flan_rte
task:
  - include: yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: rte
    use_prompt: prompt_templates/rte.yaml:*
    validation_split: validation
# Flan Prompt Templates
prompts:
  "template-0":
    doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-1":
    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-2":
    doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-3":
    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-4":
    doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-5":
    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-6":
    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-7":
    doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-8":
    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
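The `doc_to_text` and `doc_to_target` fields above are Jinja templates rendered against each document. As a rough sketch of that rendering step (assuming `jinja2` is installed and using the premise/hypothesis/label field names from the templates; the example document is made up), template-2 behaves like this:

```python
# Rough sketch of how one of the templates above renders a document.
from jinja2 import Template

doc = {
    "premise": "The city council met on Tuesday.",
    "hypothesis": "A meeting took place.",
    "label": 0,  # index 0 maps to "Yes" in the option list above
}

doc_to_text = Template(
    "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}"
    "\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
)
doc_to_target = Template("{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}")

print(doc_to_text.render(**doc))
print(doc_to_target.render(**doc))  # -> Yes
```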
# Flan Prompt Templates
prompts:
  "template-0":
    doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-1":
    doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-2":
    doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-3":
    doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-4":
    doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-5":
    doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-6":
    doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
# Flan Prompt Templates
prompts:
  "template-0":
    doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-1":
    doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-2":
    doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-3":
    doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-4":
    doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-5":
    doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-6":
    doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-7":
    doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-8":
    # doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
    doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-9":
    doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
# Flan Prompt Templates
prompts:
  "template-0":
    doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-1":
    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-2":
    doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-3":
    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-4":
    doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-5":
    doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-6":
    doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-7":
    doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-8":
    doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
@@ -5,19 +5,13 @@ task:
   - medqa_4options
   - task: mmlu_anatomy
     task_alias: "anatomy (mmlu)"
-    group_alias: null
   - task: mmlu_clinical_knowledge
     task_alias: "clinical_knowledge (mmlu)"
-    group_alias: null
   - task: mmlu_college_medicine
     task_alias: "college_medicine (mmlu)"
-    group_alias: null
   - task: mmlu_medical_genetics
     task_alias: "medical_genetics (mmlu)"
-    group_alias: null
   - task: mmlu_professional_medicine
     task_alias: "professional_medicine (mmlu)"
-    group_alias: null
   - task: mmlu_college_biology
     task_alias: "college_biology (mmlu)"
-    group_alias: null
import os
import yaml

all_subtasks = [
    "abstract_narrative_understanding",
    "anachronisms",
...
@@ -8,10 +8,9 @@ Requires the installation of
 `pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
 and is included so that the bigbench dependency can be avoided.
 """
-from tqdm import tqdm
-import datasets
 import bigbench.api.util as bb_utils
+import datasets
+from tqdm import tqdm

 all_task_names = bb_utils.get_all_json_task_names()
...
import yaml

all_subtasks = [
    "adjunct_island",
    "anaphor_gender_agreement",
...
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import os
import yaml
import argparse import argparse
import os
import yaml
from tqdm import tqdm from tqdm import tqdm
from lm_eval.logger import eval_logger from lm_eval.logger import eval_logger
SUBJECTS = { SUBJECTS = {
"computer_network": "计算机网络", "computer_network": "计算机网络",
"operating_system": "操作系统", "operating_system": "操作系统",
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import os
import yaml
import argparse import argparse
import os
import yaml
from tqdm import tqdm from tqdm import tqdm
from lm_eval.logger import eval_logger from lm_eval.logger import eval_logger
SUBJECTS = { SUBJECTS = {
"agronomy": "农学", "agronomy": "农学",
"anatomy": "解剖学", "anatomy": "解剖学",
......
 #!/usr/bin/python
+import math
 import re
 import sys
-import math
 import xml.sax.saxutils
-from typing import List, Pattern, Tuple, Union, Dict, Any, Optional
+from typing import Any, Dict, List, Optional, Pattern, Tuple, Union

 """
 This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
@@ -60,7 +60,7 @@ def normalize(s):
     # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
     if nonorm:
         return s.split()
-    if type(s) is not str:
+    if not isinstance(s, str):
         s = " ".join(s)
     # language-independent part:
     for pattern, replace in normalize1:
...
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import os
import yaml
import argparse import argparse
import os
import yaml
from tqdm import tqdm from tqdm import tqdm
from lm_eval.logger import eval_logger from lm_eval.logger import eval_logger
SUBSETS = ["WR", "GR", "RCS", "RCSS", "RCH", "LI"] SUBSETS = ["WR", "GR", "RCS", "RCSS", "RCH", "LI"]
......
@@ -4,6 +4,7 @@ import string
 import numpy as np
 from scipy.optimize import linear_sum_assignment

 _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
...
# GPQA
### Paper
Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark
Abstract: https://arxiv.org/abs/2311.12022
We present GPQA, a challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. We ensure that the questions are high-quality and extremely difficult: experts who have or are pursuing PhDs in the corresponding domains reach 65% accuracy (74% when discounting clear mistakes the experts identified in retrospect), while highly skilled non-expert validators only reach 34% accuracy, despite spending on average over 30 minutes with unrestricted access to the web (i.e., the questions are “Google-proof”). The questions are also difficult for state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving 39% accuracy. If we are to use future AI systems to help us answer very hard questions—for example, when developing new scientific knowledge—we need to develop *scalable oversight* methods that enable humans to supervise their outputs, which may be difficult even if the supervisors are themselves skilled and knowledgeable. The difficulty of GPQA both for skilled non-experts and frontier AI systems should enable realistic scalable oversight experiments, which we hope can help devise ways for human experts to reliably get truthful information from AI systems that surpass human capabilities.
Homepage: `https://github.com/idavidrein/gpqa/tree/main`
### Citation
```
@misc{rein2023gpqa,
title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
year={2023},
eprint={2311.12022},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
```
This dataset is gated, so you will have to accept the terms of use at https://huggingface.co/datasets/Idavidrein/gpqa and log in via `huggingface-cli login` with your HF Hub token before running this task.
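If you prefer to authenticate from Python, the sketch below is one way to do it (the token value is a placeholder, and it assumes you have already accepted the terms on the dataset page):

```python
# Sketch only: authenticate to the Hub and load the gated GPQA data.
from datasets import load_dataset
from huggingface_hub import login

login(token="hf_xxx")  # equivalent to running `huggingface-cli login` once
gpqa = load_dataset("Idavidrein/gpqa", "gpqa_main")
print(gpqa["train"][0].keys())
```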
### Groups and Tasks
#### Groups
* `gpqa`
#### Tasks
* `gpqa_{main, diamond, extended}_zeroshot`
* `gpqa_{main, diamond, extended}_n_shot`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import yaml
from tqdm import tqdm

def main() -> None:
    # Write one generated config per GPQA subset, each including the shared
    # _gpqa_n_shot_yaml base and overriding only the task and dataset names.
    subset = ["extended", "diamond", "experts", "main"]
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_n_shot.yaml"
        try:
            with open(file_name, "w") as f:
                f.write("# Generated by _generate_configs.py\n")
                yaml.dump(
                    {
                        "include": "_gpqa_n_shot_yaml",
                        "task": f"gpqa_{task}_n_shot",
                        "dataset_name": f"gpqa_{task}",
                    },
                    f,
                )
        except FileExistsError:
            pass


if __name__ == "__main__":
    main()
dataset_path: Idavidrein/gpqa
group: gpqa
output_type: multiple_choice
process_docs: !function utils.process_docs
training_split: train
# Because the Hugging Face dataset only has a train split
validation_split: train
test_split: null
description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:"
doc_to_target: answer
doc_to_choice: ["(A)", "(B)", "(C)", "(D)"]
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
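For orientation, `process_docs` (referenced above via `!function utils.process_docs`) has to produce the `choice1`–`choice4` and `answer` fields this config consumes. The sketch below shows one plausible implementation given the raw GPQA columns (`Correct Answer`, `Incorrect Answer 1`–`3`); the actual `utils.py` shipped with the task may differ.

```python
# Plausible sketch, not the shipped implementation: shuffle the one correct and
# three incorrect answers into choice1..choice4 and record the correct letter.
import random

import datasets


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _shuffle(doc):
        choices = [
            doc["Incorrect Answer 1"],
            doc["Incorrect Answer 2"],
            doc["Incorrect Answer 3"],
            doc["Correct Answer"],
        ]
        random.shuffle(choices)
        correct_idx = choices.index(doc["Correct Answer"])
        out = {f"choice{i + 1}": c for i, c in enumerate(choices)}
        out["answer"] = f"({'ABCD'[correct_idx]})"
        return out

    return dataset.map(_shuffle)
```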
# Generated by _generate_configs.py
dataset_name: gpqa_diamond
include: _gpqa_n_shot_yaml
task: gpqa_diamond_n_shot