Commit 02e841ce authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

parents 90ad5db7 e74ec966
import collections
import re
import string
import datasets
import evaluate
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r"\b(un|une|des|le|la|les)\b", re.UNICODE)
return re.sub(regex, " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s:
return []
return normalize_answer(s).split()
# Exact match (the normalized answer exactly match the gold answer)
def exact(predictions, references):
return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))
# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
gold_toks = get_tokens(references[0])
pred_toks = get_tokens(predictions[0])
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(gold_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(gold_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1
def rouge1(items):
"""
# passthrough for efficiency
"""
return items
def rouge1_agg(items):
"""
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
rouge_scorer = evaluate.load("rouge")
return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"]
def is_included(items):
"""
# passthrough for efficiency
"""
if items[0] in items[1]:
return True
return False
def preprocess(text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
text = text.replace(" ", " ")
return text
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc):
ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
out_doc = {
"query": preprocess(doc["activity_label"] + ": " + ctx),
"choices": [preprocess(ending) for ending in doc["endings"]],
"gold": int(doc["label"]),
}
return out_doc
return dataset.map(_process_doc)
......@@ -35,6 +35,9 @@ This dataset is gated, so you will have to accept the terms of use at https://hu
* `gpqa_{main, diamond, extended}_zeroshot`
* `gpqa_{main, diamond, extended}_n_shot`
* `gpqa_{main, diamond, extended}_generative_n_shot`
* `gpqa_{main, diamond, extended}_cot_zeroshot`
* `gpqa_{main, diamond, extended}_cot_n_shot`
### Checklist
......
import yaml
from tqdm import tqdm
def main() -> None:
subset = ["extended", "diamond", "main"]
setting = "cot_n_shot"
for task in tqdm(subset):
file_name = f"gpqa_{task}_{setting}.yaml"
try:
with open(f"{file_name}", "w") as f:
f.write("# Generated by _generate_configs.py\n")
yaml.dump(
{
"include": f"_gpqa_{setting}_yaml",
"task": f"gpqa_{task}_{setting}",
"dataset_name": f"gpqa_{task}",
},
f,
)
except FileExistsError:
pass
if __name__ == "__main__":
main()
dataset_path: Idavidrein/gpqa
group: gpqa
output_type: generate_until
process_docs: !function utils.process_docs
training_split: train
# Because huggingface dataset only has train split
validation_split: train
test_split: null
description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nLet's think step by step: "
doc_to_target: answer
filter_list:
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(?=.)"
- function: "take_first"
- name: "flexible-extract"
filter:
- function: "multi_choice_regex"
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
version: 1.0
# Generated by _generate_configs.py
dataset_name: gpqa_diamond
include: _gpqa_cot_n_shot_yaml
task: gpqa_diamond_cot_n_shot
# Generated by _generate_configs.py
dataset_name: gpqa_extended
include: _gpqa_cot_n_shot_yaml
task: gpqa_extended_cot_n_shot
# Generated by _generate_configs.py
dataset_name: gpqa_main
include: _gpqa_cot_n_shot_yaml
task: gpqa_main_cot_n_shot
import random
import re
import datasets
def preprocess(text):
if text is None:
return " "
text = text.strip()
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
text = text.replace(" ", " ")
return text
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc):
choices = [
preprocess(doc["Incorrect Answer 1"]),
preprocess(doc["Incorrect Answer 2"]),
preprocess(doc["Incorrect Answer 3"]),
preprocess(doc["Correct Answer"]),
]
random.shuffle(choices)
correct_answer_index = choices.index(preprocess(doc["Correct Answer"]))
out_doc = {
"choice1": choices[0],
"choice2": choices[1],
"choice3": choices[2],
"choice4": choices[3],
"choices": [choices[0], choices[1], choices[2], choices[3]],
"answer": f"({chr(65 + correct_answer_index)})",
}
return out_doc
return dataset.map(_process_doc)
import yaml
from tqdm import tqdm
def main() -> None:
subset = ["extended", "diamond", "main"]
setting = "cot_zeroshot"
for task in tqdm(subset):
file_name = f"gpqa_{task}_{setting}.yaml"
try:
with open(f"{file_name}", "w") as f:
f.write("# Generated by _generate_configs.py\n")
yaml.dump(
{
"include": f"_gpqa_{setting}_yaml",
"task": f"gpqa_{task}_{setting}",
"dataset_name": f"gpqa_{task}",
},
f,
)
except FileExistsError:
pass
if __name__ == "__main__":
main()
dataset_path: Idavidrein/gpqa
group: gpqa
output_type: generate_until
process_docs: !function utils.process_docs
training_split: train
# Because huggingface dataset only has train split
validation_split: train
test_split: null
doc_to_text: "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nLet's think step by step: "
doc_to_target: answer
filter_list:
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(?=.)"
- function: "take_first"
- name: "flexible-extract"
filter:
- function: "multi_choice_regex"
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
num_fewshot: 0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
version: 1.0
# Generated by _generate_configs.py
dataset_name: gpqa_diamond
include: _gpqa_cot_zeroshot_yaml
task: gpqa_diamond_cot_zeroshot
# Generated by _generate_configs.py
dataset_name: gpqa_extended
include: _gpqa_cot_zeroshot_yaml
task: gpqa_extended_cot_zeroshot
# Generated by _generate_configs.py
dataset_name: gpqa_main
include: _gpqa_cot_zeroshot_yaml
task: gpqa_main_cot_zeroshot
import random
import re
import datasets
def preprocess(text):
if text is None:
return " "
text = text.strip()
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
text = text.replace(" ", " ")
return text
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc):
choices = [
preprocess(doc["Incorrect Answer 1"]),
preprocess(doc["Incorrect Answer 2"]),
preprocess(doc["Incorrect Answer 3"]),
preprocess(doc["Correct Answer"]),
]
random.shuffle(choices)
correct_answer_index = choices.index(preprocess(doc["Correct Answer"]))
out_doc = {
"choice1": choices[0],
"choice2": choices[1],
"choice3": choices[2],
"choice4": choices[3],
"choices": [choices[0], choices[1], choices[2], choices[3]],
"answer": f"({chr(65 + correct_answer_index)})",
}
return out_doc
return dataset.map(_process_doc)
import yaml
from tqdm import tqdm
def main() -> None:
subset = ["extended", "diamond", "main"]
setting = "generative_n_shot"
for task in tqdm(subset):
file_name = f"gpqa_{task}_{setting}.yaml"
try:
with open(f"{file_name}", "w") as f:
f.write("# Generated by _generate_configs.py\n")
yaml.dump(
{
"include": f"_gpqa_{setting}_yaml",
"task": f"gpqa_{task}_{setting}",
"dataset_name": f"gpqa_{task}",
},
f,
)
except FileExistsError:
pass
if __name__ == "__main__":
main()
dataset_path: Idavidrein/gpqa
group: gpqa
output_type: generate_until
process_docs: !function utils.process_docs
training_split: train
# Because huggingface dataset only has train split
validation_split: train
test_split: null
description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:"
doc_to_target: answer
filter_list:
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(?=.)"
- function: "take_first"
- name: "flexible-extract"
filter:
- function: "multi_choice_regex"
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
generation_kwargs:
until:
- "</s>"
- "Question:"
- "<|im_end|>"
temperature: 0.0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
version: 1.0
# Generated by _generate_configs.py
dataset_name: gpqa_diamond
include: _gpqa_generative_n_shot_yaml
task: gpqa_diamond_generative_n_shot
# Generated by _generate_configs.py
dataset_name: gpqa_extended
include: _gpqa_generative_n_shot_yaml
task: gpqa_extended_generative_n_shot
# Generated by _generate_configs.py
dataset_name: gpqa_main
include: _gpqa_generative_n_shot_yaml
task: gpqa_main_generative_n_shot
import random
import re
import datasets
def preprocess(text):
if text is None:
return " "
text = text.strip()
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
text = text.replace(" ", " ")
return text
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc):
choices = [
preprocess(doc["Incorrect Answer 1"]),
preprocess(doc["Incorrect Answer 2"]),
preprocess(doc["Incorrect Answer 3"]),
preprocess(doc["Correct Answer"]),
]
random.shuffle(choices)
correct_answer_index = choices.index(preprocess(doc["Correct Answer"]))
out_doc = {
"choice1": choices[0],
"choice2": choices[1],
"choice3": choices[2],
"choice4": choices[3],
"choices": [choices[0], choices[1], choices[2], choices[3]],
"answer": f"({chr(65 + correct_answer_index)})",
}
return out_doc
return dataset.map(_process_doc)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment