Commit e1ae8a2f authored by Herbie Bradley

Merge remote-tracking branch 'origin/big-refactor' into calibration

parents 50e99bd7 30936bc7
group:
  - codexglue_code2text
task: code2text_php
dataset_path: CM/codexglue_code2text_php
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
  num_beams: 10
  max_length: 128
  until:
    - "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True

group:
  - codexglue_code2text
task: code2text_python
dataset_path: CM/codexglue_code2text_python
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
  num_beams: 10
  max_length: 128
  until:
    - "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True

group:
  - codexglue_code2text
task: code2text_ruby
dataset_path: CM/codexglue_code2text_ruby
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
  num_beams: 10
  max_length: 128
  until:
    - "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
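All three subtasks score generations with the repo's `bleu.smoothed_bleu_4` helper. For intuition, here is a minimal sketch of a smoothed sentence-level BLEU-4 using NLTK; the function name mirrors the metric, but the choice of smoothing (`method4`) is an assumption, and the harness's own helper may differ in its details:

# A minimal sketch of smoothed sentence-level BLEU-4 (illustrative only;
# the repo's bleu.smoothed_bleu_4 may use a different smoothing scheme).
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu


def smoothed_bleu_4(reference: str, hypothesis: str) -> float:
    # Whitespace tokenization matches how doc_to_target joins tokens below.
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()
    return sentence_bleu(
        [ref_tokens],
        hyp_tokens,
        weights=(0.25, 0.25, 0.25, 0.25),  # uniform weights up to 4-grams
        smoothing_function=SmoothingFunction().method4,
    )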
def doc_to_text(doc):
    # Join code tokens into a single whitespace-normalized line.
    inputs = " ".join(doc["code_tokens"]).replace("\n", " ")
    inputs = " ".join(inputs.strip().split())
    return inputs


def doc_to_target(doc):
    # Join docstring tokens into a single whitespace-normalized line.
    targets = " ".join(doc["docstring_tokens"]).replace("\n", "")
    targets = " ".join(targets.strip().split())
    return targets
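As a quick illustration of the two helpers above, a hypothetical CodeXGLUE-style record (field values invented) and the flattened strings they produce:

# Hypothetical example document; field values are illustrative only.
doc = {
    "code_tokens": ["def", "add", "(", "a", ",", "b", ")", ":", "return", "a", "+", "b"],
    "docstring_tokens": ["Add", "two", "numbers", "."],
}
print(doc_to_text(doc))    # -> "def add ( a , b ) : return a + b"
print(doc_to_target(doc))  # -> "Add two numbers ."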
task: coqa
dataset_path: EleutherAI/coqa
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
@@ -93,10 +93,9 @@ All tasks evaluate the percentage of more-stereotypical sentences that are rated
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
-* [x] The original paper does not for causal language models, so
+* [x] The original paper does not for causal language models, so this is a novel formulation of the task for autoregressive LMs.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
* [x] This matches the evaluations performed in the [Pythia paper](https://arxiv.org/abs/2304.01373).
group: csatqa
dataset_path: EleutherAI/csatqa
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question}}"
doc_to_choice: "{{choices}}"
doc_to_target: "{{gold}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBSETS = ["WR", "GR", "RCS", "RCSS", "RCH", "LI"]
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="csatqa")
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
for name in tqdm(SUBSETS):
yaml_dict = {
"include": base_yaml_name,
"task": f"csatqa_{args.task_prefix}_{name}"
if args.task_prefix != ""
else f"csatqa_{name.lower()}",
"dataset_name": name,
}
file_save_path = args.save_prefix_path + f"_{name.lower()}.yaml"
eval_logger.info(f"Saving yaml for subset {name} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
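For reference, a standalone sketch of what one loop iteration writes, assuming the base file is named `_default_csatqa_yaml` (as in the generated stubs below); `yaml.dump` sorts keys alphabetically, and `default_style='"'` double-quotes every scalar:

# Standalone sketch of one iteration's output for the GR subset.
import yaml

stub = {"include": "_default_csatqa_yaml", "task": "csatqa_gr", "dataset_name": "GR"}
print(yaml.dump(stub, width=float("inf"), allow_unicode=True, default_style='"'))
# "dataset_name": "GR"
# "include": "_default_csatqa_yaml"
# "task": "csatqa_gr"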
"dataset_name": "GR"
"include": "_default_csatqa_yaml"
"task": "csatqa_gr"
"dataset_name": "LI"
"include": "_default_csatqa_yaml"
"task": "csatqa_li"
"dataset_name": "RCH"
"include": "_default_csatqa_yaml"
"task": "csatqa_rch"
"dataset_name": "RCS"
"include": "_default_csatqa_yaml"
"task": "csatqa_rcs"
"dataset_name": "RCSS"
"include": "_default_csatqa_yaml"
"task": "csatqa_rcss"
"dataset_name": "WR"
"include": "_default_csatqa_yaml"
"task": "csatqa_wr"
import datasets


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc):
        # The prompt is in Korean (CSAT is the Korean college entrance exam).
        # The first line means "Read the following and choose the correct
        # answer," and the closing cue "주어진 문제의 정답은" means
        # "the answer to the given question is".
        instruction = f"""다음을 읽고 정답으로 알맞은 것을 고르시요.
### Context: {doc["context"]}
### Question: {doc["question"]}
### Options:
(1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
### Answer: 주어진 문제의 정답은"""
        out_doc = {
            "question": instruction,
            "choices": ["(1)", "(2)", "(3)", "(4)", "(5)"],
            # The dataset's gold field is 1-indexed; convert to 0-indexed.
            "gold": int(doc["gold"]) - 1,
        }
        return out_doc

    return dataset.map(_process_doc)
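For intuition, applying `process_docs` to a hypothetical one-row dataset (all field values invented) shows the 1-indexed `gold` field being remapped to a 0-indexed label over the `(1)`..`(5)` choices:

import datasets

# Hypothetical CSAT-QA record; every field value here is invented.
toy = datasets.Dataset.from_list([{
    "context": "...",
    "question": "...",
    "option#1": "A", "option#2": "B", "option#3": "C",
    "option#4": "D", "option#5": "E",
    "gold": 3,
}])
row = process_docs(toy)[0]
print(row["gold"], row["choices"][row["gold"]])  # -> 2 (3)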
task: drop
dataset_path: EleutherAI/drop
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs
@@ -3,7 +3,7 @@ group:
task: gsm8k_cot
dataset_path: gsm8k
dataset_name: main
-output_type: greedy_until
+output_type: generate_until
test_split: test
doc_to_text: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\n\
Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\n\
@@ -14,8 +14,7 @@ Q: There were nine computers in the server room. Five more computers were instal
Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\n\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.\n\n\
Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\n\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8.\n\n\
Q: {{question}}\n\nA:"
doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against
doc_to_target: " {{answer.split('### ')[-1].rstrip()}}"
metric_list:
  - metric: exact_match
    aggregation: mean
@@ -25,6 +24,8 @@ metric_list:
    regexes_to_ignore:
      - ","
      - "\\$"
+     - "(?s).*#### "
+     - "\n\n"
generation_kwargs:
  until:
    - "Q:"
@@ -37,5 +38,5 @@ filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)."
- function: "take_first"
group:
  - math_word_problems
-task: gsm8k_yaml
+task: gsm8k
dataset_path: gsm8k
dataset_name: main
-output_type: greedy_until
+output_type: generate_until
training_split: train
fewshot_split: train
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against
metric_list:
  - metric: exact_match
    aggregation: mean
@@ -19,7 +18,7 @@ metric_list:
    regexes_to_ignore:
      - ","
      - "\\$"
-     - ".*### "
+     - "(?s).*#### "
generation_kwargs:
  until:
    - "\n\n"
@@ -28,9 +27,9 @@ generation_kwargs:
  temperature: 0.0
repeats: 1
num_fewshot: 5
-# filter_list:
-#   - name: "get-answer"
-#     filter:
-#       - function: "regex"
-#         regex_pattern: "### (\\-?[0-9\\.\\,]+)"
-#       - function: "take_first"
+filter_list:
+  - name: "get-answer"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
@@ -9,7 +9,6 @@
# template_aliases: #"{% set answer_choices = range(1, 11)|list %}"
# doc_to_text: 'Activity: "{{activity}}"\nRating:'
# doc_to_target: "{{answer_choices[label]}}"
-# gold_alias: "{{label}}" # this will be cast to an int.
# metric_list:
# - metric: acc
# TODO: we want this to be implemented as a winograd_schema task type, actually
task: logieval
dataset_path: baber/logiqa2
dataset_name: logieval
-output_type: greedy_until
+output_type: generate_until
training_split: train
test_split: test
# Instructions + {content}