Unverified commit 6769119f, authored by Hailey Schoelkopf and committed by GitHub
Browse files

Merge pull request #816 from EleutherAI/flan-benchmark

[Refactor] Flan benchmark
parents 4824a832 7d5e511c
# Task config for `code2text_javascript` (group `codexglue_code2text`),
# reading the dataset `CM/codexglue_code2text_javascript`.
# Prompts and targets are built by utils.doc_to_text / utils.doc_to_target,
# and outputs are scored with bleu.smoothed_bleu_4, aggregated with `mean`
# (higher is better).
# NOTE(review): indentation appears to have been stripped in this view;
# num_beams / max_length / until presumably nest under generation_kwargs, and
# aggregation / higher_is_better under the metric entry — confirm against the
# original file.
group:
- codexglue_code2text
task: code2text_javascript
dataset_path: CM/codexglue_code2text_javascript
training_split: train
validation_split: validation
test_split: test
output_type: greedy_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
# Task config for `code2text_php` (group `codexglue_code2text`),
# reading the dataset `CM/codexglue_code2text_php`.
# Prompts and targets are built by utils.doc_to_text / utils.doc_to_target,
# and outputs are scored with bleu.smoothed_bleu_4, aggregated with `mean`
# (higher is better).
# NOTE(review): indentation appears to have been stripped in this view;
# num_beams / max_length / until presumably nest under generation_kwargs, and
# aggregation / higher_is_better under the metric entry — confirm against the
# original file.
group:
- codexglue_code2text
task: code2text_php
dataset_path: CM/codexglue_code2text_php
training_split: train
validation_split: validation
test_split: test
output_type: greedy_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
# Task config for `code2text_python` (group `codexglue_code2text`),
# reading the dataset `CM/codexglue_code2text_python`.
# Prompts and targets are built by utils.doc_to_text / utils.doc_to_target,
# and outputs are scored with bleu.smoothed_bleu_4, aggregated with `mean`
# (higher is better).
# NOTE(review): indentation appears to have been stripped in this view;
# num_beams / max_length / until presumably nest under generation_kwargs, and
# aggregation / higher_is_better under the metric entry — confirm against the
# original file.
group:
- codexglue_code2text
task: code2text_python
dataset_path: CM/codexglue_code2text_python
training_split: train
validation_split: validation
test_split: test
output_type: greedy_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
# Task config for `code2text_ruby` (group `codexglue_code2text`),
# reading the dataset `CM/codexglue_code2text_ruby`.
# Prompts and targets are built by utils.doc_to_text / utils.doc_to_target,
# and outputs are scored with bleu.smoothed_bleu_4, aggregated with `mean`
# (higher is better).
# NOTE(review): indentation appears to have been stripped in this view;
# num_beams / max_length / until presumably nest under generation_kwargs, and
# aggregation / higher_is_better under the metric entry — confirm against the
# original file.
group:
- codexglue_code2text
task: code2text_ruby
dataset_path: CM/codexglue_code2text_ruby
training_split: train
validation_split: validation
test_split: test
output_type: greedy_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
def doc_to_text(doc):
    """Build the single-line model input from a document's code tokens.

    The entries of ``doc["code_tokens"]`` are joined with single spaces,
    newlines are turned into spaces, and every run of whitespace is then
    collapsed to one space. The result has no leading or trailing whitespace.
    """
    joined = " ".join(doc["code_tokens"])
    single_line = joined.replace("\n", " ")
    # split() with no argument discards leading/trailing whitespace and
    # treats any whitespace run as one separator.
    return " ".join(single_line.split())
def doc_to_target(doc):
    """Build the single-line reference text from a document's docstring tokens.

    The entries of ``doc["docstring_tokens"]`` are joined with single spaces,
    newline characters are deleted, and every remaining run of whitespace is
    collapsed to one space. The result has no leading or trailing whitespace.
    """
    joined = " ".join(doc["docstring_tokens"])
    # Newlines are removed outright (not replaced by a space), so text on
    # either side of a newline inside one token ends up fused together.
    without_newlines = joined.replace("\n", "")
    return " ".join(without_newlines.split())
This source diff could not be displayed because it is too large. You can view the blob instead.
"""
Take in a base YAML and write one per-subject YAML that includes it.
"""
import os
import yaml
import argparse
from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
# Subject names to generate configs for. The `__main__` block iterates this
# list and writes one YAML per entry, using the entry as `dataset_name`, as the
# suffix of the task name (`mmlu_<subject>`), and — with underscores turned
# into spaces — in the default description text.
SUBJECTS = [
"abstract_algebra",
"anatomy",
"astronomy",
"business_ethics",
"clinical_knowledge",
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_medicine",
"college_physics",
"computer_security",
"conceptual_physics",
"econometrics",
"electrical_engineering",
"elementary_mathematics",
"formal_logic",
"global_facts",
"high_school_biology",
"high_school_chemistry",
"high_school_computer_science",
"high_school_european_history",
"high_school_geography",
"high_school_government_and_politics",
"high_school_macroeconomics",
"high_school_mathematics",
"high_school_microeconomics",
"high_school_physics",
"high_school_psychology",
"high_school_statistics",
"high_school_us_history",
"high_school_world_history",
"human_aging",
"human_sexuality",
"international_law",
"jurisprudence",
"logical_fallacies",
"machine_learning",
"management",
"marketing",
"medical_genetics",
"miscellaneous",
"moral_disputes",
"moral_scenarios",
"nutrition",
"philosophy",
"prehistory",
"professional_accounting",
"professional_law",
"professional_medicine",
"professional_psychology",
"public_relations",
"security_studies",
"sociology",
"us_foreign_policy",
"virology",
"world_religions",
]
def parse_args():
    """Parse the command-line options of the per-subject YAML generator.

    Returns:
        argparse.Namespace with ``base_yaml_path`` (required),
        ``save_prefix_path`` (default ``"flan"``), ``cot_prompt_path``
        (default ``None``) and ``task_prefix`` (default ``""``).
    """
    parser = argparse.ArgumentParser(
        description=(
            "Generate one task YAML per subject, each including a shared "
            "base YAML."
        )
    )
    parser.add_argument(
        "--base_yaml_path",
        required=True,
        help=(
            "Path to the base YAML; its file name is written to the "
            "'include' key of every generated file."
        ),
    )
    parser.add_argument(
        "--save_prefix_path",
        default="flan",
        help=(
            "Output path prefix; each file is saved as "
            "'<prefix>_<subject>.yaml'."
        ),
    )
    parser.add_argument(
        "--cot_prompt_path",
        default=None,
        help=(
            "Optional JSON file mapping each subject to the description to "
            "use instead of the default one."
        ),
    )
    parser.add_argument(
        "--task_prefix",
        default="",
        help=(
            "Optional infix for task names: 'mmlu_<task_prefix>_<subject>' "
            "instead of 'mmlu_<subject>'."
        ),
    )
    return parser.parse_args()
# Script entry point: write one "<save_prefix_path>_<subject>.yaml" per entry
# in SUBJECTS, each including the base YAML by file name.
if __name__ == "__main__":
    args = parse_args()

    # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]

    # The parsed content is not used below; loading it still makes the script
    # fail up front on a missing or unparsable base YAML.
    # NOTE(review): yaml.full_load is kept as in the original; the base YAML is
    # assumed to be a trusted local file — confirm before pointing this at
    # untrusted input.
    with open(args.base_yaml_path, encoding="utf-8") as f:
        yaml.full_load(f)

    # Optional per-subject descriptions (subject name -> description string).
    cot_descriptions = None
    if args.cot_prompt_path is not None:
        import json

        # Read as UTF-8 explicitly so non-ASCII prompts do not depend on the
        # platform's locale encoding.
        with open(args.cot_prompt_path, encoding="utf-8") as f:
            cot_descriptions = json.load(f)

    for subject in tqdm(SUBJECTS):
        if cot_descriptions is not None:
            description = cot_descriptions[subject]
        else:
            description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"

        if args.task_prefix != "":
            task_name = f"mmlu_{args.task_prefix}_{subject}"
        else:
            task_name = f"mmlu_{subject}"

        yaml_dict = {
            "include": base_yaml_name,
            "task": task_name,
            "dataset_name": subject,
            "description": description,
        }

        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        # allow_unicode=True makes yaml.dump emit raw non-ASCII characters, so
        # the file must be opened as UTF-8; the locale default encoding can
        # raise UnicodeEncodeError on such text.
        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
            yaml.dump(
                yaml_dict,
                yaml_file,
                width=float("inf"),
                allow_unicode=True,
                default_style='"',
            )
# Shared MMLU settings: group `mmlu` on dataset `cais/mmlu`, evaluated on the
# `test` split with few-shot examples taken from `dev` using the `first_n`
# sampler. Each document is rendered as the question followed by options A-D
# and "Answer:", and scored as multiple choice with acc and acc_norm
# (mean, higher is better).
# NOTE(review): presumably this is the `_default_template_yaml` that the
# per-subject configs below include — confirm the file name.
# NOTE(review): indentation appears to have been stripped in this view;
# sampler presumably nests under fewshot_config, and aggregation /
# higher_is_better under each metric entry — confirm against the original file.
group: mmlu
dataset_path: cais/mmlu
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
# Per-subject configs follow, one four-key mapping per subject (each is a
# separate file in this view). Every mapping includes `_default_template_yaml`
# and sets the subject's `dataset_name`, its `description` text, and the task
# name `mmlu_<subject>`.
"dataset_name": "abstract_algebra"
"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_abstract_algebra"
"dataset_name": "anatomy"
"description": "The following are multiple choice questions (with answers) about anatomy.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_anatomy"
"dataset_name": "astronomy"
"description": "The following are multiple choice questions (with answers) about astronomy.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_astronomy"
"dataset_name": "business_ethics"
"description": "The following are multiple choice questions (with answers) about business ethics.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_business_ethics"
"dataset_name": "clinical_knowledge"
"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_clinical_knowledge"
"dataset_name": "college_biology"
"description": "The following are multiple choice questions (with answers) about college biology.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_college_biology"
"dataset_name": "college_chemistry"
"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_college_chemistry"
"dataset_name": "college_computer_science"
"description": "The following are multiple choice questions (with answers) about college computer science.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_college_computer_science"
"dataset_name": "college_mathematics"
"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_college_mathematics"
"dataset_name": "college_medicine"
"description": "The following are multiple choice questions (with answers) about college medicine.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_college_medicine"
"dataset_name": "college_physics"
"description": "The following are multiple choice questions (with answers) about college physics.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_college_physics"
"dataset_name": "computer_security"
"description": "The following are multiple choice questions (with answers) about computer security.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_computer_security"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment