Unverified commit 6f700f98, authored by Hailey Schoelkopf, committed by GitHub
Browse files

Merge pull request #815 from EleutherAI/model-written-eval

[Refactor] Model written eval
parents c7b3f538 9f09016d
......@@ -5,3 +5,4 @@ maka
mor
te
ond
extraversion
......@@ -182,6 +182,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
assert args.output_path, "Specify --output_path"
eval_logger.info(f"Selected Tasks: {task_names}")
eval_logger.verbose = args.verbose
results = evaluator.simple_evaluate(
model=args.model,
......
import yaml
import inspect
import datasets
from tqdm import tqdm
def main() -> None:
    """Write one YAML task config per subset of the advanced_ai_risk dataset.

    Every configuration name reported by ``datasets.get_dataset_infos`` for
    ``EleutherAI/advanced_ai_risk`` gets a ``<subset>.yaml`` file in the
    current working directory. Each file starts with a "generated" marker
    comment, includes ``_template_yaml`` and sets ``task`` and
    ``dataset_name`` for that subset.

    Existing files with the same name are overwritten.
    """
    dataset_path = "EleutherAI/advanced_ai_risk"
    # Loop-invariant: task names are prefixed with the repository name,
    # i.e. the part of the dataset path after the last "/".
    task_prefix = dataset_path.split("/")[-1]

    # Iterating the returned mapping yields the subset names directly.
    for task in tqdm(datasets.get_dataset_infos(dataset_path)):
        config = {
            "include": "_template_yaml",
            "task": f"{task_prefix}_{task}",
            "dataset_name": task,
        }
        # Mode "w" truncates an existing file instead of raising
        # FileExistsError, so no handler for that exception is needed.
        with open(f"{task}.yaml", "w", encoding="utf-8") as f:
            f.write("# Generated by _generate_configs.py\n")
            yaml.dump(config, f)


if __name__ == "__main__":
    main()
# Shared settings for the advanced_ai_risk tasks. Per-subset config files
# pull this template in with `include: _template_yaml` and add their own
# `task` and `dataset_name`.
group: advanced_ai_risk
dataset_path: EleutherAI/advanced_ai_risk
output_type: multiple_choice
validation_split: validation
target_delimiter: ""
# Double braces are required for the `question` field to be substituted;
# with single braces the literal text "{question}" would end up in the prompt.
doc_to_text: "Human: {{question}}\n\nAssistant:"
# Presumably an index into doc_to_choice, making the behavior-matching
# answer (listed first) the target -- confirm against the task loader.
doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
should_decontaminate: true
# Query on the prompt field this config actually uses; `sentence_good` and
# `sentence_bad` are not referenced anywhere else in this task definition.
doc_to_decontamination_query: "{{question}}"
metric_list:
  - metric: acc
# Generated by _generate_configs.py
dataset_name: fewshot-coordinate-itself
include: _template_yaml
task: advanced_ai_risk_fewshot-coordinate-itself
# Generated by _generate_configs.py
dataset_name: fewshot-coordinate-other-ais
include: _template_yaml
task: advanced_ai_risk_fewshot-coordinate-other-ais
# Generated by _generate_configs.py
dataset_name: fewshot-coordinate-other-versions
include: _template_yaml
task: advanced_ai_risk_fewshot-coordinate-other-versions
# Generated by _generate_configs.py
dataset_name: fewshot-corrigible-less-HHH
include: _template_yaml
task: advanced_ai_risk_fewshot-corrigible-less-HHH
# Generated by _generate_configs.py
dataset_name: fewshot-corrigible-more-HHH
include: _template_yaml
task: advanced_ai_risk_fewshot-corrigible-more-HHH
# Generated by _generate_configs.py
dataset_name: fewshot-corrigible-neutral-HHH
include: _template_yaml
task: advanced_ai_risk_fewshot-corrigible-neutral-HHH
# Generated by _generate_configs.py
dataset_name: fewshot-myopic-reward
include: _template_yaml
task: advanced_ai_risk_fewshot-myopic-reward
# Generated by _generate_configs.py
dataset_name: fewshot-one-box-tendency
include: _template_yaml
task: advanced_ai_risk_fewshot-one-box-tendency
# Generated by _generate_configs.py
dataset_name: fewshot-power-seeking-inclination
include: _template_yaml
task: advanced_ai_risk_fewshot-power-seeking-inclination
# Generated by _generate_configs.py
dataset_name: fewshot-self-awareness-general-ai
include: _template_yaml
task: advanced_ai_risk_fewshot-self-awareness-general-ai
# Generated by _generate_configs.py
dataset_name: fewshot-self-awareness-good-text-model
include: _template_yaml
task: advanced_ai_risk_fewshot-self-awareness-good-text-model
# Generated by _generate_configs.py
dataset_name: fewshot-self-awareness-text-model
include: _template_yaml
task: advanced_ai_risk_fewshot-self-awareness-text-model
# Generated by _generate_configs.py
dataset_name: fewshot-self-awareness-training-architecture
include: _template_yaml
task: advanced_ai_risk_fewshot-self-awareness-training-architecture
# Generated by _generate_configs.py
dataset_name: fewshot-self-awareness-training-web-gpt
include: _template_yaml
task: advanced_ai_risk_fewshot-self-awareness-training-web-gpt
# Generated by _generate_configs.py
dataset_name: fewshot-survival-instinct
include: _template_yaml
task: advanced_ai_risk_fewshot-survival-instinct
# Generated by _generate_configs.py
dataset_name: fewshot-wealth-seeking-inclination
include: _template_yaml
task: advanced_ai_risk_fewshot-wealth-seeking-inclination
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment