Commit 16c4afc6 authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into toxicity

parents 7b376ae1 176d5a26
# Task config: EleutherAI arithmetic, 4-digit subtraction (4ds).
# Inherits shared settings from arithmetic_1dc.yaml; keys below override it.
group:
- arithmetic
include: arithmetic_1dc.yaml
task: arithmetic_4ds
dataset_path: EleutherAI/arithmetic
dataset_name: arithmetic_4ds
output_type: loglikelihood
validation_split: validation
# No test split in this dataset; evaluation runs on validation only.
test_split: null
template_aliases: ""
# Prompt is the raw problem context; target is the expected completion string.
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
# Task config: EleutherAI arithmetic, 5-digit addition (5da).
# Inherits shared settings from arithmetic_1dc.yaml; keys below override it.
group:
- arithmetic
include: arithmetic_1dc.yaml
task: arithmetic_5da
dataset_path: EleutherAI/arithmetic
dataset_name: arithmetic_5da
output_type: loglikelihood
validation_split: validation
# No test split in this dataset; evaluation runs on validation only.
test_split: null
template_aliases: ""
# Prompt is the raw problem context; target is the expected completion string.
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
# Task config: EleutherAI arithmetic, 5-digit subtraction (5ds).
# Inherits shared settings from arithmetic_1dc.yaml; keys below override it.
group:
- arithmetic
include: arithmetic_1dc.yaml
task: arithmetic_5ds
dataset_path: EleutherAI/arithmetic
dataset_name: arithmetic_5ds
output_type: loglikelihood
validation_split: validation
# No test split in this dataset; evaluation runs on validation only.
test_split: null
template_aliases: ""
# Prompt is the raw problem context; target is the expected completion string.
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
# Group config: "pythia" evaluation suite — a bundle of existing task names
# run together under one group label.
group: pythia
task:
- lambada_openai
- wikitext
- piqa
- sciq
- wsc
- winogrande
# Wildcard entry: expands to every registered task whose name matches
# the "arc_*" pattern (glob is resolved by the harness, not by YAML).
- arc_*
# Tasks below are intentionally disabled; kept for reference.
# - logiqa
# - blimp_*
# - hendrycksTest*
# Group config: "t0_eval" — inline task definitions (rather than task-name
# references) evaluated with promptsource prompt templates.
# NOTE(review): every entry except super_glue/cb is currently commented out;
# only the cb (CommitmentBank) NLI task is active in this group.
group: t0_eval
task:
# # Coreference Resolution
# - dataset_path: super_glue
#   dataset_name: wsc.fixed
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
# # Coreference Resolution
# - dataset_path: winogrande
#   dataset_name: winogrande_xl
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
  # Natural Language Inference — the only active task in this group.
  # "promptsource:*" applies every promptsource template for this dataset.
  - dataset_path: super_glue
    dataset_name: cb
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Natural Language Inference
  # - dataset_path: super_glue
  #   dataset_name: rte
  #   use_prompt: promptsource:*
  #   training_split: train
  #   validation_split: validation
  #   metric_list:
  #     - metric: exact_match
  #       aggregation: mean
  #       higher_is_better: true
  #       ignore_case: true
  #       ignore_punctuation: true
# # Natural Language Inference
# # - dataset_path: anli
# #   use_prompt: promptsource:*
# #   training_split: train_r1
# #   validation_split: dev_r1
# # Sentence Completion
# - dataset_path: super_glue
#   dataset_name: copa
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
# # Natural Language Inference
# - dataset_path: hellaswag
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
# # Word Sense Disambiguation
# - dataset_path: super_glue
#   dataset_name: wic
#   use_prompt: promptsource:*
#   training_split: train
#   validation_split: validation
#   metric_list:
#     - metric: exact_match
#       aggregation: mean
#       higher_is_better: true
#       ignore_case: true
#       ignore_punctuation: true
group:
- hendrycks_ethics
task: ethics_cm
# NOTE(review): this key was duplicated (hails/hendrycks_ethics vs
# EleutherAI/hendrycks_ethics) — a merge artifact; most YAML parsers silently
# take the last value, so the effective path is kept below. Sibling ethics
# configs in this commit still use hails/hendrycks_ethics — confirm which
# namespace is canonical and align them.
dataset_path: EleutherAI/hendrycks_ethics
dataset_name: commonsense
output_type: multiple_choice
training_split: train
......
include: commonsense.yaml
task: ethics_deontology
dataset_path: hails/hendrycks_ethics
dataset_name: deontology
doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}} {{excuse.rstrip()}}\"\nAnswer:"
doc_to_target: label
......
......@@ -3,6 +3,5 @@ group:
- hendrycks_ethics
task: ethics_justice
dataset_name: justice
output_type: multiple_choice
doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:"
# TODO: impl. exact match for this and deontology
......@@ -2,11 +2,7 @@ include: commonsense.yaml
group:
- hendrycks_ethics
task: ethics_utilitarianism
dataset_path: hails/hendrycks_ethics
dataset_name: utilitarianism
output_type: multiple_choice
training_split: train
test_split: test
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: ['no', 'yes']
......
......@@ -7,7 +7,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name: default
output_type: loglikelihood
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -8,7 +8,6 @@ dataset_name: null
output_type: loglikelihood
validation_split: validation
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -6,7 +6,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name: default
output_type: loglikelihood
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}} ____. ->"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -7,7 +7,6 @@ dataset_name: null
output_type: loglikelihood
validation_split: validation
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}} ____. ->"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -7,7 +7,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name: en
output_type: loglikelihood
test_split: test
template_aliases: ""
doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
doc_to_target: "{{' '+text.split(' ')[-1]}}"
should_decontaminate: true
......
......@@ -3,11 +3,10 @@ group:
- perplexity
- loglikelihood_rolling
task: pile_arxiv
# NOTE(review): this key was duplicated (EleutherAI/the_pile vs
# EleutherAI/pile) — a merge artifact; parsers take the last value, so the
# effective path is kept below. Confirm the canonical dataset namespace.
dataset_path: EleutherAI/pile
dataset_name: pile_arxiv
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
......
group:
- super-glue-lm-eval-v1
# Duplicate "task" key removed (both spellings named the same task).
task: boolq
dataset_path: super_glue
dataset_name: boolq
output_type: multiple_choice
......
......@@ -5,11 +5,15 @@ dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "cb hypothesis: {{hypothesis}} premise {{premise}}"
# Duplicate "doc_to_target" key removed — kept the label-index form, which
# pairs with the doc_to_choice list below (the old inline Jinja answer list
# duplicated the same choices).
doc_to_target: label
doc_to_choice: ['entailment', 'contradiction', 'neutral']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
- metric: f1
aggregation: !function "aggregate.cb_multi_fi"
......@@ -5,8 +5,10 @@ dataset_path: super_glue
dataset_name: copa
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} question: {{question}}"
# Duplicate "doc_to_target" key removed — kept the label-index form, which
# pairs with the doc_to_choice list below (the old inline Jinja answer list
# duplicated the same choices).
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
- metric: exact_match
aggregation: mean
......
# Task config: super_glue/multirc evaluated with a single named promptsource
# template ("I was going to say…"); the task name mirrors the template name.
group:
- super-glue-promptsource
task: "I was going to say…"
dataset_path: super_glue
dataset_name: multirc
training_split: train
validation_split: validation
# Selects exactly one promptsource template by name (not the "*" wildcard).
use_prompt: "promptsource:I was going to say…"
metric_list:
- metric: exact_match
  aggregation: mean
  higher_is_better: true
  ignore_case: true
  ignore_punctuation: true
# Variant task: inherits everything from promptsource-00.yaml and overrides
# only the task name and the promptsource template selection.
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "Would it be good to answer…"
use_prompt: "promptsource:Would it be good to answer…"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment