Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
16c4afc6
Commit
16c4afc6
authored
Aug 03, 2023
by
lintangsutawika
Browse files
Merge branch 'big-refactor' of
https://github.com/EleutherAI/lm-evaluation-harness
into toxicity
parents
7b376ae1
176d5a26
Changes
248
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
117 additions
and
75 deletions
+117
-75
lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
+1
-13
lm_eval/tasks/arithmetic/arithmetic_5da.yaml
lm_eval/tasks/arithmetic/arithmetic_5da.yaml
+1
-13
lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
+1
-13
lm_eval/tasks/benchmarks/pythia.yaml
lm_eval/tasks/benchmarks/pythia.yaml
+12
-0
lm_eval/tasks/benchmarks/t0_eval.yaml
lm_eval/tasks/benchmarks/t0_eval.yaml
+91
-0
lm_eval/tasks/hendrycks_ethics/commonsense.yaml
lm_eval/tasks/hendrycks_ethics/commonsense.yaml
+1
-1
lm_eval/tasks/hendrycks_ethics/deontology.yaml
lm_eval/tasks/hendrycks_ethics/deontology.yaml
+0
-1
lm_eval/tasks/hendrycks_ethics/justice.yaml
lm_eval/tasks/hendrycks_ethics/justice.yaml
+0
-1
lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
+0
-4
lm_eval/tasks/lambada/lambada_openai.yaml
lm_eval/tasks/lambada/lambada_openai.yaml
+0
-1
lm_eval/tasks/lambada/lambada_standard.yaml
lm_eval/tasks/lambada/lambada_standard.yaml
+0
-1
lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml
lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml
+0
-1
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
+0
-1
lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml
lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml
+0
-1
lm_eval/tasks/pile/pile_arxiv.yaml
lm_eval/tasks/pile/pile_arxiv.yaml
+1
-2
lm_eval/tasks/super_glue/boolq/default.yaml
lm_eval/tasks/super_glue/boolq/default.yaml
+1
-1
lm_eval/tasks/super_glue/cb/t5-prompt.yaml
lm_eval/tasks/super_glue/cb/t5-prompt.yaml
+5
-1
lm_eval/tasks/super_glue/copa/t5-prompt.yaml
lm_eval/tasks/super_glue/copa/t5-prompt.yaml
+3
-1
lm_eval/tasks/super_glue/multirc/promptsource-00.yaml
lm_eval/tasks/super_glue/multirc/promptsource-00.yaml
+0
-14
lm_eval/tasks/super_glue/multirc/promptsource-01.yaml
lm_eval/tasks/super_glue/multirc/promptsource-01.yaml
+0
-5
No files found.
lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
View file @
16c4afc6
group
:
-
arithmetic
include
:
arithmetic_1dc.yaml
task
:
arithmetic_4ds
dataset_path
:
EleutherAI/arithmetic
dataset_name
:
arithmetic_4ds
output_type
:
loglikelihood
validation_split
:
validation
test_split
:
null
template_aliases
:
"
"
doc_to_text
:
"
{{context}}"
doc_to_target
:
"
{{completion}}"
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
lm_eval/tasks/arithmetic/arithmetic_5da.yaml
View file @
16c4afc6
group
:
-
arithmetic
include
:
arithmetic_1dc.yaml
task
:
arithmetic_5da
dataset_path
:
EleutherAI/arithmetic
dataset_name
:
arithmetic_5da
output_type
:
loglikelihood
validation_split
:
validation
test_split
:
null
template_aliases
:
"
"
doc_to_text
:
"
{{context}}"
doc_to_target
:
"
{{completion}}"
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
View file @
16c4afc6
group
:
-
arithmetic
include
:
arithmetic_1dc.yaml
task
:
arithmetic_5ds
dataset_path
:
EleutherAI/arithmetic
dataset_name
:
arithmetic_5ds
output_type
:
loglikelihood
validation_split
:
validation
test_split
:
null
template_aliases
:
"
"
doc_to_text
:
"
{{context}}"
doc_to_target
:
"
{{completion}}"
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
lm_eval/tasks/benchmarks/pythia.yaml
0 → 100644
View file @
16c4afc6
group
:
pythia
task
:
-
lambada_openai
-
wikitext
-
piqa
-
sciq
-
wsc
-
winogrande
-
arc_*
# - logiqa
# - blimp_*
# - hendrycksTest*
lm_eval/tasks/benchmarks/t0_eval.yaml
0 → 100644
View file @
16c4afc6
group
:
t0_eval
task
:
# # Coreference Resolution
# - dataset_path: super_glue
# dataset_name: wsc.fixed
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Coreference Resolution
# - dataset_path: winogrande
# dataset_name: winogrande_xl
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# Natural Language Inference
-
dataset_path
:
super_glue
dataset_name
:
cb
use_prompt
:
promptsource:*
training_split
:
train
validation_split
:
validation
output_type
:
greedy_until
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
# Natural Language Inference
# - dataset_path: super_glue
# dataset_name: rte
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Natural Language Inference
# # - dataset_path: anli
# # use_prompt: promptsource:*
# # training_split: train_r1
# # validation_split: dev_r1
# # Sentence Completion
# - dataset_path: super_glue
# dataset_name: copa
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Natural Language Inference
# - dataset_path: hellaswag
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Word Sense Disambiguation
# - dataset_path: super_glue
# dataset_name: wic
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
lm_eval/tasks/hendrycks_ethics/commonsense.yaml
View file @
16c4afc6
group
:
-
hendrycks_ethics
task
:
ethics_cm
dataset_path
:
hails
/hendrycks_ethics
dataset_path
:
EleutherAI
/hendrycks_ethics
dataset_name
:
commonsense
output_type
:
multiple_choice
training_split
:
train
...
...
lm_eval/tasks/hendrycks_ethics/deontology.yaml
View file @
16c4afc6
include
:
commonsense.yaml
task
:
ethics_deontology
dataset_path
:
hails/hendrycks_ethics
dataset_name
:
deontology
doc_to_text
:
"
Question:
Would
most
people
believe
this
reasonable
or
unreasonable
to
say?
\"
{{scenario}}
{{excuse.rstrip()}}
\"\n
Answer:"
doc_to_target
:
label
...
...
lm_eval/tasks/hendrycks_ethics/justice.yaml
View file @
16c4afc6
...
...
@@ -3,6 +3,5 @@ group:
-
hendrycks_ethics
task
:
ethics_justice
dataset_name
:
justice
output_type
:
multiple_choice
doc_to_text
:
"
Question:
Would
most
people
believe
this
reasonable
or
unreasonable
to
say?
\"
{{scenario}}
\"\n
Answer:"
# TODO: impl. exact match for this and deontology
lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
View file @
16c4afc6
...
...
@@ -2,11 +2,7 @@ include: commonsense.yaml
group
:
-
hendrycks_ethics
task
:
ethics_utilitarianism
dataset_path
:
hails/hendrycks_ethics
dataset_name
:
utilitarianism
output_type
:
multiple_choice
training_split
:
train
test_split
:
test
doc_to_text
:
!function
utils.doc_to_text
doc_to_target
:
!function
utils.doc_to_target
doc_to_choice
:
[
'
no'
,
'
yes'
]
...
...
lm_eval/tasks/lambada/lambada_openai.yaml
View file @
16c4afc6
...
...
@@ -7,7 +7,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name
:
default
output_type
:
loglikelihood
test_split
:
test
template_aliases
:
"
"
doc_to_text
:
"
{{text.split('
')[:-1]|join('
')}}"
doc_to_target
:
"
{{'
'+text.split('
')[-1]}}"
should_decontaminate
:
true
...
...
lm_eval/tasks/lambada/lambada_standard.yaml
View file @
16c4afc6
...
...
@@ -8,7 +8,6 @@ dataset_name: null
output_type
:
loglikelihood
validation_split
:
validation
test_split
:
test
template_aliases
:
"
"
doc_to_text
:
"
{{text.split('
')[:-1]|join('
')}}"
doc_to_target
:
"
{{'
'+text.split('
')[-1]}}"
should_decontaminate
:
true
...
...
lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml
View file @
16c4afc6
...
...
@@ -6,7 +6,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name
:
default
output_type
:
loglikelihood
test_split
:
test
template_aliases
:
"
"
doc_to_text
:
"
{{text.split('
')[:-1]|join('
')}}
____.
->"
doc_to_target
:
"
{{'
'+text.split('
')[-1]}}"
should_decontaminate
:
true
...
...
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
View file @
16c4afc6
...
...
@@ -7,7 +7,6 @@ dataset_name: null
output_type
:
loglikelihood
validation_split
:
validation
test_split
:
test
template_aliases
:
"
"
doc_to_text
:
"
{{text.split('
')[:-1]|join('
')}}
____.
->"
doc_to_target
:
"
{{'
'+text.split('
')[-1]}}"
should_decontaminate
:
true
...
...
lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml
View file @
16c4afc6
...
...
@@ -7,7 +7,6 @@ dataset_path: EleutherAI/lambada_openai
dataset_name
:
en
output_type
:
loglikelihood
test_split
:
test
template_aliases
:
"
"
doc_to_text
:
"
{{text.split('
')[:-1]|join('
')}}"
doc_to_target
:
"
{{'
'+text.split('
')[-1]}}"
should_decontaminate
:
true
...
...
lm_eval/tasks/pile/pile_arxiv.yaml
View file @
16c4afc6
...
...
@@ -3,11 +3,10 @@ group:
-
perplexity
-
loglikelihood_rolling
task
:
pile_arxiv
dataset_path
:
EleutherAI/
the_
pile
dataset_path
:
EleutherAI/pile
dataset_name
:
pile_arxiv
output_type
:
loglikelihood_rolling
test_split
:
train
template_aliases
:
"
"
doc_to_text
:
"
"
doc_to_target
:
"
{{text}}"
should_decontaminate
:
true
...
...
lm_eval/tasks/super_glue/boolq/default.yaml
View file @
16c4afc6
group
:
-
super-glue-lm-eval-v1
task
:
"
boolq
"
task
:
boolq
dataset_path
:
super_glue
dataset_name
:
boolq
output_type
:
multiple_choice
...
...
lm_eval/tasks/super_glue/cb/t5-prompt.yaml
View file @
16c4afc6
...
...
@@ -5,11 +5,15 @@ dataset_path: super_glue
dataset_name
:
cb
training_split
:
train
validation_split
:
validation
output_type
:
greedy_until
doc_to_text
:
"
cb
hypothesis:
{{hypothesis}}
premise
{{premise}}"
doc_to_target
:
"
{%
set
answer_choices
=
['entailment',
'contradiction',
'neutral']
%}{{answer_choices[label]}}"
doc_to_target
:
label
doc_to_choice
:
[
'
entailment'
,
'
contradiction'
,
'
neutral'
]
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
-
metric
:
f1
aggregation
:
!function
"
aggregate.cb_multi_fi"
lm_eval/tasks/super_glue/copa/t5-prompt.yaml
View file @
16c4afc6
...
...
@@ -5,8 +5,10 @@ dataset_path: super_glue
dataset_name
:
copa
training_split
:
train
validation_split
:
validation
output_type
:
greedy_until
doc_to_text
:
"
copa
choice1:
{{choice1}}
choice2:
{{choice2}}
question:
{{question}}"
doc_to_target
:
"
{%
set
answer_choices
=
['False',
'True']
%}{{answer_choices[label]}}"
doc_to_target
:
label
doc_to_choice
:
[
'
False'
,
'
True'
]
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
...
...
lm_eval/tasks/super_glue/multirc/promptsource-00.yaml
deleted
100644 → 0
View file @
7b376ae1
group
:
-
super-glue-promptsource
task
:
"
I
was
going
to
say…"
dataset_path
:
super_glue
dataset_name
:
multirc
training_split
:
train
validation_split
:
validation
use_prompt
:
"
promptsource:I
was
going
to
say…"
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
lm_eval/tasks/super_glue/multirc/promptsource-01.yaml
deleted
100644 → 0
View file @
7b376ae1
include
:
promptsource-00.yaml
group
:
-
super-glue-promptsource
task
:
"
Would
it
be
good
to
answer…"
use_prompt
:
"
promptsource:Would
it
be
good
to
answer…"
Prev
1
2
3
4
5
6
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment