Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
5ddabc29
Commit
5ddabc29
authored
Aug 13, 2023
by
lintangsutawika
Browse files
update and fixes
parent
c523063d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
105 additions
and
77 deletions
+105
-77
lm_eval/benchmarks/t0_eval.yaml
lm_eval/benchmarks/t0_eval.yaml
+102
-75
lm_eval/tasks/__init__.py
lm_eval/tasks/__init__.py
+3
-2
No files found.
lm_eval/benchmarks/t0_eval.yaml
View file @
5ddabc29
group
:
t0_eval
group
:
t0_eval
task
:
task
:
#
#
Coreference Resolution
# Coreference Resolution
#
- dataset_path: super_glue
-
dataset_path
:
super_glue
#
dataset_name: wsc.fixed
dataset_name
:
wsc.fixed
#
use_prompt: promptsource:*
use_prompt
:
promptsource:*
#
training_split: train
training_split
:
train
#
validation_split: validation
validation_split
:
validation
#
metric_list:
metric_list
:
#
- metric: exact_match
-
metric
:
exact_match
#
aggregation: mean
aggregation
:
mean
#
higher_is_better: true
higher_is_better
:
true
#
ignore_case: true
ignore_case
:
true
#
ignore_punctuation: true
ignore_punctuation
:
true
#
#
Coreference Resolution
# Coreference Resolution
#
- dataset_path: winogrande
-
dataset_path
:
winogrande
#
dataset_name: winogrande_xl
dataset_name
:
winogrande_xl
#
use_prompt: promptsource:*
use_prompt
:
promptsource:*
#
training_split: train
training_split
:
train
#
validation_split: validation
validation_split
:
validation
#
metric_list:
metric_list
:
#
- metric: exact_match
-
metric
:
exact_match
#
aggregation: mean
aggregation
:
mean
#
higher_is_better: true
higher_is_better
:
true
#
ignore_case: true
ignore_case
:
true
#
ignore_punctuation: true
ignore_punctuation
:
true
# Natural Language Inference
# Natural Language Inference
-
dataset_path
:
super_glue
-
dataset_path
:
super_glue
dataset_name
:
cb
dataset_name
:
cb
...
@@ -37,55 +37,82 @@ task:
...
@@ -37,55 +37,82 @@ task:
higher_is_better
:
true
higher_is_better
:
true
ignore_case
:
true
ignore_case
:
true
ignore_punctuation
:
true
ignore_punctuation
:
true
-
dataset_path
:
super_glue
dataset_name
:
rte
use_prompt
:
promptsource:*
training_split
:
train
validation_split
:
validation
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
-
task
:
anli_r1
dataset_path
:
anli
use_prompt
:
promptsource:*
training_split
:
train_r1
validation_split
:
dev_r1
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
-
task
:
anli_r2
dataset_path
:
anli
use_prompt
:
promptsource:*
training_split
:
train_r2
validation_split
:
dev_r2
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
-
task
:
anli_r3
dataset_path
:
anli
use_prompt
:
promptsource:*
training_split
:
train_r3
validation_split
:
dev_r3
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
# Sentence Completion
-
dataset_path
:
super_glue
dataset_name
:
copa
use_prompt
:
promptsource:*
training_split
:
train
validation_split
:
validation
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
# Natural Language Inference
# Natural Language Inference
# - dataset_path: super_glue
-
dataset_path
:
hellaswag
# dataset_name: rte
use_prompt
:
promptsource:*
# use_prompt: promptsource:*
training_split
:
train
# training_split: train
validation_split
:
validation
# validation_split: validation
metric_list
:
# metric_list:
-
metric
:
exact_match
# - metric: exact_match
aggregation
:
mean
# aggregation: mean
higher_is_better
:
true
# higher_is_better: true
ignore_case
:
true
# ignore_case: true
ignore_punctuation
:
true
# ignore_punctuation: true
# Word Sense Disambiguation
# # Natural Language Inference
-
dataset_path
:
super_glue
# # - dataset_path: anli
dataset_name
:
wic
# # use_prompt: promptsource:*
use_prompt
:
promptsource:*
# # training_split: train_r1
training_split
:
train
# # validation_split: dev_r1
validation_split
:
validation
# # Sentence Completion
metric_list
:
# - dataset_path: super_glue
-
metric
:
exact_match
# dataset_name: copa
aggregation
:
mean
# use_prompt: promptsource:*
higher_is_better
:
true
# training_split: train
ignore_case
:
true
# validation_split: validation
ignore_punctuation
:
true
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Natural Language Inference
# - dataset_path: hellaswag
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Word Sense Disambiguation
# - dataset_path: super_glue
# dataset_name: wic
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
lm_eval/tasks/__init__.py
View file @
5ddabc29
...
@@ -44,16 +44,17 @@ def check_prompt_config(config):
...
@@ -44,16 +44,17 @@ def check_prompt_config(config):
prompt_list
=
prompts
.
load_prompt_list
(
prompt_list
=
prompts
.
load_prompt_list
(
use_prompt
=
config
[
"use_prompt"
],
use_prompt
=
config
[
"use_prompt"
],
dataset_name
=
config
[
"dataset_path"
],
dataset_name
=
config
[
"dataset_path"
],
subset_name
=
config
[
"dataset_name"
],
subset_name
=
config
[
"dataset_name"
]
if
"dataset_name"
in
config
else
None
,
)
)
for
idx
,
prompt_variation
in
enumerate
(
prompt_list
):
for
idx
,
prompt_variation
in
enumerate
(
prompt_list
):
task_name
=
[
config
[
"task"
]]
if
"task"
in
config
else
[]
all_configs
.
append
(
all_configs
.
append
(
{
{
**
config
,
**
config
,
**
{
"use_prompt"
:
prompt_variation
},
**
{
"use_prompt"
:
prompt_variation
},
**
{
**
{
"task"
:
"_"
.
join
(
"task"
:
"_"
.
join
(
[
task_name
+
[
get_task_name_from_config
(
config
),
get_task_name_from_config
(
config
),
prompt_variation
,
prompt_variation
,
]
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment