Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
e56b950a
Commit
e56b950a
authored
May 19, 2023
by
lintangsutawika
Browse files
able to use prompts from promptsource
parent
1c7521a0
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
101 additions
and
36 deletions
+101
-36
lm_eval/api/task.py
lm_eval/api/task.py
+27
-22
lm_eval/prompts/__init__.py
lm_eval/prompts/__init__.py
+32
-14
lm_eval/tasks/super_glue/wsc.fixed/template-00.yaml
lm_eval/tasks/super_glue/wsc.fixed/template-00.yaml
+14
-0
lm_eval/tasks/super_glue/wsc.fixed/template-01.yaml
lm_eval/tasks/super_glue/wsc.fixed/template-01.yaml
+14
-0
lm_eval/tasks/super_glue/wsc.fixed/template-02.yaml
lm_eval/tasks/super_glue/wsc.fixed/template-02.yaml
+14
-0
No files found.
lm_eval/api/task.py
View file @
e56b950a
...
...
@@ -228,7 +228,7 @@ class Task(abc.ABC):
return
self
.
validation_docs
()
else
:
eval_logger
.
warning
(
"has_training_docs and has_validation_docs are False"
,
"has_training_docs and has_validation_docs are False"
"using test_docs but this is not recommended."
)
return
self
.
test_docs
()
...
...
@@ -519,7 +519,19 @@ class ConfigurableTask(Task):
[[
"take_first"
,
None
]]
)
]
if
self
.
_config
.
use_prompt
is
not
None
:
eval_logger
.
info
(
f
"loading prompt
{
self
.
_config
.
use_prompt
}
"
)
self
.
prompt
=
get_prompt
(
self
.
_config
.
use_prompt
,
self
.
DATASET_PATH
,
self
.
DATASET_NAME
)
else
:
self
.
prompt
=
None
if
self
.
fewshot_docs
()
!=
None
:
self
.
sampler
=
samplers
.
Sampler
(
list
(
self
.
fewshot_docs
()),
self
,
rnd
=
random
.
Random
())
# TODO: pass the correct docs in here
...
...
@@ -583,42 +595,35 @@ class ConfigurableTask(Task):
return
doc
def
doc_to_text
(
self
,
doc
):
if
self
.
_config
.
use_prompt
is
not
None
:
doc_to_text
=
get_prompt
(
self
.
_config
.
use_prompt
,
self
.
DATASET_NAME
,
self
.
DATASET_PATH
)
if
self
.
prompt
is
not
None
:
doc_to_text
=
self
.
prompt
else
:
doc_to_text
=
self
.
_config
.
doc_to_text
if
type
(
doc_to_text
)
==
str
:
return
utils
.
apply_template
(
doc_to_text
,
doc
)
elif
callable
(
doc_to_text
):
if
hasattr
(
doc_to_text
,
"apply"
):
return
doc_to_text
.
apply
(
doc
)[
0
]
else
:
return
doc_to_text
(
doc
)
return
doc_to_text
(
doc
)
if
hasattr
(
doc_to_text
,
"apply"
):
return
doc_to_text
.
apply
(
doc
)[
0
]
else
:
print
(
type
(
doc_to_text
))
raise
TypeError
def
doc_to_target
(
self
,
doc
):
if
self
.
_config
.
use_prompt
is
not
None
:
doc_to_target
=
get_prompt
(
self
.
_config
.
use_prompt
,
self
.
DATASET_NAME
,
self
.
DATASET_PATH
)
if
self
.
prompt
is
not
None
:
doc_to_target
=
self
.
prompt
else
:
doc_to_target
=
self
.
_config
.
doc_to_target
if
type
(
doc_to_target
)
==
str
:
return
utils
.
apply_template
(
doc_to_target
,
doc
)
elif
callable
(
doc_to_target
):
if
hasattr
(
doc_to_target
,
"apply"
):
return
doc_to_target
.
apply
(
doc
)[
1
]
else
:
return
doc_to_target
(
doc
)
return
doc_to_target
(
doc
)
elif
hasattr
(
doc_to_target
,
"apply"
):
return
doc_to_target
.
apply
(
doc
)[
1
]
else
:
raise
TypeError
...
...
lm_eval/prompts/__init__.py
View file @
e56b950a
from
lm_eval.logger
import
eval_logger
from
promptsource.templates
import
DatasetTemplates
# TODO: decide whether we want jinja2 or f-string prompts. would it be cursed to support both?
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
...
...
@@ -10,20 +13,35 @@ PROMPT_REGISTRY = {
},
}
def
get_prompt
(
prompt_id
:
str
,
dataset_name
=
None
,
dataset_path
=
None
):
def
get_prompt
(
prompt_id
:
str
,
dataset_name
=
None
,
subset_name
=
None
):
# unpack prompt name
category_name
,
prompt_name
=
prompt_id
.
split
(
":"
)
if
category_name
==
"promptsource"
:
from
promptsource.templates
import
DatasetTemplates
if
prompt_name
in
prompts
.
all_template_names
:
prompts
=
DatasetTemplates
(
dataset_name
,
dataset_path
)
return
prompts
[
prompt_name
]
category_name
,
prompt_name
=
prompt_id
.
split
(
":"
)
eval_logger
.
info
(
f
"Loading prompt from
{
category_name
}
"
)
if
category_name
==
"promptsource"
:
try
:
# prompts = DatasetTemplates(dataset_name, dataset_path)
if
subset_name
==
None
:
prompts
=
DatasetTemplates
(
dataset_name
=
dataset_name
)
else
:
prompts
=
DatasetTemplates
(
dataset_name
=
dataset_name
,
subset_name
=
subset_name
)
except
:
raise
ValueError
(
f
"
{
dataset_name
}
and
{
subset_name
}
not found"
)
if
prompt_name
in
prompts
.
all_template_names
:
return
prompts
[
prompt_name
]
else
:
try
:
return
PROMPT_REGISTRY
[
category_name
][
prompt_name
]
except
:
raise
ValueError
(
f
"expected only a single `:` as separator between
\
prompt category and name, but got `
{
prompt_id
}
` instead"
)
raise
ValueError
(
f
"
{
prompt_name
}
not in prompt list
{
prompts
.
all_template_names
}
"
)
else
:
try
:
return
PROMPT_REGISTRY
[
category_name
][
prompt_name
]
except
:
raise
ValueError
(
f
"expected only a single `:` as separator between
\
prompt category and name, but got `
{
prompt_id
}
` instead"
)
lm_eval/tasks/super_glue/wsc.fixed/template-00.yaml
0 → 100644
View file @
e56b950a
group
:
-
t0-eval
task
:
"
does
the
pronoun
refer
to"
dataset_path
:
super_glue
dataset_name
:
wsc.fixed
training_split
:
train
validation_split
:
validation
use_prompt
:
"
promptsource:does
the
pronoun
refer
to"
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
lm_eval/tasks/super_glue/wsc.fixed/template-01.yaml
0 → 100644
View file @
e56b950a
group
:
-
t0-eval
task
:
"
by
p
they
mean"
dataset_path
:
super_glue
dataset_name
:
wsc.fixed
training_split
:
train
validation_split
:
validation
use_prompt
:
"
promptsource:by
p
they
mean"
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
lm_eval/tasks/super_glue/wsc.fixed/template-02.yaml
0 → 100644
View file @
e56b950a
group
:
-
t0-eval
task
:
"
in
other
words"
dataset_path
:
super_glue
dataset_name
:
wsc.fixed
training_split
:
train
validation_split
:
validation
use_prompt
:
"
promptsource:in
other
words"
metric_list
:
-
metric
:
exact_match
aggregation
:
mean
higher_is_better
:
true
ignore_case
:
true
ignore_punctuation
:
true
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment