gaoqiong / lm-evaluation-harness · Commits

Commit 4e5a328e, authored Jul 06, 2023 by haileyschoelkopf

    bugfixes + add write_out

Parent: ff8903f2
Showing 3 changed files with 24 additions and 18 deletions:
    lm_eval/api/metrics.py   +0  -7
    lm_eval/api/task.py      +2  -2
    lm_eval/evaluator.py     +22 -9
lm_eval/api/metrics.py

```diff
@@ -362,10 +362,3 @@ def stderr_for_metric(metric, bootstrap_iters):
     stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
 
     return stderr.get(metric, None)
-
-
-def yesno(x):
-    if x:
-        return "yes"
-    else:
-        return "no"
```
lm_eval/api/task.py

```diff
@@ -63,7 +63,7 @@ class TaskConfig(dict):
     fewshot_split: str = None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
-    template_aliases: str = None
+    template_aliases: str = ""
     doc_to_text: Union[Callable, str] = None
     doc_to_target: Union[Callable, str] = None
     gold_alias: Union[Callable, str] = None
@@ -89,7 +89,7 @@ class TaskConfig(dict):
         # allow user-specified aliases so that users can
         # force prompt-compatibility for some prompt regardless of
         # field names in prompt
-        if self.template_aliases is not None:
+        if type(self.template_aliases) == str:
             if type(self.doc_to_text) == str:
                 self.doc_to_text = self.template_aliases + self.doc_to_text
```
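The likely intent of both changes (my reading, not stated in the commit message): with a `""` default, prepending `template_aliases` to a string `doc_to_text` is always a safe no-op, whereas the old `None` default made the concatenation crash on any code path that skipped the `is not None` guard. A toy reproduction under that assumption:

```python
from dataclasses import dataclass
from typing import Callable, Optional, Union

@dataclass
class TaskConfigSketch:
    # hypothetical stand-in for lm_eval's TaskConfig, trimmed to the
    # two fields this commit touches
    template_aliases: str = ""  # was: None
    doc_to_text: Optional[Union[Callable, str]] = None

    def __post_init__(self):
        # new guard from the hunk: only prepend when both sides are strings
        if type(self.template_aliases) == str:
            if type(self.doc_to_text) == str:
                self.doc_to_text = self.template_aliases + self.doc_to_text

cfg = TaskConfigSketch(doc_to_text="{{question}}")
print(cfg.doc_to_text)  # "{{question}}" -- "" + s is a no-op

# with the old None default, an unguarded prepend would raise:
# None + "{{question}}"  ->  TypeError
```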
lm_eval/evaluator.py

```diff
@@ -199,6 +199,19 @@ def evaluate(
         task.build_all_requests(
             limit=limit, rank=lm.rank, world_size=lm.world_size
         )
+        eval_logger.info(
+            f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
+        )
+
+        if write_out:
+            for inst in task.instances:
+                # print the prompt for the first few documents
+                if inst.doc_id < 4:
+                    print(
+                        f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\n{inst.args[0]}\n(end of prompt on previous line)"
+                    )
+                    print("Request:", inst)
+
         # aggregate Instances by LM method requested to get output.
         reqtype = (
             "loglikelihood"
```
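The `write_out` flag added here is a debugging aid: it echoes the fully rendered context for the first few documents of every task. A standalone sketch of the same filtering logic, where `Inst` is a hypothetical stand-in for `lm_eval.api.instance.Instance` (whose `args[0]` the hunk treats as the context string):

```python
from dataclasses import dataclass

@dataclass
class Inst:
    # hypothetical stand-in for lm_eval.api.instance.Instance:
    # args[0] holds the fully rendered context prompt
    doc_id: int
    args: tuple

def write_out_prompts(task_name, instances, n=4):
    # mirror the commit's behavior: show prompts for the first n documents
    for inst in instances:
        if inst.doc_id < n:
            print(
                f"Task: {task_name}; document {inst.doc_id}; "
                f"context prompt (starting on next line):\n{inst.args[0]}\n"
                f"(end of prompt on previous line)"
            )
            print("Request:", inst)

write_out_prompts("toy_task", [Inst(0, ("Q: 2+2=?\nA:",)), Inst(5, ("skipped",))])
```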
Further down in `evaluate`, the stderr computation is now gated on `bootstrap_iters`:

```diff
@@ -338,16 +351,16 @@ def evaluate(
                 # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
                 # so we run them less iterations. still looking for a cleaner way to do this
-                stderr = lm_eval.api.metrics.stderr_for_metric(
-                    metric=task.aggregation()[metric],
-                    bootstrap_iters=min(bootstrap_iters, 1000)
-                    if metric in ["bleu", "chrf", "ter"]
-                    else bootstrap_iters,
-                )
-
-                if stderr is not None:
-                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)
+                if bootstrap_iters > 0:
+                    stderr = lm_eval.api.metrics.stderr_for_metric(
+                        metric=task.aggregation()[metric],
+                        bootstrap_iters=min(bootstrap_iters, 1000)
+                        if metric in ["bleu", "chrf", "ter"]
+                        else bootstrap_iters,
+                    )
+                    if stderr is not None:
+                        results[task_name][metric + "_stderr" + "," + key] = stderr(items)
 
     return {
         "results": dict(results),
```
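The net effect of this hunk is that `bootstrap_iters=0` becomes an explicit off-switch for stderr estimation, while the existing hotfix capping bleu/chrf/ter at 1,000 bootstrap iterations is kept. A small sketch of just that gating arithmetic (the helper name `effective_iters` is mine):

```python
def effective_iters(metric, bootstrap_iters):
    # cap the expensive corpus-level metrics at 1000 bootstrap iterations,
    # as in the commit's hotfix; everything else uses the caller's value
    if metric in ["bleu", "chrf", "ter"]:
        return min(bootstrap_iters, 1000)
    return bootstrap_iters

for metric, iters in [("acc", 100000), ("bleu", 100000), ("ter", 500)]:
    if iters > 0:  # new guard: 0 skips stderr computation entirely
        print(metric, effective_iters(metric, iters))
# acc 100000 / bleu 1000 / ter 500
```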