gaoqiong / lm-evaluation-harness

Commit af3cccc8
Authored Apr 26, 2022 by Tian Yun
Merge branch 'master' of https://github.com/cjlovering/ps-eh
Parents: 716c87d6, 94218002

Showing 2 changed files with 81 additions and 6 deletions (+81 -6)
lm_eval/base.py     +20 -6
lm_eval/metrics.py  +61 -0
lm_eval/base.py
 import abc
-from typing import Iterable
+from typing import Iterable, Optional
 import promptsource
 import numpy as np
@@ -348,17 +348,25 @@ class BaseLM(LM):
         for context, until in tqdm(reord.get_reordered()):
             if isinstance(until, str):
                 until = [until]
+                max_length = None
+            elif isinstance(until, list) and len(until) == 2:
+                until, max_length = [until[0]], until[1]
+            elif isinstance(until, list):
+                max_length = None

             # TODO: Come back to for generation `eos`.
             primary_until = self.tok_encode(until[0])

             context_enc = torch.tensor(
                 [self.tok_encode(context)[self.max_gen_toks - self.max_length :]]
             ).to(self.device)

+            if max_length is not None:
+                max_length = min(max_length, context_enc.shape[1] + self.max_gen_toks)
+            else:
+                max_length = context_enc.shape[1] + self.max_gen_toks
+
             cont = self._model_generate(
                 context_enc,
-                context_enc.shape[1] + self.max_gen_toks,
+                max_length,
                 torch.tensor(primary_until),
             )
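Taken together, the new branches mean a greedy_until request's `until` payload can be a bare stop string, a plain list of stop strings, or a two-element `[stop, max_length]` pair, and the requested length is clamped to the encoded context length plus `max_gen_toks`. Below is a minimal standalone sketch of that dispatch; it is not part of the commit, and `normalize_until` and `clamp_max_length` are illustrative helper names only.

    from typing import List, Optional, Tuple, Union

    def normalize_until(until: Union[str, list]) -> Tuple[List[str], Optional[int]]:
        # Mirror of the branching above: return (stop_sequences, requested_max_length).
        if isinstance(until, str):
            return [until], None            # bare stop string, no length cap
        if isinstance(until, list) and len(until) == 2:
            return [until[0]], until[1]     # [stop, max_generation_length]
        return list(until), None            # plain list of stop strings

    def clamp_max_length(requested: Optional[int], context_len: int, max_gen_toks: int) -> int:
        # Mirror of the new if/else: never exceed context length + max_gen_toks.
        cap = context_len + max_gen_toks
        return min(requested, cap) if requested is not None else cap

    # e.g. a request built as ["\nQ:", 64] with a 200-token context and max_gen_toks=256:
    stops, requested = normalize_until(["\nQ:", 64])    # (['\nQ:'], 64)
    print(clamp_max_length(requested, 200, 256))        # 64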
@@ -652,7 +660,7 @@ class PromptSourceTask(Task):
         super().__init__(data_dir, cache_dir, download_mode)
         self.prompt = prompt

-    def stopping_criteria(self):
+    def stopping_criteria(self) -> Optional[str]:
         """Denote where the generation should end.

         For example, for coqa, this is '\nQ:' and for drop '.'.
@@ -661,6 +669,10 @@ class PromptSourceTask(Task):
         """
         return None

+    def max_generation_length(self) -> Optional[int]:
+        """Denote where the max length of the generation if it is obvious from the task."""
+        return None
+
     def is_generation_task(self):
         return (
             "BLEU" in self.prompt.metadata.metrics
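These two hooks are meant to be overridden by concrete tasks: `stopping_criteria()` supplies the stop string and `max_generation_length()` an optional token budget, with both defaulting to None. A hypothetical subclass (not part of this commit; the class name and values are made up, and it assumes `from typing import Optional` as in base.py) might look like:

    class ExampleQATask(PromptSourceTask):
        def stopping_criteria(self) -> Optional[str]:
            # Stop once the model starts a new question, coqa-style.
            return "\nQ:"

        def max_generation_length(self) -> Optional[int]:
            # Cap generations for this (made-up) task at 64 tokens.
            return 64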
@@ -718,7 +730,9 @@
                 _requests.append(ll_answer_choice)
         else:
             # TODO(Albert): What is the stop symbol? Is it model specific?
-            cont_request = rf.greedy_until(ctx, [self.stopping_criteria()])
+            cont_request = rf.greedy_until(
+                ctx, [self.stopping_criteria(), self.max_generation_length()]
+            )
             _requests.append(cont_request)

         return _requests
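This pairs with the BaseLM.greedy_until change above: the request now carries a two-element list, so the `len(until) == 2` branch can unpack it into a single stop sequence plus an optional length cap. A tiny round-trip illustration (the values are made up):

    payload = ["\nQ:", 64]      # [self.stopping_criteria(), self.max_generation_length()]
    if isinstance(payload, list) and len(payload) == 2:
        until, max_length = [payload[0]], payload[1]
    print(until, max_length)    # ['\nQ:'] 64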
lm_eval/metrics.py
+import typing
 import math
 from collections.abc import Iterable

 import numpy as np
 import sacrebleu
+from rouge_score import rouge_scorer
 import sklearn.metrics
 import random
@@ -184,6 +186,65 @@ def _sacreformat(refs, preds):
     return refs, preds


+def rouge(
+    refs: typing.List[str],
+    pred: str,
+    rouge_types: typing.List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"],
+):
+    """ ROUGE with multi-reference support
+
+    Implementation based on GEM-metrics:
+    https://github.com/GEM-benchmark/GEM-metrics/blob/431a8174bd6b3637e8d6118bfad2983e39e99733/gem_metrics/rouge.py
+
+    :param refs:
+        A `list` of reference `str`s.
+    :param pred:
+        A single prediction `str`s.
+    """
+    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
+    # ROUGE multi-ref jackknifing
+    if len(refs) > 1:
+        cur_scores = [scorer.score(ref, pred) for ref in refs]
+
+        # get best score for all leave-one-out sets
+        best_scores = []
+        for leave in range(len(refs)):
+            cur_scores_leave_one = [
+                cur_scores[s] for s in range(len(refs)) if s != leave
+            ]
+            best_scores.append(
+                {
+                    rouge_type: max(
+                        [s[rouge_type] for s in cur_scores_leave_one],
+                        key=lambda s: s.fmeasure,
+                    )
+                    for rouge_type in rouge_types
+                }
+            )
+
+        # average the leave-one-out bests to produce the final score
+        score = {
+            rouge_type: rouge_scorer.scoring.Score(
+                np.mean([b[rouge_type].precision for b in best_scores]),
+                np.mean([b[rouge_type].recall for b in best_scores]),
+                np.mean([b[rouge_type].fmeasure for b in best_scores]),
+            )
+            for rouge_type in rouge_types
+        }
+    else:
+        score = scorer.score(refs[0], pred)
+
+    # convert the named tuples to plain nested dicts
+    score = {
+        rouge_type: {
+            "precision": score[rouge_type].precision,
+            "recall": score[rouge_type].recall,
+            "fmeasure": score[rouge_type].fmeasure,
+        }
+        for rouge_type in rouge_types
+    }
+    return score
+

 # stderr stuff
 class _bootstrap_internal:
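For reference, a small usage sketch of the new metric; it is not part of the commit, the example strings are made up, and it assumes the package is importable as `lm_eval.metrics`.

    from lm_eval.metrics import rouge

    refs = [
        "The cat sat on the mat.",
        "A cat was sitting on the mat.",
    ]
    pred = "The cat is sitting on the mat."

    scores = rouge(refs, pred)                  # multi-reference path: jackknifed scores
    print(sorted(scores))                       # ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    print(scores["rougeL"]["fmeasure"])         # a float in [0, 1]

    single = rouge([refs[0]], pred)             # single reference: plain RougeScorer.score

The multi-reference path follows GEM-metrics' jackknifing: for each leave-one-out subset of references it keeps the best score by F-measure, then averages those bests across subsets.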