Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
135b41ce
Commit
135b41ce
authored
Aug 15, 2023
by
lintangsutawika
Browse files
format
parent
0436b5d6
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
21 deletions
+23
-21
lm_eval/tasks/truthfulqa/utils.py
lm_eval/tasks/truthfulqa/utils.py
+23
-21
No files found.
lm_eval/tasks/truthfulqa/utils.py
View file @
135b41ce
...
...
@@ -4,6 +4,7 @@ import numpy as np
from
rouge_score
import
rouge_scorer
,
scoring
def
process_results_mc2
(
doc
,
results
):
lls
,
is_greedy
=
zip
(
*
results
)
...
...
@@ -24,7 +25,6 @@ def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:
def
preprocess_function
(
examples
):
def
_format_answers
(
answers
):
formatted_answers
=
[]
for
answer
in
answers
:
...
...
@@ -121,26 +121,27 @@ def process_results_gen(doc, results):
def bleu(refs, preds):
    """
    Compute corpus-level BLEU the way the `t5` evaluation code does. See:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    # Fixed keyword arguments mirror the t5 metric settings exactly
    # (international tokenizer, exponential smoothing, no lowercasing).
    result = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    )
    return result.score
def
rouge
(
refs
,
preds
):
"""
...
...
@@ -169,6 +170,7 @@ def rouge(refs, preds):
result
=
aggregator
.
aggregate
()
return
{
type
:
result
[
type
].
mid
.
fmeasure
*
100
for
type
in
rouge_types
}
# def bleurt_max(predictions, references):
# pass
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment