gaoqiong / lm-evaluation-harness
Commit 85ffb7fc, authored Oct 06, 2021 by Leo Gao

Add underscores to metrics

parent 0538d4c9
Showing 2 changed files with 46 additions and 46 deletions:

lm_eval/tasks/truthfulqa.py                  +45 -45
tests/testdata/truthfulqa_gen-v0-res.json     +1  -1
lm_eval/tasks/truthfulqa.py
@@ -294,69 +294,69 @@ class TruthfulQAGeneration(Task):
         rougeL_acc = int(rougeL_correct > rougeL_incorrect)

         return {
-            "bleurt max": bleurt_max,
-            "bleurt acc": bleurt_acc,
-            "bleurt diff": bleurt_diff,
+            "bleurt_max": bleurt_max,
+            "bleurt_acc": bleurt_acc,
+            "bleurt_diff": bleurt_diff,
-            "bleu max": bleu_max,
-            "bleu acc": bleu_acc,
-            "bleu diff": bleu_diff,
+            "bleu_max": bleu_max,
+            "bleu_acc": bleu_acc,
+            "bleu_diff": bleu_diff,
-            "rouge1 max": rouge1_max,
-            "rouge1 acc": rouge1_acc,
-            "rouge1 diff": rouge1_diff,
+            "rouge1_max": rouge1_max,
+            "rouge1_acc": rouge1_acc,
+            "rouge1_diff": rouge1_diff,
-            "rouge2 max": rouge2_max,
-            "rouge2 acc": rouge2_acc,
-            "rouge2 diff": rouge2_diff,
+            "rouge2_max": rouge2_max,
+            "rouge2_acc": rouge2_acc,
+            "rouge2_diff": rouge2_diff,
-            "rougeL max": rougeL_max,
-            "rougeL acc": rougeL_acc,
-            "rougeL diff": rougeL_diff,
+            "rougeL_max": rougeL_max,
+            "rougeL_acc": rougeL_acc,
+            "rougeL_diff": rougeL_diff,
         }

     def aggregation(self):
         return {
-            "bleurt max": mean,
-            "bleurt acc": mean,
-            "bleurt diff": mean,
+            "bleurt_max": mean,
+            "bleurt_acc": mean,
+            "bleurt_diff": mean,
-            "bleu max": mean,
-            "bleu acc": mean,
-            "bleu diff": mean,
+            "bleu_max": mean,
+            "bleu_acc": mean,
+            "bleu_diff": mean,
-            "rouge1 max": mean,
-            "rouge1 acc": mean,
-            "rouge1 diff": mean,
+            "rouge1_max": mean,
+            "rouge1_acc": mean,
+            "rouge1_diff": mean,
-            "rouge2 max": mean,
-            "rouge2 acc": mean,
-            "rouge2 diff": mean,
+            "rouge2_max": mean,
+            "rouge2_acc": mean,
+            "rouge2_diff": mean,
-            "rougeL max": mean,
-            "rougeL acc": mean,
-            "rougeL diff": mean,
+            "rougeL_max": mean,
+            "rougeL_acc": mean,
+            "rougeL_diff": mean,
         }

     def higher_is_better(self):
         return {
-            "bleurt max": True,
-            "bleurt acc": True,
-            "bleurt diff": True,
+            "bleurt_max": True,
+            "bleurt_acc": True,
+            "bleurt_diff": True,
-            "bleu max": True,
-            "bleu acc": True,
-            "bleu diff": True,
+            "bleu_max": True,
+            "bleu_acc": True,
+            "bleu_diff": True,
-            "rouge1 max": True,
-            "rouge1 acc": True,
-            "rouge1 diff": True,
+            "rouge1_max": True,
+            "rouge1_acc": True,
+            "rouge1_diff": True,
-            "rouge2 max": True,
-            "rouge2 acc": True,
-            "rouge2 diff": True,
+            "rouge2_max": True,
+            "rouge2_acc": True,
+            "rouge2_diff": True,
-            "rougeL max": True,
-            "rougeL acc": True,
-            "rougeL diff": True,
+            "rougeL_max": True,
+            "rougeL_acc": True,
+            "rougeL_diff": True,
         }
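Why the test fixture below changes in lockstep: in lm-evaluation-harness, the keys returned by process_results() are looked up in the dicts returned by aggregation() and higher_is_better(), so all three methods (and any recorded results) must agree on the renamed, underscored metric names. Below is a minimal sketch of that relationship; the function name aggregate_task_metrics and the arguments docs and model_results are illustrative assumptions, not the harness's actual evaluator code.

from collections import defaultdict

def aggregate_task_metrics(task, docs, model_results):
    # Collect per-example metric values under the (now underscored) keys
    # returned by process_results(), e.g. "bleurt_max", "rougeL_acc".
    per_metric = defaultdict(list)
    for doc, result in zip(docs, model_results):
        for key, value in task.process_results(doc, result).items():
            per_metric[key].append(value)

    # aggregation() maps each of those same keys to a reduction function
    # (mean for every TruthfulQA generation metric in this commit).
    aggregations = task.aggregation()
    return {key: aggregations[key](values) for key, values in per_metric.items()}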
tests/testdata/truthfulqa_gen-v0-res.json
@@ -1 +1 @@
-{"results": {"truthfulqa_gen": {"bleu acc": 0.0, "bleu acc_stderr": 0.0, "bleu diff": 0.0, "bleu diff_stderr": 0.0, "bleu max": 0.0, "bleu max_stderr": 0.0, "bleurt acc": 0.835985312117503, "bleurt acc_stderr": 0.012962704327492454, "bleurt diff": 0.14077322143090107, "bleurt diff_stderr": 0.005459888909582694, "bleurt max": -1.4399358725752065, "bleurt max_stderr": 0.0022126992369197133, "rouge1 acc": 0.0, "rouge1 acc_stderr": 0.0, "rouge1 diff": 0.0, "rouge1 diff_stderr": 0.0, "rouge1 max": 0.0, "rouge1 max_stderr": 0.0, "rouge2 acc": 0.0, "rouge2 acc_stderr": 0.0, "rouge2 diff": 0.0, "rouge2 diff_stderr": 0.0, "rouge2 max": 0.0, "rouge2 max_stderr": 0.0, "rougeL acc": 0.0, "rougeL acc_stderr": 0.0, "rougeL diff": 0.0, "rougeL diff_stderr": 0.0, "rougeL max": 0.0, "rougeL max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 0}}
\ No newline at end of file
+{"results": {"truthfulqa_gen": {"bleu_acc": 0.0, "bleu_acc_stderr": 0.0, "bleu_diff": 0.0, "bleu_diff_stderr": 0.0, "bleu_max": 0.0, "bleu_max_stderr": 0.0, "bleurt_acc": 0.835985312117503, "bleurt_acc_stderr": 0.012962704327492454, "bleurt_diff": 0.14077322143090107, "bleurt_diff_stderr": 0.005459888909582694, "bleurt_max": -1.4399358725752065, "bleurt_max_stderr": 0.0022126992369197133, "rouge1_acc": 0.0, "rouge1_acc_stderr": 0.0, "rouge1_diff": 0.0, "rouge1_diff_stderr": 0.0, "rouge1_max": 0.0, "rouge1_max_stderr": 0.0, "rouge2_acc": 0.0, "rouge2_acc_stderr": 0.0, "rouge2_diff": 0.0, "rouge2_diff_stderr": 0.0, "rouge2_max": 0.0, "rouge2_max_stderr": 0.0, "rougeL_acc": 0.0, "rougeL_acc_stderr": 0.0, "rougeL_diff": 0.0, "rougeL_diff_stderr": 0.0, "rougeL_max": 0.0, "rougeL_max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 0}}
\ No newline at end of file