gaoqiong / lm-evaluation-harness / Commits / cf8a257c

Commit cf8a257c authored Dec 18, 2024 by Baber

fix metrics

parent 76e517d1
Changes (2 files)

Showing 2 changed files with 14 additions and 6 deletions (+14 -6)

lm_eval/api/task.py                  +5 -4
lm_eval/tasks/longbench/metrics.py   +9 -2
lm_eval/api/task.py

@@ -322,10 +322,11 @@ class Task(abc.ABC):
         elif self.has_validation_docs():
             return self.validation_docs()
         else:
-            eval_logger.warning(
-                f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False"
-                ", using test_docs as fewshot_docs but this is not recommended."
-            )
+            if self.config.get("num_fewshot", 0) > 0:
+                eval_logger.warning(
+                    f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False"
+                    ", using test_docs as fewshot_docs but this is not recommended."
+                )
             return self.test_docs()

     def _process_doc(self, doc: dict) -> dict:
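The task.py hunk only gates the existing warning: when a task has neither a training nor a validation split, it still falls back to the test split for few-shot sources, but it no longer warns unless few-shot examples were actually requested (num_fewshot > 0). Below is a minimal, hypothetical sketch of that control flow; DemoTask, fewshot_source_docs, and the plain-dict config are illustrative stand-ins rather than the harness's real API.

import logging

logging.basicConfig(level=logging.WARNING)
eval_logger = logging.getLogger("lm-eval")


class DemoTask:
    """Illustrative stand-in for the harness's Task class, not its real API."""

    def __init__(self, config: dict):
        self.config = config  # plain dict standing in for the real config object

    def has_training_docs(self) -> bool:
        return False

    def has_validation_docs(self) -> bool:
        return False

    def test_docs(self) -> list:
        return [{"question": "2 + 2 = ?", "answer": "4"}]

    def fewshot_source_docs(self) -> list:
        # Mirrors the patched control flow from lm_eval/api/task.py.
        if self.has_training_docs():
            return []  # training split preferred when it exists
        elif self.has_validation_docs():
            return []  # then the validation split
        else:
            # After this commit: warn only when few-shot examples are requested.
            if self.config.get("num_fewshot", 0) > 0:
                eval_logger.warning(
                    f"[Task: {self.config.get('task')}] has_training_docs and "
                    "has_validation_docs are False, using test_docs as "
                    "fewshot_docs but this is not recommended."
                )
            return self.test_docs()


DemoTask({"task": "demo", "num_fewshot": 0}).fewshot_source_docs()  # stays quiet
DemoTask({"task": "demo", "num_fewshot": 2}).fewshot_source_docs()  # emits the warning

With num_fewshot left at 0 the fallback happens silently, which avoids noisy warnings for zero-shot runs on test-only tasks.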
lm_eval/tasks/longbench/metrics.py

@@ -134,7 +134,10 @@ def rouge_zh_score(predictions: list[str], references: list[str], **kwargs) -> float:
 def f1_score(predictions: list[str], references: list[str], **kwargs):
-    prediction, ground_truth = predictions[0], references[0]
+    try:
+        prediction, ground_truth = predictions[0], references[0]
+    except:
+        return 0.0
     common = Counter(prediction) & Counter(ground_truth)
     num_same = sum(common.values())
     if num_same == 0:

@@ -152,7 +155,11 @@ def qa_f1_score(predictions: list[str], references: list[str], **kwargs) -> float:
     prediction_tokens = normalized_prediction.split()
     ground_truth_tokens = normalized_ground_truth.split()
-    return f1_score(prediction_tokens, ground_truth_tokens)
+    try:
+        res = f1_score(prediction_tokens, ground_truth_tokens)
+    except:
+        return 0.0
+    return res

 def qa_f1_zh_score(predictions: list[str], references: list[str], **kwargs) -> float:
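Both metrics.py hunks follow the same pattern: let a malformed sample degrade to a score of 0.0 instead of raising and aborting the evaluation. Indexing predictions[0] on an empty list raises IndexError, and the downstream f1_score call can likewise fail on degenerate inputs. The sketch below is a stand-alone, hypothetical copy of the guarded metric, not the harness's actual module; the precision/recall tail after "if num_same == 0:" lies outside the changed hunk and is assumed here to be the standard token-overlap F1.

from collections import Counter


def f1_guarded(predictions: list, references: list, **kwargs) -> float:
    # Stand-alone copy of the guarded metric; illustrative, not the harness module.
    try:
        # Empty predictions/references used to raise IndexError and abort the
        # run; after the patch such a sample simply scores 0.0.
        prediction, ground_truth = predictions[0], references[0]
    except Exception:
        return 0.0
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    # Assumed continuation (outside the changed hunk): standard overlap F1.
    precision = num_same / len(prediction)
    recall = num_same / len(ground_truth)
    return 2 * precision * recall / (precision + recall)


print(f1_guarded([], []))                            # 0.0 instead of IndexError
print(f1_guarded([list("paris")], [list("paris")]))  # 1.0 on identical token lists

The second hunk applies the same guard one level up in qa_f1_score, so any exception raised inside the scorer is likewise converted into a 0.0 for that sample. One design note: the bare "except:" used in the diff also swallows KeyboardInterrupt and SystemExit; "except Exception:", as in the sketch above, is the narrower and more conventional spelling.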