gaoqiong / lm-evaluation-harness / Commits

Commit fc329d31, authored Aug 15, 2023 by lintangsutawika (parent 60ca1a27)

Commit message: update
Showing 3 changed files with 42 additions and 74 deletions:

  lm_eval/tasks/squadv2/default.yaml  (+24, -24)
  lm_eval/tasks/squadv2/metric.py     (+0, -39)
  lm_eval/tasks/squadv2/utils.py      (+18, -11)
lm_eval/tasks/squadv2/default.yaml

@@ -21,27 +21,27 @@ filter_list:
       - function: remove_whitespace
       - function: take_first
 metric_list:
-  - metric: !function metric.exact
-    aggregation: mean
-    higher_is_better: true
-  - metric: !function metric.f1
-    aggregation: mean
-    higher_is_better: true
-  - metric: !function metric.HasAns_exact
-    aggregation: mean
-    higher_is_better: true
-  - metric: !function metric.HasAns_f1
-    aggregation: mean
-    higher_is_better: true
-  - metric: !function metric.NoAns_exact
-    aggregation: mean
-    higher_is_better: true
-  - metric: !function metric.NoAns_f1
-    aggregation: mean
-    higher_is_better: true
-  - metric: !function metric.best_exact
-    aggregation: mean
-    higher_is_better: true
-  - metric: !function metric.best_f1
-    aggregation: mean
-    higher_is_better: true
+  - metric: exact
+    aggregation: !function utils.exact
+    higher_is_better: true
+  # - metric: f1
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: HasAns_exact
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: HasAns_f1
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: NoAns_exact
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: NoAns_f1
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: best_exact
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: best_f1
+  #   aggregation: mean
+  #   higher_is_better: true
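
In lm-evaluation-harness YAML, `aggregation: !function utils.exact` tells the harness to import `exact` from the task's utils.py and use it to aggregate the metric: process_results emits one value per document under the metric key, and the aggregation callable later reduces the collected list to a single corpus-level number. Below is a minimal, self-contained sketch of that contract; the function bodies (including the toy string-equality scoring used instead of the real squad_v2 scorer) are illustrative assumptions, not the harness's or this task's actual implementation.

def process_results(doc, results):
    # one entry per document, keyed by the metric name used in the YAML
    prediction = results[0].strip()
    reference = doc["answers"]["text"][0] if doc["answers"]["text"] else ""
    return {"exact": (prediction, reference)}

def exact(items):
    # aggregation callable: receives the list of all per-document values
    predictions, references = zip(*items)
    return sum(p == r for p, r in zip(predictions, references)) / len(items)

# toy end-to-end run of the contract
docs = [{"answers": {"text": ["Paris"]}}, {"answers": {"text": ["Berlin"]}}]
model_outputs = [["Paris"], ["London"]]
items = [process_results(d, o)["exact"] for d, o in zip(docs, model_outputs)]
print(exact(items))  # 0.5
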
lm_eval/tasks/squadv2/metric.py (deleted, 100644 → 0)

import evaluate
from functools import partial


def _squad_metric(predictions, references):
    squad_metric = evaluate.load("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)


# Exact match (the normalized answer exactly match the gold answer)
def exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("exact", 0)


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("f1", 0)


# Exact match (the normalized answer exactly match the gold answer)
def HasAns_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("HasAns_exact", 0)


# The F-score of predicted tokens versus the gold answer
def HasAns_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("HasAns_f1", 0)


# Exact match (the normalized answer exactly match the gold answer)
def NoAns_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("NoAns_exact", 0)


# The F-score of predicted tokens versus the gold answer
def NoAns_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("NoAns_f1", 0)


# Best exact match (with varying threshold)
def best_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("best_exact", 0)


# Best F1 (with varying threshold)
def best_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("best_f1", 0)
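
The deleted helpers all delegated to the squad_v2 scorer from the HuggingFace evaluate library, which is what produces keys such as HasAns_exact, NoAns_f1, and best_f1. For context, a usage sketch of that scorer; the example id and values are illustrative, not taken from the commit.

import evaluate

squad_v2 = evaluate.load("squad_v2")

predictions = [
    {"id": "q1", "prediction_text": "Normandy", "no_answer_probability": 0.0},
]
references = [
    {"id": "q1", "answers": {"text": ["Normandy"], "answer_start": [159]}},
]

result = squad_v2.compute(predictions=predictions, references=references)
# result is a dict with keys such as "exact", "f1", "HasAns_exact",
# "HasAns_f1", "NoAns_exact", "NoAns_f1", "best_exact", "best_f1", "total"
print(result["exact"], result["f1"])
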
lm_eval/tasks/squadv2/utils.py

@@ -20,8 +20,12 @@ def process_results(doc, results):
         "answers": doc["answers"],
     }
-    print(_squad_metric(predictions, references))
-    return _squad_metric(predictions, references)
+    return {"predictions": predictions, "reference": references}
+    # return _squad_metric([predictions], [references])
+    # return {key: value if key in metrics for key, value in score.items()}

 def _squad_metric(predictions, references):
@@ -29,33 +33,36 @@ def _squad_metric(predictions, references):
     return squad_metric.compute(predictions=predictions, references=references)

 # Exact match (the normalized answer exactly match the gold answer)
-def exact(predictions, references):
-    return _squad_metric(predictions=predictions, references=references).get("exact", 0)
+def exact(items):
+    print(items)
+    import sys; sys.exit()
+    predictions, references = zip(*items)
+    return _squad_metric(predictions=predictions, references=references)["exact"]

 # The F-score of predicted tokens versus the gold answer
 def f1(predictions, references):
-    return _squad_metric(predictions=predictions, references=references).get("f1", 0)
+    return _squad_metric(predictions=predictions, references=references)["f1"]

 # Exact match (the normalized answer exactly match the gold answer)
 def HasAns_exact(predictions, references):
-    return _squad_metric(predictions=predictions, references=references).get("HasAns_exact", 0)
+    return _squad_metric(predictions=predictions, references=references)["HasAns_exact"]

 # The F-score of predicted tokens versus the gold answer
 def HasAns_f1(predictions, references):
-    return _squad_metric(predictions=predictions, references=references).get("HasAns_f1", 0)
+    return _squad_metric(predictions=predictions, references=references)["HasAns_f1"]

 # Exact match (the normalized answer exactly match the gold answer)
 def NoAns_exact(predictions, references):
-    return _squad_metric(predictions=predictions, references=references).get("NoAns_exact", 0)
+    return _squad_metric(predictions=predictions, references=references)["NoAns_exact"]

 # The F-score of predicted tokens versus the gold answer
 def NoAns_f1(predictions, references):
-    return _squad_metric(predictions=predictions, references=references).get("NoAns_f1", 0)
+    return _squad_metric(predictions=predictions, references=references)["NoAns_f1"]

 # Best exact match (with varying threshold)
 def best_exact(predictions, references):
-    return _squad_metric(predictions=predictions, references=references).get("best_exact", 0)
+    return _squad_metric(predictions=predictions, references=references)["best_exact"]

 # Best F1 (with varying threshold)
 def best_f1(predictions, references):
-    return _squad_metric(predictions=predictions, references=references).get("best_f1", 0)
+    return _squad_metric(predictions=predictions, references=references)["best_f1"]
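
As committed, utils.exact still carries debugging scaffolding: print(items) followed by import sys; sys.exit() stops the run before any score is computed, so the zip-based scoring beneath it is unreachable. Below is a sketch of the presumably intended form once that scaffolding is removed; this is an assumption inferred from the unreachable lines, not part of the commit.

import evaluate

def _squad_metric(predictions, references):
    # same role as the helper in utils.py: run the full squad_v2 scorer once
    return evaluate.load("squad_v2").compute(
        predictions=predictions, references=references
    )

def exact(items):
    # unzip the per-document (prediction, reference) pairs collected by
    # process_results and score the whole set in a single call
    predictions, references = zip(*items)
    return _squad_metric(
        predictions=list(predictions), references=list(references)
    )["exact"]
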