gaoqiong / lm-evaluation-harness · Commits

Commit a6bd7126, authored Aug 19, 2023 by Herbie Bradley
Parent: dadfd4a8

    Implement ece, remove plotting

Showing 6 changed files with 96 additions and 43 deletions:

    lm_eval/api/metrics.py                        +53  -2
    lm_eval/api/task.py                           +17  -12
    lm_eval/evaluator.py                           +0  -26
    lm_eval/tasks/logiqa/logiqa.yaml               +1  -2
    lm_eval/tasks/logiqa/logiqa_calibration.yaml  +24  -0
    lm_eval/tasks/logiqa/utils_logiqa.py           +1  -1
lm_eval/api/metrics.py  (+53, -2)

 import math
+import random
 from collections.abc import Iterable

 import numpy as np
 import sacrebleu
 import sklearn.metrics
-import random

-from lm_eval.api.registry import register_metric, register_aggregation
+from lm_eval.api.registry import register_aggregation, register_metric


 # Register Aggregations First
...
@@ -56,6 +56,37 @@ def matthews_corrcoef(items):
     return sklearn.metrics.matthews_corrcoef(golds, preds)


+@register_aggregation("ece")
+def ece(items: list) -> float:
+    probs: list[float] = []
+    scores: list[float] = []
+    for i in range(len(items)):
+        # Get only largest probability from each example
+        largest_idx = np.argmax(items[i]["probs"])
+        probs.append(items[i]["probs"][largest_idx])
+        scores.append(items[i]["scores"][largest_idx])
+    sorted_indices = np.argsort(probs)
+    sorted_probs = np.asarray(probs)[sorted_indices]
+    sorted_scores = np.asarray(scores)[sorted_indices]
+
+    def bin_to_subsets(array: np.ndarray, num_subsets: int = 10) -> np.ndarray:
+        subset_size: int = len(array) // num_subsets
+        remainder: int = len(array) % num_subsets
+        subsets: list[np.ndarray] = []
+        start: int = 0
+        for _ in range(num_subsets):
+            subset_end: int = start + subset_size + (1 if remainder > 0 else 0)
+            subsets.append(array[start:subset_end])
+            start = subset_end
+            remainder -= 1
+        return subsets
+
+    probs = np.asarray([np.mean(x) for x in bin_to_subsets(sorted_probs, 10)])
+    freqs = np.asarray([np.mean(x) for x in bin_to_subsets(sorted_scores, 10)])
+    return np.sum(np.abs(freqs - probs)) / len(freqs)
+
+
 @register_metric(
     metric="acc",
     higher_is_better=True,
...
@@ -86,6 +117,26 @@ def acc_mutual_info_fn(items):  # This is a passthrough function
     return items


+@register_metric(
+    metric="ece",
+    higher_is_better=False,
+    output_type="multiple_choice",
+    aggregation="ece",
+)
+def ece_fn(items):  # This is a passthrough function
+    """
+    Expected Calibration Error (ECE).
+
+    This consists of the average absolute difference between the fraction of
+    model predictions which are correct and the mean of the model's normalized
+    probability for those predictions (after binning), for multiple choice questions.
+    Lower is better.
+
+    Paper: https://arxiv.org/abs/2207.05221
+    """
+    return items
+
+
 @register_metric(
     metric="perplexity",
     higher_is_better=False,
...
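For orientation, the toy Python sketch below walks through the same top-choice selection and equal-mass binning that the ece aggregation above performs. It is not part of the commit; the records, the bin count of 2, and the use of np.array_split in place of bin_to_subsets are illustrative assumptions.

import numpy as np

# Toy per-example records in the shape ece() receives: "probs" holds the
# normalized probability of every answer choice, "scores" marks gold choices.
items = [
    {"probs": [0.70, 0.10, 0.10, 0.10], "scores": [1.0, 0.0, 0.0, 0.0]},
    {"probs": [0.40, 0.30, 0.20, 0.10], "scores": [0.0, 1.0, 0.0, 0.0]},
    {"probs": [0.55, 0.25, 0.15, 0.05], "scores": [1.0, 0.0, 0.0, 0.0]},
    {"probs": [0.90, 0.05, 0.03, 0.02], "scores": [0.0, 0.0, 1.0, 0.0]},
]

# Keep only the top-probability choice of each example.
confs, hits = [], []
for item in items:
    top = int(np.argmax(item["probs"]))
    confs.append(item["probs"][top])
    hits.append(item["scores"][top])

# Sort by confidence and split into equal-mass bins (2 bins for 4 examples
# here; the harness uses 10).
order = np.argsort(confs)
conf_bins = np.array_split(np.asarray(confs)[order], 2)
hit_bins = np.array_split(np.asarray(hits)[order], 2)

# ECE = mean over bins of |empirical accuracy - mean confidence|.
ece = np.mean([abs(h.mean() - c.mean()) for h, c in zip(hit_bins, conf_bins)])
print(ece)  # approximately 0.1625 for this toy data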
lm_eval/api/task.py  (+17, -12)

...
@@ -651,8 +651,6 @@ class ConfigurableTask(Task):
             if type(test_target) is list:
                 self.multiple_target = len(test_target)

-        self.calibrations: list = []
-
     def download(self, dataset_kwargs=None):
         self.dataset = datasets.load_dataset(
             path=self.DATASET_PATH,
...
@@ -948,10 +946,7 @@ class ConfigurableTask(Task):
             choices = self.doc_to_choice(doc)
             completion_len = np.array([float(len(i)) for i in choices])

-            if (
-                2 * len(choices) == len(lls)
-                and "acc_mutual_info" in self._metric_fn_list.keys()
-            ):
+            if 2 * len(choices) == len(lls) and "acc_mutual_info" in use_metric:
                 # then we are doing mutual info.
                 # this stores the "dryrun" / unconditional answer loglikelihoods
                 lls_unconditional = lls[1::2]
...
@@ -968,18 +963,27 @@ class ConfigurableTask(Task):
             gold = self.doc_to_target(doc)
             if type(gold) is str:
                 gold = choices.index(gold)

-            # Convert lls from log-probabilities to normalized probabilities
-            norm_probs = np.exp(lls - sp.logsumexp(lls))
-            print(norm_probs)
+            if "ece" in use_metric:
+                # Convert lls from log-probabilities to normalized probabilities
+                norm_probs: np.ndarray = np.exp(lls - sp.logsumexp(lls))
+                calib_scores: np.ndarray = np.zeros(len(choices))
+                if isinstance(gold, list):
+                    for g in gold:
+                        calib_scores[g] = 1.0
+                else:
+                    calib_scores[gold] = 1.0
+                calibration_probs: dict[str, np.ndarray] = {
+                    "probs": norm_probs,
+                    "scores": calib_scores,
+                }

             if self.multiple_target:
                 acc = 1.0 if pred in gold else 0.0
                 acc_norm = 1.0 if pred_norm in gold else 0.0
                 exact_match = int(any([is_greedy[i] for i in gold]))
             else:
                 acc = 1.0 if pred == gold else 0.0
-                for i, choice in enumerate(choices):
-                    calib_score = 1.0 if i == gold else 0.0
-                    self.calibrations.append((norm_probs[i], calib_score))
                 acc_norm = 1.0 if pred_norm == gold else 0.0
                 # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
                 exact_match = int(is_greedy[gold])
...
@@ -990,6 +994,7 @@ class ConfigurableTask(Task):
                 **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                 **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                 **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
+                **({"ece": calibration_probs} if "ece" in use_metric else {}),
             }

             if "acc_mutual_info" in use_metric:
...
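For a sense of what the new "ece" entry in the result dict carries per document, here is a standalone sketch with made-up log-likelihoods. It assumes sp refers to scipy.special (which provides logsumexp), matching how the diff calls sp.logsumexp; the actual alias in the file may differ.

import numpy as np
import scipy.special as sp

# Hypothetical per-choice log-likelihoods for a 4-option question whose
# gold answer is choice index 2.
lls = np.array([-4.1, -2.9, -1.2, -3.7])
gold = 2

# Softmax over choices, as in the "ece" branch above.
norm_probs = np.exp(lls - sp.logsumexp(lls))
calib_scores = np.zeros(len(lls))
calib_scores[gold] = 1.0

calibration_probs = {"probs": norm_probs, "scores": calib_scores}
# The ece aggregation later keeps only the argmax entry of "probs" from each
# such record and compares binned confidence against binned correctness.
print(calibration_probs)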
lm_eval/evaluator.py  (+0, -26)

...
@@ -341,32 +341,6 @@ def evaluate(
                 for metric, value in metrics.items():
                     vals[(task_name, key, metric)].append(value)

-        calibs = sorted(task.calibrations, key=lambda x: x[0])
-
-        def bin_list_into_subsets(input_list, num_subsets=10):
-            subset_size = len(input_list) // num_subsets
-            remainder = len(input_list) % num_subsets
-            subsets = []
-            start = 0
-            for _ in range(num_subsets):
-                subset_end = start + subset_size + (1 if remainder > 0 else 0)
-                subsets.append(input_list[start:subset_end])
-                start = subset_end
-                remainder -= 1
-            return subsets
-
-        subsets = bin_list_into_subsets(calibs, 10)
-        x_coords = [np.mean([x[0] for x in subset]) for subset in subsets]
-        y_coords = [np.mean([x[1] for x in subset]) for subset in subsets]
-        model_name = lm.config._name_or_path.split("/")[1]
-        plt.plot(x_coords, y_coords, label=model_name)
-        plt.plot([0, 1], [0, 1], linestyle="--", color="black")
-        plt.xlabel("Probabilities")
-        plt.ylabel("Frequences")
-        plt.title("Calibration")
-        plt.legend()
-        plt.savefig(f"{model_name}-long.png")
-
     if lm.world_size > 1:
         # if multigpu, then gather data across all ranks
         # first gather logged samples across all ranks
...
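The commit removes this inline matplotlib plotting entirely. If a reliability diagram is still wanted, it can be reproduced offline from (confidence, correctness) pairs along the lines of the sketch below; the pairs list, the bin count, and the output filename are made up, and np.array_split stands in for the deleted bin_list_into_subsets helper.

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical (top-choice probability, 1.0/0.0 correctness) pairs, e.g.
# collected from the per-sample "ece" records described above.
pairs = [(0.91, 1.0), (0.40, 0.0), (0.67, 1.0), (0.55, 0.0), (0.83, 1.0), (0.72, 0.0)]

calibs = sorted(pairs, key=lambda x: x[0])
bins = np.array_split(np.asarray(calibs), 3)  # 3 equal-mass bins for this toy set

x_coords = [b[:, 0].mean() for b in bins]  # mean confidence per bin
y_coords = [b[:, 1].mean() for b in bins]  # empirical accuracy per bin

plt.plot(x_coords, y_coords, marker="o", label="model")
plt.plot([0, 1], [0, 1], linestyle="--", color="black")  # perfect calibration
plt.xlabel("Probabilities")
plt.ylabel("Frequencies")
plt.title("Calibration")
plt.legend()
plt.savefig("calibration.png")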
lm_eval/tasks/logiqa/logiqa.yaml  (+1, -2)

...
@@ -5,8 +5,7 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-num_fewshot: 5
-doc_to_choice: !function utils_logiqa.doc_to_choice
+doc_to_choice: "{{options}}"
 doc_to_text: !function utils_logiqa.doc_to_text
 doc_to_target: !function utils_logiqa.doc_to_target
 doc_to_decontamination_query: "{{context}}"
...
lm_eval/tasks/logiqa/logiqa_calibration.yaml  (new file, +24)

task: logiqa_calibration
dataset_path: EleutherAI/logiqa
dataset_name: logiqa
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
num_fewshot: 5
fewshot_split: train
doc_to_choice: !function utils_logiqa.doc_to_choice
doc_to_text: !function utils_logiqa.doc_to_text
doc_to_target: !function utils_logiqa.doc_to_target
doc_to_decontamination_query: "{{context}}"
should_decontaminate: true
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: ece
    aggregation: ece
    higher_is_better: false
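One way to exercise the new task end to end is through the harness's Python entry point. The snippet below is only a sketch: the simple_evaluate keyword arguments, the "hf" model type string, and the pythia-70m checkpoint are assumptions that may need adjusting for this branch.

import lm_eval.evaluator as evaluator

# Evaluate a small model on the new calibration task (assumed API).
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-70m",
    tasks=["logiqa_calibration"],
    batch_size=8,
)

# Aggregated metrics, including the new "ece" value, are reported per task.
print(results["results"]["logiqa_calibration"])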
lm_eval/tasks/logiqa/utils_logiqa.py  (+1, -1)

...
@@ -24,5 +24,5 @@ def doc_to_target(doc) -> int:
     return choices.index(doc["label"].strip())


-def doc_to_choice(doc):
+def doc_to_choice(doc) -> list:
     return ["(A)", "(B)", "(C)", "(D)"]