gaoqiong / lm-evaluation-harness · Commits

Commit f692caa9, authored Feb 20, 2024 by lintangsutawika

    updated to appease the pre-commit

Parent: ab96fc7e

Showing 9 changed files with 32 additions and 18 deletions (+32 -18)
Changed files:

  lm_eval/__main__.py                                           +1  -1
  lm_eval/api/metrics.py                                        +2  -3
  lm_eval/api/registry.py                                       +1  -0
  lm_eval/api/task.py                                           +1  -1
  lm_eval/evaluator.py                                          +26 -9
  lm_eval/tasks/bbh/cot_zeroshot/multistep_arithmetic_two.yaml  +0  -1
  lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml           +0  -1
  lm_eval/tasks/mmlu/alternative_worlds/output_variation/style_05_generative/a/_template_yaml  +1 -1
  lm_eval/utils.py                                              +0  -1
lm_eval/__main__.py
@@ -11,7 +11,7 @@ from typing import Union
 import numpy as np

 from lm_eval import evaluator, utils
-from lm_eval.tasks import TaskManager, include_path, initialize_tasks
+from lm_eval.tasks import TaskManager, initialize_tasks
 from lm_eval.utils import make_table
lm_eval/api/metrics.py
 import logging
 import math
 import random
-from collections.abc import Iterable
 from collections import defaultdict
+from collections.abc import Iterable
 from typing import List

+import evaluate
 import numpy as np
 import sacrebleu
 import sklearn.metrics
-import evaluate

 from lm_eval.api.registry import register_aggregation, register_metric

@@ -119,7 +119,6 @@ def ter(items):
 @register_aggregation("brier_score")
 def brier_score(items):
     # This is a passthrough function
     # Certain datasets like arc_easy can have a different number of choices.
     golds, predictions = list(zip(*items))
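For reference, a minimal standalone sketch of what a Brier-score aggregation over (gold, probability-vector) pairs computes: the mean squared error between the predicted probabilities and a one-hot encoding of the gold index. This is an illustration under assumptions, not the harness's implementation, and it assumes every document has the same number of choices (the comment above notes that datasets like arc_easy can differ).

# Hedged sketch of a Brier-score aggregation; names and shapes are assumptions.
import numpy as np

def brier_score_sketch(items):
    # items: iterable of (gold_index, prob_vector) pairs gathered per document
    golds, predictions = zip(*items)
    predictions = np.array(predictions, dtype=float)      # shape (N, num_choices)
    one_hot = np.eye(predictions.shape[1])[list(golds)]   # one-hot gold labels
    # mean over documents of the summed squared error per choice
    return float(np.mean(np.sum((predictions - one_hot) ** 2, axis=1)))

# Example: two 3-choice documents
print(brier_score_sketch([(0, [0.7, 0.2, 0.1]), (2, [0.1, 0.2, 0.7])]))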
lm_eval/api/registry.py
@@ -2,6 +2,7 @@ import logging
 from typing import Callable, Dict

+import evaluate

 from lm_eval.api.model import LM
lm_eval/api/task.py
@@ -1193,8 +1193,8 @@ class ConfigurableTask(Task):
                 **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                 **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                 **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
-                # {"brier_score": (gold, prob_norm)}
                 **(
+                    # {"brier_score": (gold, prob_norm)}
                     {"brier_score": [np.eye(len(prob_norm))[gold], prob_norm]}
                     if "brier_score" in use_metric
                     else {}
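As a quick illustration of the value built for the "brier_score" key above, np.eye(len(prob_norm))[gold] selects the one-hot row for the gold choice, which is then paired with the probability vector. The concrete values below are made up for the example.

import numpy as np

prob_norm = [0.1, 0.6, 0.3]   # assumed: normalized per-choice probabilities
gold = 1                      # assumed: index of the correct choice

one_hot = np.eye(len(prob_norm))[gold]
print(one_hot)                # [0. 1. 0.]
print([one_hot, prob_norm])   # the pair handed to the "brier_score" aggregation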
lm_eval/evaluator.py
@@ -498,7 +498,6 @@ def evaluate(
             metric_key = f"{metric},{key}"
             agg_fn = task.aggregation()[metric]
             results[task_name][metric_key] = agg_fn(items)
             results[task_name]["samples"] = len(items)

@@ -524,19 +523,37 @@ def evaluate(
                 # or `task_name: []`.
                 # we only want to operate on groups here.
                 continue
-            for metric in [
-                key
-                for key in results[task_list[0]].keys()
-                if "_stderr" not in key and key not in ["alias", "samples"]
-            ]:  # TODO: what if tasks don't all share the same metrics
+            group_metrics = list(
+                dict.fromkeys(
+                    [
+                        key
+                        for task in task_list
+                        for key in results[task].keys()
+                        if "_stderr" not in key and key not in ["alias", "samples"]
+                    ]
+                )
+            )
+            for metric in group_metrics:
+                # TODO: what if tasks don't all share the same metrics
                 stderr = "_stderr,".join(metric.split(","))

                 # gather metrics, sizes, and stderrs from subtasks
-                metrics = [results[task][metric] for task in task_list]  # TODO: copy?
-                stderrs = [results[task][stderr] for task in task_list]
-                sizes = [results[task]["samples"] for task in task_list]
+                metrics = [
+                    results[task][metric] for task in task_list if metric in results[task]
+                ]  # TODO: copy?
+                stderrs = [
+                    results[task][stderr] for task in task_list if stderr in results[task]
+                ]
+                sizes = [
+                    results[task]["samples"] for task in task_list if metric in results[task]
+                ]

                 # compute group's pooled metric and stderr
                 results[group][
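The gist of the change above, as a self-contained sketch (toy results dict, not the harness itself): collect the union of metric keys across all subtasks, deduplicated but order-preserving via dict.fromkeys, then skip any subtask that does not report a given metric when pooling. The task names, metric keys, and the weighted-mean pooling below are assumptions made for the example.

results = {
    "sub_a": {"alias": "sub_a", "acc,none": 0.50, "samples": 100},
    "sub_b": {"alias": "sub_b", "acc,none": 0.70, "f1,none": 0.65, "samples": 50},
}
task_list = ["sub_a", "sub_b"]

# union of metric keys across subtasks, deduplicated, order preserved
group_metrics = list(
    dict.fromkeys(
        key
        for task in task_list
        for key in results[task]
        if "_stderr" not in key and key not in ["alias", "samples"]
    )
)
print(group_metrics)  # ['acc,none', 'f1,none']

for metric in group_metrics:
    # only subtasks that actually report this metric contribute
    metrics = [results[t][metric] for t in task_list if metric in results[t]]
    sizes = [results[t]["samples"] for t in task_list if metric in results[t]]
    # sample-size weighted mean, analogous to a group's pooled metric
    pooled = sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)
    print(metric, round(pooled, 4))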
lm_eval/tasks/bbh/cot_zeroshot/multistep_arithmetic_two.yaml
@@ -16,4 +16,3 @@ filter_list:
       - function: "regex"
         regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
       - function: "take_first"
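For context, a rough sketch of what this "regex" filter followed by "take_first" does to a chain-of-thought completion. The pattern is copied from the YAML above; the response string and the helper code around it are made up for the example and are not the harness's filter classes.

import re

pattern = (
    r"((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)"
    r"|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
)
response = "Let's work through it step by step. The answer is 42."

matches = re.findall(pattern, response)    # "regex": all candidate answer spans
answer = matches[0][0] if matches else ""  # "take_first": keep the first match
print(answer)  # 42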
lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml
@@ -15,4 +15,3 @@ filter_list:
       - function: "regex"
         regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
lm_eval/tasks/mmlu/alternative_worlds/output_variation/style_05_generative/a/_template_yaml
@@ -15,4 +15,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  version: 0.0
\ No newline at end of file
+  version: 0.0
lm_eval/utils.py
@@ -17,7 +17,6 @@ from typing import (
 )

 import numpy as np
 import yaml
 from jinja2 import BaseLoader, Environment, StrictUndefined