Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
b06e797a
Unverified
Commit
b06e797a
authored
Jun 16, 2023
by
Lintang Sutawika
Committed by
GitHub
Jun 16, 2023
Browse files
Merge branch 'big-refactor' into big-refactor-merge
parents
b59fe379
3fc5bedc
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
44 additions
and
3 deletions
+44
-3
lm_eval/evaluator.py
lm_eval/evaluator.py
+20
-1
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
+1
-1
main.py
main.py
+23
-1
No files found.
lm_eval/evaluator.py
View file @
b06e797a
import
random
import
random
import
itertools
import
itertools
import
json
import
collections
import
collections
import
logging
import
sys
import
torch
import
torch
...
@@ -22,6 +25,10 @@ from lm_eval.utils import (
...
@@ -22,6 +25,10 @@ from lm_eval.utils import (
from
lm_eval.logger
import
eval_logger
from
lm_eval.logger
import
eval_logger
logger
=
logging
.
getLogger
(
__name__
)
logger
.
setLevel
(
logging
.
INFO
)
logger
.
addHandler
(
logging
.
StreamHandler
(
sys
.
stdout
))
@
positional_deprecated
@
positional_deprecated
def
simple_evaluate
(
def
simple_evaluate
(
...
@@ -159,7 +166,7 @@ def evaluate(
...
@@ -159,7 +166,7 @@ def evaluate(
results
=
collections
.
defaultdict
(
dict
)
results
=
collections
.
defaultdict
(
dict
)
versions
=
collections
.
defaultdict
(
dict
)
versions
=
collections
.
defaultdict
(
dict
)
configs
=
collections
.
defaultdict
(
dict
)
configs
=
collections
.
defaultdict
(
dict
)
samples
=
collections
.
defaultdict
(
list
)
requests
=
collections
.
defaultdict
(
list
)
requests
=
collections
.
defaultdict
(
list
)
# docs = {}
# docs = {}
...
@@ -250,6 +257,7 @@ def evaluate(
...
@@ -250,6 +257,7 @@ def evaluate(
enumerate
(
task
.
validation_docs
()),
lm
.
rank
,
limit
,
lm
.
world_size
enumerate
(
task
.
validation_docs
()),
lm
.
rank
,
limit
,
lm
.
world_size
)
)
)
)
for
doc_id
,
doc
in
doc_iterator
:
for
doc_id
,
doc
in
doc_iterator
:
# subset instances to only this document id ; sort by idx
# subset instances to only this document id ; sort by idx
requests
=
list
(
filter
(
lambda
x
:
x
.
doc_id
==
doc_id
,
task
.
instances
))
requests
=
list
(
filter
(
lambda
x
:
x
.
doc_id
==
doc_id
,
task
.
instances
))
...
@@ -257,6 +265,16 @@ def evaluate(
...
@@ -257,6 +265,16 @@ def evaluate(
metrics
=
task
.
process_results
(
metrics
=
task
.
process_results
(
doc
,
[
req
.
filtered_resps
[
key
]
for
req
in
requests
]
doc
,
[
req
.
filtered_resps
[
key
]
for
req
in
requests
]
)
)
target
=
task
.
doc_to_target
(
doc
)
example
=
{
"doc_id"
:
doc_id
,
"doc"
:
doc
,
"target"
:
target
,
"resps"
:
[
req
.
resps
for
req
in
requests
],
"filtered_resps"
:
[
req
.
filtered_resps
[
key
]
for
req
in
requests
],
}
example
.
update
(
metrics
)
samples
[
task_name
].
append
(
example
)
for
metric
,
value
in
metrics
.
items
():
for
metric
,
value
in
metrics
.
items
():
vals
[(
task_name
,
key
,
metric
)].
append
(
value
)
vals
[(
task_name
,
key
,
metric
)].
append
(
value
)
...
@@ -321,6 +339,7 @@ def evaluate(
...
@@ -321,6 +339,7 @@ def evaluate(
"results"
:
dict
(
results
),
"results"
:
dict
(
results
),
"configs"
:
dict
(
configs
),
"configs"
:
dict
(
configs
),
"versions"
:
dict
(
versions
),
"versions"
:
dict
(
versions
),
"samples"
:
samples
,
}
}
else
:
else
:
...
...
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
View file @
b06e797a
...
@@ -16,6 +16,6 @@ metric_list:
...
@@ -16,6 +16,6 @@ metric_list:
-
metric
:
perplexity
-
metric
:
perplexity
aggregation
:
perplexity
aggregation
:
perplexity
higher_is_better
:
false
higher_is_better
:
false
-
metric
:
acc
uracy
-
metric
:
acc
aggregation
:
mean
aggregation
:
mean
higher_is_better
:
true
higher_is_better
:
true
main.py
View file @
b06e797a
import
os
import
os
import
re
import
json
import
json
import
fnmatch
import
jsonlines
import
argparse
import
argparse
import
logging
from
lm_eval
import
evaluator
,
utils
from
lm_eval
import
evaluator
,
utils
from
lm_eval.api.registry
import
ALL_TASKS
from
lm_eval.api.registry
import
ALL_TASKS
...
@@ -88,15 +92,33 @@ def main():
...
@@ -88,15 +92,33 @@ def main():
)
)
if
results
is
not
None
:
if
results
is
not
None
:
samples
=
results
.
pop
(
"samples"
)
dumped
=
json
.
dumps
(
results
,
indent
=
2
)
dumped
=
json
.
dumps
(
results
,
indent
=
2
)
print
(
dumped
)
print
(
dumped
)
batch_sizes
=
","
.
join
(
map
(
str
,
results
[
"config"
][
"batch_sizes"
]))
if
args
.
output_path
:
if
args
.
output_path
:
os
.
makedirs
(
os
.
path
.
dirname
(
args
.
output_path
),
exist_ok
=
True
)
os
.
makedirs
(
os
.
path
.
dirname
(
args
.
output_path
),
exist_ok
=
True
)
with
open
(
args
.
output_path
,
"w"
)
as
f
:
with
open
(
args
.
output_path
,
"w"
)
as
f
:
f
.
write
(
dumped
)
f
.
write
(
dumped
)
batch_sizes
=
","
.
join
(
map
(
str
,
results
[
"config"
][
"batch_sizes"
]))
for
task_name
,
config
in
results
[
"configs"
].
items
():
output_name
=
"{}_{}"
.
format
(
re
.
sub
(
"/"
,
"__"
,
args
.
model_args
),
task_name
)
if
os
.
path
.
isdir
(
args
.
output_path
):
filename
=
f
"./
{
args
.
output_path
}
/
{
output_name
}
.jsonl"
elif
os
.
path
.
isfile
(
args
.
output_path
):
filename
=
(
f
"./
{
os
.
path
.
dirname
(
args
.
output_path
)
}
/
{
output_name
}
.jsonl"
)
with
jsonlines
.
open
(
filename
,
"w"
)
as
f
:
f
.
write_all
(
samples
[
task_name
])
print
(
print
(
f
"
{
args
.
model
}
(
{
args
.
model_args
}
), limit:
{
args
.
limit
}
, num_fewshot:
{
args
.
num_fewshot
}
, "
f
"
{
args
.
model
}
(
{
args
.
model_args
}
), limit:
{
args
.
limit
}
, num_fewshot:
{
args
.
num_fewshot
}
, "
f
"batch_size:
{
args
.
batch_size
}{
f
' (
{
batch_sizes
}
)
' if batch_sizes else ''
}
"
f
"batch_size:
{
args
.
batch_size
}{
f
' (
{
batch_sizes
}
)
' if batch_sizes else ''
}
"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment