gaoqiong / lm-evaluation-harness · Commits
Commit 6f4f9e1c
Authored Dec 13, 2023 by lintangsutawika
Parents: 0d5748b7, aed90773

    resolved merge conflict
Changes: 145

Showing 5 changed files with 194 additions and 12 deletions:
  lm_eval/tasks/sciq/alternative_worlds/output_variation/styles.py   +12   -9
  lm_eval/tasks/scrolls/task.py                                        +1   -1
  lm_eval/utils.py                                                    +55   -1
  pyproject.toml                                                       +1   -1
  scripts/model_comparator.py                                        +125   -0
lm_eval/tasks/sciq/alternative_worlds/output_variation/styles.py (view file @ 6f4f9e1c)

import string
from functools import partial


def doc_to_text_base(alphabet, style, doc):
    choices = doc["choices"]["text"]
    ...

@@ -13,19 +14,21 @@ def doc_to_text_base(alphabet, style, doc):
    else:
        choice_string = "{} {}"
    doc_to_text = "\n\n".join(
        [
            "Question: " + doc["question"] + "\nAnswer:",
        ]
        + [choice_string.format(i, j) for i, j in zip(letter_list, choices)]
    )
    return doc_to_text


# Full continuation
def choice_A(doc):
    return doc["choices"]["text"]


# Letters only
def choice_B(alphabet, style, doc):
    ...

@@ -34,10 +37,11 @@ def choice_B(alphabet, style, doc):
    letter_list = [style.format(letter) for letter in alphabet[0:num]]
    if "\t" in style:
        letter_list = [letter.replace("\t", "") for letter in letter_list]
    return letter_list


# Letters + Full continuation
def choice_C(alphabet, style, doc):
    ...

@@ -46,9 +50,10 @@ def choice_C(alphabet, style, doc):
    letter_list = [style.format(letter) for letter in alphabet[0:num]]
    if "\t" not in style:
        letter_list = [letter + " " for letter in letter_list]
    return [letter + choice for letter, choice in zip(letter_list, choices)]


template_01 = partial(doc_to_text_base, string.ascii_lowercase, "({})")
choice_01a = choice_A
...

@@ -82,5 +87,3 @@ template_08 = partial(doc_to_text_base, string.ascii_uppercase, "{}\t")
choice_08a = choice_A
choice_08b = partial(choice_B, string.ascii_uppercase, "{}\t")
choice_08c = partial(choice_C, string.ascii_uppercase, "{}\t")
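For orientation, here is a minimal sketch of how the pieces visible above fit together. The doc dict below is a made-up SciQ-style example, and the elided parts of doc_to_text_base are not reproduced, so treat this as an approximation rather than the task's exact code:

    import string

    # Hypothetical document in the shape these functions index into.
    doc = {
        "question": "What is the boiling point of water at sea level?",
        "choices": {"text": ["90 C", "100 C", "110 C", "120 C"]},
    }

    choices = doc["choices"]["text"]
    alphabet, style = string.ascii_lowercase, "({})"  # same arguments as template_01
    num = len(choices)                                # assumed; the real count logic is elided

    letter_list = [style.format(letter) for letter in alphabet[0:num]]  # ['(a)', '(b)', ...]
    choice_string = "{} {}"

    doc_to_text = "\n\n".join(
        ["Question: " + doc["question"] + "\nAnswer:"]
        + [choice_string.format(i, j) for i, j in zip(letter_list, choices)]
    )
    print(doc_to_text)  # "Question: ...\nAnswer:" followed by "(a) 90 C", "(b) 100 C", ...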
lm_eval/tasks/scrolls/task.py (view file @ 6f4f9e1c)

...
@@ -244,7 +244,7 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
                 idx=i,
                 **kwargs,
             )
-            for i, choice in doc["choices"]
+            for i, choice in enumerate(doc["choices"])
         ]
         return request_list
...
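A brief note on why this one-line fix matters: iterating doc["choices"] directly tries to unpack each choice string into (i, choice), which fails or yields nonsense, whereas enumerate yields proper (index, choice) pairs. A tiny illustration with a made-up choices list:

    choices = ["acid", "base", "salt"]

    # Old form: `for i, choice in choices` attempts to unpack each string
    # and raises ValueError ("too many values to unpack") for these entries.

    # New form: enumerate pairs every choice with its index.
    for i, choice in enumerate(choices):
        print(i, choice)  # 0 acid / 1 base / 2 salt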
lm_eval/utils.py (view file @ 6f4f9e1c)

...
@@ -378,7 +378,9 @@ def make_table(result_dict, column: str = "results"):
             if m + "_stderr" + "," + f in dic:
                 se = dic[m + "_stderr" + "," + f]
-                values.append([k, version, f, n, m, "%.4f" % v, "±", "%.4f" % se])
+                if se != "N/A":
+                    se = "%.4f" % se
+                values.append([k, version, f, n, m, "%.4f" % v, "±", se])
             else:
                 values.append([k, version, f, n, m, "%.4f" % v, "", ""])
             k = ""
...
@@ -669,3 +671,55 @@ def stop_sequences_criteria(
             ],
         ]
     )
+
+
+# from more_itertools
+def divide(iterable, n) -> List[Iterator]:
+    """Divide the elements from *iterable* into *n* parts, maintaining
+    order.
+
+        >>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
+        >>> list(group_1)
+        [1, 2, 3]
+        >>> list(group_2)
+        [4, 5, 6]
+
+    If the length of *iterable* is not evenly divisible by *n*, then the
+    length of the returned iterables will not be identical:
+
+        >>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
+        >>> [list(c) for c in children]
+        [[1, 2, 3], [4, 5], [6, 7]]
+
+    If the length of the iterable is smaller than n, then the last returned
+    iterables will be empty:
+
+        >>> children = divide([1, 2, 3], 5)
+        >>> [list(c) for c in children]
+        [[1], [2], [3], [], []]
+
+    This function will exhaust the iterable before returning and may require
+    significant storage. If order is not important, see :func:`distribute`,
+    which does not first pull the iterable into memory.
+    """
+    if n < 1:
+        raise ValueError("n must be at least 1")
+
+    try:
+        iterable[:0]
+    except TypeError:
+        seq = tuple(iterable)
+    else:
+        seq = iterable
+
+    q, r = divmod(len(seq), n)
+
+    ret = []
+    stop = 0
+    for i in range(1, n + 1):
+        start = stop
+        stop += q + 1 if i <= r else q
+        ret.append(iter(seq[start:stop]))
+
+    return ret
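A short note on the sizing logic above: divmod(len(seq), n) produces a base chunk size q and a remainder r, and the first r chunks each get one extra element, which is what yields the [3, 2, 2] split in the docstring. A quick check of that arithmetic on an example list:

    seq = [1, 2, 3, 4, 5, 6, 7]
    n = 3
    q, r = divmod(len(seq), n)                                 # q = 2, r = 1
    sizes = [q + 1 if i <= r else q for i in range(1, n + 1)]
    print(q, r, sizes)                                         # 2 1 [3, 2, 2]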
pyproject.toml (view file @ 6f4f9e1c)

...
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "lm_eval"
-version = "1.0.0"
+version = "0.4.0"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
...
scripts/model_comparator.py (new file, 0 → 100644) (view file @ 6f4f9e1c)

import argparse
import numpy as np
import lm_eval.evaluator
from lm_eval import tasks
import scipy.stats
from typing import Tuple, Dict, List
import pandas as pd
import torch
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

eval_logger = lm_eval.utils.eval_logger


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
    # Determining the p-value
    p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
    return Z, p_value


def print_results(data_to_print: List = None, results_dict: Dict = None, alpha: float = None):
    model1_data = data_to_print[0]
    model2_data = data_to_print[1]
    table_data = []
    for task in model1_data.keys():
        row = {
            "Task": task,
            "HF Accuracy": model1_data[task]["acc,none"],
            "vLLM Accuracy": model2_data[task]["acc,none"],
            "HF StdErr": model1_data[task]["acc_stderr,none"],
            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
        }
        table_data.append(row)
    comparison_df = pd.DataFrame(table_data)
    comparison_df["Z-Score"] = comparison_df["Task"].apply(lambda task: results_dict[task]["z"])
    comparison_df["P-Value"] = comparison_df["Task"].apply(lambda task: results_dict[task]["p_value"])
    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(lambda p: "✓" if p > alpha else "×")
    return comparison_df


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare")
    parser.add_argument("--hf_args", help="huggingface model args <arg>=<value>", default="")
    parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument("--limit", type=float, default=100)
    parser.add_argument("--alpha", type=float, default=0.05, help="Significance level for two-tailed z-test")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--batch", type=str, default=8)
    parser.add_argument("--verbosity", type=str, default="INFO", help="Logging verbosity")
    return parser.parse_args()


if __name__ == "__main__":
    tasks.initialize_tasks()
    args = parse_args()
    tasks = args.tasks.split(",")
    print(tasks)
    hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=f"pretrained={args.pretrained}" + vllm_args,
        tasks=tasks,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    torch.cuda.empty_cache()
    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={args.pretrained}" + hf_args,
        tasks=tasks,
        limit=args.limit,
        device=args.device,
        batch_size=args.batch,
    )
    all_res = {}
    for task1, task2 in zip(results_hf["results"].items(), results_vllm["results"].items()):
        assert task1[0] == task2[0]
        z, p_value = calculate_z_value(task1[1], task2[1])
        all_res[task1[0]] = {"z": z, "p_value": p_value}
    df = print_results([results_hf["results"], results_vllm["results"]], all_res, args.alpha)
    print(df)
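For reference, calculate_z_value above is a standard two-sample z-test on the per-task accuracies. A minimal sketch of the same computation on hypothetical numbers (the accuracies and standard errors below are invented for illustration):

    import numpy as np
    import scipy.stats

    # Hypothetical per-task results in the "metric,filter" key format used above.
    res_hf = {"acc,none": 0.62, "acc_stderr,none": 0.015}
    res_vllm = {"acc,none": 0.60, "acc_stderr,none": 0.016}

    z = (res_hf["acc,none"] - res_vllm["acc,none"]) / np.sqrt(
        res_hf["acc_stderr,none"] ** 2 + res_vllm["acc_stderr,none"] ** 2
    )
    p_value = 2 * scipy.stats.norm.sf(abs(z))  # two-tailed

    print(z, p_value)  # z ≈ 0.91, p ≈ 0.36 -> no significant difference at alpha = 0.05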