Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
24e3e3fa
Unverified
Commit
24e3e3fa
authored
Jun 16, 2023
by
Lintang Sutawika
Committed by
GitHub
Jun 16, 2023
Browse files
Merge pull request #590 from gakada/big-refactor-merge
Merge master into big-refactor
parents
3fc5bedc
b7c3580a
Changes
202
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
196 additions
and
1 deletion
+196
-1
scripts/regression.py
scripts/regression.py
+193
-0
setup.py
setup.py
+3
-1
No files found.
scripts/regression.py
0 → 100644
View file @
24e3e3fa
import
argparse
import
json
import
os
import
subprocess
import
time
from
pathlib
import
Path
from
lm_eval
import
evaluator
,
utils
from
lm_eval.api.registry
import
ALL_TASKS
# Models exercised by the regression run, grouped by HF architecture family
# (the grouping drives the --model type and batch-size choices below).
seq2seq_models = ["google/flan-t5-small"]
causal_models = [
    "gpt2",
    "facebook/opt-125m",
    "EleutherAI/gpt-neo-125m",
    "EleutherAI/pythia-160m",
]
model_names = seq2seq_models + causal_models

# Tasks grouped by answer format; generation tasks are currently empty.
completion_tasks = ["boolq", "lambada_openai", "winogrande"]
choice_tasks = ["hellaswag", "openbookqa", "piqa"]
# Perplexity tasks are filtered out for seq2seq models in eval_models().
perplexity_tasks = ["wikitext"]
generation_tasks = []
task_names = completion_tasks + choice_tasks + perplexity_tasks + generation_tasks
def parse_args():
    """Parse CLI options for the regression run.

    Returns:
        argparse.Namespace with branches/models/tasks (comma-separated str
        from the CLI, or list when defaulted), the acc_norm flag, an optional
        perplexity metric name, num_fewshot, limit, the fallback --model
        type, model_args, and batch_size.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--branches", default=[])
    parser.add_argument("--models", default=model_names)
    parser.add_argument("--tasks", default=task_names)
    # Fix: the original used `type=bool`, which argparse does not understand —
    # bool("False") is True, so any value (even "False") enabled the flag.
    # A presence flag is what was intended; default stays False.
    parser.add_argument("--acc_norm", action="store_true", default=False)
    parser.add_argument("--perplexity", default=None)
    # TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--limit", type=float, default=None)
    # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
    parser.add_argument("--model", default="hf-causal")
    # Use whatever is faster here
    parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True")
    parser.add_argument("--batch_size", default="auto")
    return parser.parse_args()
def eval_models(args, branch=None):
    """Run the evaluation harness for every model on one git branch.

    When `branch` is given, checks it out first (returns ({}, 0) if the
    checkout fails); otherwise evaluates the current branch. Reads the
    module-level `initial_branch` global (set by main()) to label output
    files for the default run.

    Returns:
        (results, elapsed): dict mapping model name -> parsed results JSON
        (a failed run maps to {"results": {}}), and wall-clock seconds.
    """
    if branch is not None:
        # os.system returns the shell exit status; non-zero checkout -> bail.
        if os.system(f"git checkout {branch}") != 0:
            return {}, 0
    branch = branch or initial_branch

    start_time = time.time()

    results = {}
    for model in args.models:
        model_type = (
            "hf-causal"
            if model in causal_models
            else "hf-seq2seq"
            if model in seq2seq_models
            else args.model
        )
        model_args = f"pretrained={model},{args.model_args}"
        # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
        tasks = (
            args.tasks
            if model in causal_models or model_type == "hf-causal"
            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
        )
        # TODO: OOM with auto for seq2seq models, also can OOM with llama
        batch_size = (
            args.batch_size
            if model in causal_models or model_type == "hf-causal"
            else 64
            if args.batch_size == "auto"
            else args.batch_size
        )
        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
        # NOTE(review): values are interpolated into a shell string without
        # escaping; acceptable for a dev script, but branch/model names with
        # shell metacharacters would break or be executed.
        command = (
            f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
        )
        print(
            f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
        )
        ret = os.system(command)
        if ret == 0:
            # Fix: use a context manager instead of json.load(open(...)),
            # which leaked the file handle.
            with open(output_path) as f:
                results[model] = json.load(f)
        else:
            results[model] = {"results": {}}

    end_time = time.time()
    return results, end_time - start_time
def extract_value(args, results, model, task, err=False):
    """Pull one metric (or its stderr when `err` is True) from a results dict.

    Preference order: "acc_norm,none" (only when args.acc_norm is set),
    then "acc,none", then the perplexity metric named by args.perplexity
    (defaulting to "word_perplexity"). Perplexity metrics report 0 for the
    stderr. Any missing model/task/metric yields 0.
    """
    if model not in results:
        return 0
    per_task = results[model]["results"]
    if task not in per_task:
        return 0
    metrics = per_task[task]

    if args.acc_norm and "acc_norm,none" in metrics:
        key = "acc_norm_stderr,none" if err else "acc_norm,none"
        return metrics[key]

    if "acc,none" in metrics:
        key = "acc_stderr,none" if err else "acc,none"
        return metrics[key]

    perplexity_key = (args.perplexity or "word_perplexity") + ",none"
    if perplexity_key in metrics:
        # No stderr is recorded for perplexity metrics.
        return 0 if err else metrics[perplexity_key]

    return 0
def format_value(args, results, model, task):
    """Render a metric as a percentage string, with ± stderr when available."""
    value = extract_value(args, results, model, task) * 100
    stderr = extract_value(args, results, model, task, err=True) * 100
    if stderr != 0:
        return f"{value:.2f} ± {stderr:.2f}"
    return f"{value:.2f}"
def format_diff(args, results1, results2, model, task):
    """Render the metric delta between two runs; positive deltas are bolded."""
    baseline = 100 * extract_value(args, results1, model, task)
    candidate = 100 * extract_value(args, results2, model, task)
    delta = candidate - baseline
    if delta > 0:
        return f"**+{delta:.2f}**"
    return f"{delta:.2f}"
def main():
    """Evaluate all models on the current branch and on each requested
    branch, then print markdown tables comparing metrics and runtimes."""
    args = parse_args()
    # CLI values arrive as comma-separated strings; programmatic defaults are
    # already lists. Fix: use isinstance() instead of `type(x) == str`.
    args.branches = (
        args.branches.split(",") if isinstance(args.branches, str) else args.branches
    )
    args.models = (
        args.models.split(",") if isinstance(args.models, str) else args.models
    )
    args.tasks = (
        ALL_TASKS
        if args.tasks == "all_tasks"
        else utils.pattern_match(args.tasks.split(","), ALL_TASKS)
        if isinstance(args.tasks, str)
        else args.tasks
    )

    # eval_models() reads this global to label output files for the
    # default-branch run.
    global initial_branch
    initial_branch = (
        subprocess.check_output("git branch --show-current", shell=True)
        .decode("ascii")
        .strip()
    )

    # TODO: implement proper timing for each task
    # TODO: reduce IO by sharing tasks between models?
    results, runtime = eval_models(args)
    print(results, runtime)

    runs = []
    for branch in args.branches:
        runs.append((branch, *eval_models(args, branch)))
    # Restore whatever branch the user started on.
    os.system(f"git checkout {initial_branch}")

    print("")
    print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
    print(f"|--|{'--|' * len(args.models)}")
    for task in args.tasks:
        print(
            f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
        )
        for branch, branch_results, branch_runtime in runs:
            print(
                f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
            )
            print(
                f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
            )

    print("")
    print("|branch|runtime|%|")
    print("|--|--|--|")
    print(f"|{initial_branch}|{runtime:.1f}s|100%|")
    for branch, _, branch_runtime in runs:
        print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|")


if __name__ == "__main__":
    main()
setup.py
View file @
24e3e3fa
...
...
@@ -19,7 +19,7 @@ setuptools.setup(
"License :: OSI Approved :: MIT License"
,
"Operating System :: OS Independent"
,
],
python_requires
=
">=3.
6
"
,
python_requires
=
">=3.
9
"
,
install_requires
=
[
"accelerate>=0.18.0"
,
"datasets>=2.0.0"
,
...
...
@@ -47,5 +47,7 @@ setuptools.setup(
"promptsource"
:
[
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
],
"auto-gptq"
:
[
"auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"
],
"anthropic"
:
[
"anthropic"
],
},
)
Prev
1
…
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment