gaoqiong / lm-evaluation-harness · Commits

Commit b7c3580a
authored Jun 16, 2023 by lintangsutawika

reformatted

parent 86db4a4e

Showing 6 changed files with 100 additions and 35 deletions (+100, -35):
- README.md (+1, -1)
- lm_eval/evaluator.py (+12, -3)
- lm_eval/models/anthropic_llms.py (+5, -2)
- lm_eval/utils.py (+2, -2)
- main.py (+16, -6)
- scripts/regression.py (+64, -21)
README.md
lm_eval/evaluator.py

@@ -88,7 +88,12 @@ def simple_evaluate(
         if model_args is None:
             model_args = ""
-        lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
-        )
+        lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
+            model_args,
+            {
+                "batch_size": batch_size,
+                "max_batch_size": max_batch_size,
+                "device": device,
+            },
+        )
     else:
         assert isinstance(model, lm_eval.api.model.LM)

@@ -112,11 +117,15 @@ def simple_evaluate(
     if lm.rank == 0:
         # add info about the model and few shot config
         results["config"] = {
-            "model": model if isinstance(model, str) else model.model.config._name_or_path,
+            "model": model
+            if isinstance(model, str)
+            else model.model.config._name_or_path,
             "model_args": model_args,
             "num_fewshot": num_fewshot,
             "batch_size": batch_size,
-            "batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
+            "batch_sizes": list(lm.batch_sizes.values())
+            if hasattr(lm, "batch_sizes")
+            else [],
             "device": device,
             "no_cache": no_cache,
             "limit": limit,

...
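The hunk above touches the library's main entry point, so a usage sketch may help orient readers. Keyword names mirror the diff; the full signature, the defaults, and the `tasks` parameter are assumptions, since the commit elides them.

    # Hedged sketch: names beyond those visible in the hunk are assumptions.
    import lm_eval.evaluator

    results = lm_eval.evaluator.simple_evaluate(
        model="hf-causal",             # registry key resolved by get_model()
        model_args="pretrained=gpt2",  # parsed by create_from_arg_string()
        tasks=["lambada_openai"],      # assumed task-list parameter
        num_fewshot=0,
        batch_size=1,
        max_batch_size=None,
        device="cpu",
        limit=10,
    )
    # `model` was passed as a string, so per the isinstance branch above,
    # results["config"]["model"] holds that same string.
    print(results["config"]["model"])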
lm_eval/models/anthropic_llms.py

@@ -4,7 +4,9 @@ from tqdm import tqdm
 import time


-def anthropic_completion(client, model, prompt, max_tokens_to_sample, temperature, stop):
+def anthropic_completion(
+    client, model, prompt, max_tokens_to_sample, temperature, stop
+):
     """Query Anthropic API for completion.

     Retry with back-off until they respond

@@ -46,8 +48,9 @@ class AnthropicLM(BaseLM):
         """
         super().__init__()

         import anthropic

         self.model = model
-        self.client = anthropic.Client(os.environ['ANTHROPIC_API_KEY'])
+        self.client = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])

     @property
     def eot_token_id(self):

...
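The docstring promises "retry with back-off", but the function body is elided from this diff. As an illustration only, one common shape for that pattern (`retry_with_backoff` is a hypothetical helper, not the harness's code):

    import time

    def retry_with_backoff(fn, max_attempts=5, base_delay=1.0):
        """Call fn(); on failure sleep base_delay * 2**attempt and retry."""
        for attempt in range(max_attempts):
            try:
                return fn()
            except Exception:
                if attempt == max_attempts - 1:
                    raise  # out of attempts, surface the error
                time.sleep(base_delay * 2**attempt)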
lm_eval/utils.py

@@ -168,8 +168,8 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
         window_end = predicted + window_pred_len

         yield (
-            token_list[window_end - max_seq_len - 1: window_end - 1],
-            token_list[window_end - window_pred_len: window_end],
+            token_list[window_end - max_seq_len - 1 : window_end - 1],
+            token_list[window_end - window_pred_len : window_end],
         )

         predicted += window_pred_len

...
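Only slice spacing changes here, but the yielded pair is worth spelling out: it is the (context, prediction) split visible in the hunk. A toy walk-through with illustrative values (not taken from the repository's tests):

    from lm_eval.utils import get_rolling_token_windows

    tokens = list(range(10))        # stand-in for a tokenized document
    for context, prediction in get_rolling_token_windows(
        token_list=tokens,
        prefix_token=-1,            # assumed BOS/EOT-style sentinel
        max_seq_len=4,
        context_len=1,
    ):
        # prediction windows tile the document so each token is scored
        # exactly once; contexts overlap as needed up to max_seq_len
        print(context, prediction)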
main.py

@@ -17,17 +17,27 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
     parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS)))
+    parser.add_argument(
+        "--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS))
+    )
     parser.add_argument("--config", default=None)
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--max_batch_size", type=int, default=None,
-                        help="Maximal batch size to try with --batch_size auto")
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
     parser.add_argument("--device", type=str, default=None)
     parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=float, default=None,
-                        help="Limit the number of examples per task. "
-                             "If <1, limit is a percentage of the total number of examples.")
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
     parser.add_argument("--data_sampling", type=float, default=None)
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--decontamination_ngrams_path", default=None)

...
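Given the parser above, an invocation takes this shape (model, task, and output path are placeholders; per the help string, a --limit below 1 is read as a fraction of each task's examples):

    python3 main.py \
        --model hf-causal \
        --model_args pretrained=gpt2 \
        --tasks lambada_openai \
        --num_fewshot 0 \
        --batch_size 1 \
        --limit 0.1 \
        --no_cache \
        --output_path data/gpt2.json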
scripts/regression.py

@@ -10,7 +10,12 @@ from lm_eval.api.registry import ALL_TASKS
 seq2seq_models = ["google/flan-t5-small"]
-causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
+causal_models = [
+    "gpt2",
+    "facebook/opt-125m",
+    "EleutherAI/gpt-neo-125m",
+    "EleutherAI/pythia-160m",
+]
 model_names = seq2seq_models + causal_models

...
@@ -51,22 +56,41 @@ def eval_models(args, branch=None):
     results = {}

     for model in args.models:
-        model_type = "hf-causal" if model in causal_models \
-            else "hf-seq2seq" if model in seq2seq_models else args.model
+        model_type = (
+            "hf-causal"
+            if model in causal_models
+            else "hf-seq2seq"
+            if model in seq2seq_models
+            else args.model
+        )
         model_args = f"pretrained={model},{args.model_args}"
         # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
-        tasks = args.tasks if model in causal_models or model_type == "hf-causal" \
-            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        tasks = (
+            args.tasks
+            if model in causal_models or model_type == "hf-causal"
+            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        )
         # TODO: OOM with auto for seq2seq models, also can OOM with llama
-        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal" \
-            else 64 if args.batch_size == "auto" else args.batch_size
+        batch_size = (
+            args.batch_size
+            if model in causal_models or model_type == "hf-causal"
+            else 64
+            if args.batch_size == "auto"
+            else args.batch_size
+        )
-        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
+        output_path = (
+            f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
+        )
-        command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
-            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
-            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
+        command = (
+            f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
+            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
+            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
+        )
-        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
+        print(
+            f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
+        )
         ret = os.system(command)

...
@@ -89,7 +113,9 @@ def extract_value(args, results, model, task, err=False):
     if "acc,none" in results:
         return results["acc,none"] if not err else results["acc_stderr,none"]
     if (args.perplexity or "word_perplexity") + ",none" in results:
-        return results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0
+        return (
+            results[(args.perplexity or "word_perplexity") + ",none"]
+            if not err
+            else 0
+        )
     return 0

...
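Stripped of formatting, extract_value reduces to a metric-priority lookup: accuracy first, then the (word) perplexity key, else 0. A minimal sketch with a hypothetical helper name, assuming args.perplexity is None:

    def pick_metric(results: dict, err: bool = False) -> float:
        """Mirror of extract_value's lookup order for one task's results."""
        if "acc,none" in results:
            return results["acc,none"] if not err else results["acc_stderr,none"]
        if "word_perplexity,none" in results:
            return results["word_perplexity,none"] if not err else 0
        return 0

    assert pick_metric({"acc,none": 0.75, "acc_stderr,none": 0.02}) == 0.75
    assert pick_metric({"word_perplexity,none": 12.3}, err=True) == 0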
@@ -109,13 +135,24 @@ def format_diff(args, results1, results2, model, task):
 def main():
     args = parse_args()
-    args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
+    args.branches = (
+        args.branches.split(",") if type(args.branches) == str else args.branches
+    )
     args.models = args.models.split(",") if type(args.models) == str else args.models
-    args.tasks = ALL_TASKS if args.tasks == "all_tasks" \
-        else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if type(args.tasks) == str else args.tasks
+    args.tasks = (
+        ALL_TASKS
+        if args.tasks == "all_tasks"
+        else utils.pattern_match(args.tasks.split(","), ALL_TASKS)
+        if type(args.tasks) == str
+        else args.tasks
+    )

     global initial_branch
-    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
+    initial_branch = (
+        subprocess.check_output("git branch --show-current", shell=True)
+        .decode("ascii")
+        .strip()
+    )

     # TODO: implement proper timing for each task
     # TODO: reduce IO by sharing tasks between models?

...
@@ -133,10 +170,16 @@ def main():
     print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
     print(f"|--|{'--|' * len(args.models)}")
     for task in args.tasks:
-        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
+        print(
+            f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
+        )
         for branch, branch_results, branch_runtime in runs:
-            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
-            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
+            print(
+                f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
+            )
+            print(
+                f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
+            )

     print("")
     print("|branch|runtime|%|")

...
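The prints above emit a Markdown results table, one row per task and branch plus a diff row. A plausible invocation, hedged because parse_args is not shown in this diff (flag names are inferred from the args attributes used above):

    python3 scripts/regression.py \
        --branches big-refactor \
        --models gpt2,EleutherAI/pythia-160m \
        --tasks lambada_openai \
        --batch_size auto \
        --limit 10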