gaoqiong / lm-evaluation-harness · Commits

Commit e495e3a0, authored Jun 14, 2023 by gk

Merge branch 'master' into big-refactor-test

Parents: 6d355b85, 9d06c953

This commit changes 208 files; this page of the diff shows 8 changed files with 512 additions and 2 deletions (+512, -2).
Files shown on this page:

  results/xglm/xglm-7.5B/xglm-7.5B_xcopa_0-shot.json         +72   -0
  results/xglm/xglm-7.5B/xglm-7.5B_xnli_0-shot.json          +92   -0
  results/xglm/xglm-7.5B/xglm-7.5B_xstory_cloze_0-shot.json  +72   -0
  results/xglm/xglm-7.5B/xglm-7.5B_xwinograd_0-shot.json     +47   -0
  scripts/cost_estimate.py                                   +1    -1
  scripts/make_table_results.py                              +76   -0
  scripts/regression.py                                      +149  -0
  setup.py                                                   +3    -1
results/xglm/xglm-7.5B/xglm-7.5B_xcopa_0-shot.json (new file, mode 100644)
{
  "results": {
    "xcopa_et": {"acc": 0.612, "acc_stderr": 0.021814300984787635},
    "xcopa_th": {"acc": 0.594, "acc_stderr": 0.02198396209008634},
    "xcopa_qu": {"acc": 0.488, "acc_stderr": 0.02237662679792717},
    "xcopa_ta": {"acc": 0.544, "acc_stderr": 0.02229623834840705},
    "xcopa_zh": {"acc": 0.638, "acc_stderr": 0.0215136625275824},
    "xcopa_vi": {"acc": 0.702, "acc_stderr": 0.02047511809298897},
    "xcopa_sw": {"acc": 0.6, "acc_stderr": 0.021930844120728505},
    "xcopa_it": {"acc": 0.636, "acc_stderr": 0.021539170637317685},
    "xcopa_tr": {"acc": 0.584, "acc_stderr": 0.022064943313928848},
    "xcopa_id": {"acc": 0.694, "acc_stderr": 0.0206295699983454},
    "xcopa_ht": {"acc": 0.574, "acc_stderr": 0.022136577335085637}
  },
  "versions": {
    "xcopa_et": 0, "xcopa_th": 0, "xcopa_qu": 0, "xcopa_ta": 0,
    "xcopa_zh": 0, "xcopa_vi": 0, "xcopa_sw": 0, "xcopa_it": 0,
    "xcopa_tr": 0, "xcopa_id": 0, "xcopa_ht": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
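All four result files in this commit share the same schema: a "results" object with per-subtask metrics, a "versions" object, and the run "config". As an illustration, a minimal sketch of reading one back and printing each subtask's accuracy with its bootstrap standard error (assuming it is run from the repository root):

import json

# Load one of the result files added in this commit.
with open("results/xglm/xglm-7.5B/xglm-7.5B_xcopa_0-shot.json") as f:
    data = json.load(f)

# Each subtask reports "acc" plus a bootstrapped "acc_stderr".
for task, metrics in sorted(data["results"].items()):
    print(f"{task}: {metrics['acc']:.3f} ± {metrics['acc_stderr']:.3f}")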
results/xglm/xglm-7.5B/xglm-7.5B_xnli_0-shot.json (new file, mode 100644)
{
  "results": {
    "xnli_ar": {"acc": 0.3337325349301397, "acc_stderr": 0.00666266628252267},
    "xnli_bg": {"acc": 0.4489021956087824, "acc_stderr": 0.007027723874210379},
    "xnli_de": {"acc": 0.48982035928143713, "acc_stderr": 0.0070632481147059134},
    "xnli_el": {"acc": 0.40658682634730536, "acc_stderr": 0.006940323712177368},
    "xnli_en": {"acc": 0.5385229540918164, "acc_stderr": 0.0070437128985425335},
    "xnli_es": {"acc": 0.47704590818363274, "acc_stderr": 0.007057263845316342},
    "xnli_fr": {"acc": 0.4694610778443114, "acc_stderr": 0.007051522651006734},
    "xnli_hi": {"acc": 0.4720558882235529, "acc_stderr": 0.007053670508441103},
    "xnli_ru": {"acc": 0.46327345309381235, "acc_stderr": 0.007045628330322907},
    "xnli_sw": {"acc": 0.45828343313373254, "acc_stderr": 0.007040080446339805},
    "xnli_th": {"acc": 0.437125748502994, "acc_stderr": 0.007008633817895695},
    "xnli_tr": {"acc": 0.4626746506986028, "acc_stderr": 0.007045000071900887},
    "xnli_ur": {"acc": 0.42095808383233535, "acc_stderr": 0.006975878576227385},
    "xnli_vi": {"acc": 0.46327345309381235, "acc_stderr": 0.007045628330322896},
    "xnli_zh": {"acc": 0.3536926147704591, "acc_stderr": 0.006755492859492898}
  },
  "versions": {
    "xnli_ar": 0, "xnli_bg": 0, "xnli_de": 0, "xnli_el": 0, "xnli_en": 0,
    "xnli_es": 0, "xnli_fr": 0, "xnli_hi": 0, "xnli_ru": 0, "xnli_sw": 0,
    "xnli_th": 0, "xnli_tr": 0, "xnli_ur": 0, "xnli_vi": 0, "xnli_zh": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/xglm/xglm-7.5B/xglm-7.5B_xstory_cloze_0-shot.json (new file, mode 100644)
{
  "results": {
    "xstory_cloze_es": {"acc": 0.6406353408338848, "acc_stderr": 0.012347659802101675},
    "xstory_cloze_zh": {"acc": 0.5890138980807412, "acc_stderr": 0.012661578894368948},
    "xstory_cloze_sw": {"acc": 0.5929847782925215, "acc_stderr": 0.012642664836816926},
    "xstory_cloze_en": {"acc": 0.6982131039046989, "acc_stderr": 0.011812877848905303},
    "xstory_cloze_hi": {"acc": 0.5876902713434812, "acc_stderr": 0.012667694122397068},
    "xstory_cloze_ar": {"acc": 0.5618795499669094, "acc_stderr": 0.012768206616277757},
    "xstory_cloze_eu": {"acc": 0.5771012574454004, "acc_stderr": 0.0127132250091262},
    "xstory_cloze_id": {"acc": 0.6293845135671741, "acc_stderr": 0.012428861084065903},
    "xstory_cloze_ru": {"acc": 0.6353408338848445, "acc_stderr": 0.012386781532906161},
    "xstory_cloze_te": {"acc": 0.6022501654533422, "acc_stderr": 0.012595197856703525},
    "xstory_cloze_my": {"acc": 0.57114493712773, "acc_stderr": 0.01273620271314778}
  },
  "versions": {
    "xstory_cloze_es": 0, "xstory_cloze_zh": 0, "xstory_cloze_sw": 0,
    "xstory_cloze_en": 0, "xstory_cloze_hi": 0, "xstory_cloze_ar": 0,
    "xstory_cloze_eu": 0, "xstory_cloze_id": 0, "xstory_cloze_ru": 0,
    "xstory_cloze_te": 0, "xstory_cloze_my": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/xglm/xglm-7.5B/xglm-7.5B_xwinograd_0-shot.json (new file, mode 100644)
{
  "results": {
    "xwinograd_zh": {"acc": 0.7281746031746031, "acc_stderr": 0.01983712759311063},
    "xwinograd_ru": {"acc": 0.6317460317460317, "acc_stderr": 0.027219500732466696},
    "xwinograd_pt": {"acc": 0.6730038022813688, "acc_stderr": 0.028982074243683254},
    "xwinograd_en": {"acc": 0.7948387096774193, "acc_stderr": 0.008376626547826555},
    "xwinograd_jp": {"acc": 0.6496350364963503, "acc_stderr": 0.01541389159576608},
    "xwinograd_fr": {"acc": 0.6506024096385542, "acc_stderr": 0.05265151356440471}
  },
  "versions": {
    "xwinograd_zh": 0, "xwinograd_ru": 0, "xwinograd_pt": 0,
    "xwinograd_en": 0, "xwinograd_jp": 0, "xwinograd_fr": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/cost_estimate.py (modified)
@@ -26,7 +26,7 @@ class DryrunLM(LM):
     def greedy_until(self, requests):
         res = []
 
-        for ctx, until in requests:
+        for ctx, _ in requests:
             res.append("lol")
 
         # assume worst case - generates until 256
scripts/make_table_results.py (new file, mode 100644)
"""
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import
logging
from
lm_eval
import
tasks
from
pytablewriter
import
MarkdownTableWriter
,
LatexTableWriter
import
os
import
json
logging
.
basicConfig
(
level
=
logging
.
INFO
)
logger
=
logging
.
getLogger
(
__name__
)
def
make_table
(
result_dict
):
"""Generate table of results."""
md_writer
=
MarkdownTableWriter
()
latex_writer
=
LatexTableWriter
()
md_writer
.
headers
=
[
"Task"
,
"Version"
,
"Metric"
,
"Value"
,
""
,
"Stderr"
]
latex_writer
.
headers
=
[
"Task"
,
"Version"
,
"Metric"
,
"Value"
,
""
,
"Stderr"
]
values
=
[]
for
k
,
dic
in
sorted
(
result_dict
[
"results"
].
items
()):
version
=
result_dict
[
"versions"
][
k
]
percent
=
k
==
"squad2"
for
m
,
v
in
dic
.
items
():
if
m
.
endswith
(
"_stderr"
):
continue
if
m
+
"_stderr"
in
dic
:
se
=
dic
[
m
+
"_stderr"
]
if
percent
or
m
==
"ppl"
:
values
.
append
([
k
,
version
,
m
,
"%.2f"
%
v
,
"±"
,
"%.2f"
%
se
])
else
:
values
.
append
(
[
k
,
version
,
m
,
"%.2f"
%
(
v
*
100
),
"±"
,
"%.2f"
%
(
se
*
100
)]
)
else
:
if
percent
or
m
==
"ppl"
:
values
.
append
([
k
,
version
,
m
,
"%.2f"
%
v
,
""
,
""
])
else
:
values
.
append
([
k
,
version
,
m
,
"%.2f"
%
(
v
*
100
),
""
,
""
])
k
=
""
version
=
""
md_writer
.
value_matrix
=
values
latex_writer
.
value_matrix
=
values
# todo: make latex table look good
# print(latex_writer.dumps())
return
md_writer
.
dumps
()
if
__name__
==
"__main__"
:
task_names
=
tasks
.
ALL_TASKS
# loop dirs and subdirs in results dir
# for each dir, load json files
for
dirpath
,
dirnames
,
filenames
in
os
.
walk
(
"../results"
):
# skip dirs without files
if
not
filenames
:
continue
path_readme
=
os
.
path
.
join
(
dirpath
,
"README.md"
)
with
open
(
path_readme
,
"w"
)
as
f
:
# get path name, only last folder
path_name
=
dirpath
.
split
(
"/"
)[
-
1
]
f
.
write
(
f
"#
{
path_name
}
\n\n
"
)
for
filename
in
sorted
([
f
for
f
in
filenames
if
f
.
endswith
(
".json"
)]):
path
=
os
.
path
.
join
(
dirpath
,
filename
)
with
open
(
path
,
"r"
)
as
f
:
result_dict
=
json
.
load
(
f
)
with
open
(
path_readme
,
"a"
)
as
f
:
f
.
write
(
f
"##
{
filename
}
\n
"
)
f
.
write
(
f
"
{
make_table
(
result_dict
)
}
\n
"
)
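A quick way to exercise make_table on one of the result files above is a one-off like the following (a hypothetical sketch, assuming it is run from inside scripts/ so the module is importable and ../results points at the results tree in this commit):

import json

# Hypothetical import path; run from scripts/.
from make_table_results import make_table

with open("../results/xglm/xglm-7.5B/xglm-7.5B_xcopa_0-shot.json") as f:
    result_dict = json.load(f)

# Renders a Markdown table with Task/Version/Metric/Value/Stderr columns;
# accuracies are scaled to percentages inside make_table.
print(make_table(result_dict))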
scripts/regression.py (new file, mode 100644)
import argparse
import json
import os
import subprocess
import time
from pathlib import Path

from lm_eval import tasks, utils

seq2seq_models = ["google/flan-t5-small"]
causal_models = [
    "gpt2",
    "facebook/opt-125m",
    "EleutherAI/gpt-neo-125m",
    "EleutherAI/pythia-160m",
]
model_names = seq2seq_models + causal_models

completion_tasks = ["boolq", "lambada_openai", "winogrande"]
choice_tasks = ["hellaswag", "openbookqa", "piqa"]
perplexity_tasks = ["wikitext"]
generation_tasks = []
task_names = completion_tasks + choice_tasks + perplexity_tasks + generation_tasks


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--branches", default=[])
    parser.add_argument("--models", default=model_names)
    parser.add_argument("--tasks", default=task_names)
    parser.add_argument("--acc_norm", type=bool, default=False)
    parser.add_argument("--perplexity", default=None)
    # TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--limit", type=float, default=None)
    # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
    parser.add_argument("--model", default="hf-causal-experimental")
    # Use whatever is faster here
    parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True")
    parser.add_argument("--batch_size", default="auto")
    return parser.parse_args()


def eval_models(args, branch=None):
    if branch is not None:
        if os.system(f"git checkout {branch}") != 0:
            return {}, 0

    branch = branch or initial_branch

    start_time = time.time()

    results = {}

    for model in args.models:
        model_type = "hf-causal-experimental" if model in causal_models \
            else "hf-seq2seq" if model in seq2seq_models else args.model
        model_args = f"pretrained={model},{args.model_args}"
        # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
        tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
        # TODO: OOM with auto for seq2seq models, also can OOM with llama
        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
            else 64 if args.batch_size == "auto" else args.batch_size
        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"

        command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
                  f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
                  f"--batch_size {batch_size} --no_cache --output_path {output_path}"

        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")

        ret = os.system(command)

        results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}

    end_time = time.time()

    return results, end_time - start_time


def extract_value(args, results, model, task, err=False):
    if model not in results:
        return 0
    results = results[model]["results"]
    if task not in results:
        return 0
    results = results[task]
    if args.acc_norm and "acc_norm" in results:
        return results["acc_norm"] if not err else results["acc_norm_stderr"]
    if "acc" in results:
        return results["acc"] if not err else results["acc_stderr"]
    if (args.perplexity or "word_perplexity") in results:
        return results[args.perplexity or "word_perplexity"] if not err else 0
    return 0


def format_value(args, results, model, task):
    val = 100 * extract_value(args, results, model, task)
    err = 100 * extract_value(args, results, model, task, err=True)
    return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}"


def format_diff(args, results1, results2, model, task):
    val1 = 100 * extract_value(args, results1, model, task)
    val2 = 100 * extract_value(args, results2, model, task)
    diff = val2 - val1
    return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}"


def main():
    args = parse_args()

    args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
    args.models = args.models.split(",") if type(args.models) == str else args.models
    args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
        else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)

    global initial_branch
    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()

    # TODO: implement proper timing for each task
    # TODO: reduce IO by sharing tasks between models?

    results, runtime = eval_models(args)
    print(results, runtime)

    runs = []
    for branch in args.branches:
        runs.append((branch, *eval_models(args, branch)))

    os.system(f"git checkout {initial_branch}")

    print("")
    print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
    print(f"|--|{'--|' * len(args.models)}")
    for task in args.tasks:
        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
        for branch, branch_results, branch_runtime in runs:
            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")

    print("")
    print("|branch|runtime|%|")
    print("|--|--|--|")
    print(f"|{initial_branch}|{runtime:.1f}s|100%|")
    for branch, _, branch_runtime in runs:
        print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|")


if __name__ == "__main__":
    main()
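Given the flags defined in parse_args, a typical comparison run might look like the following (illustrative values only; the branch, model, and task lists are whatever you want to compare):

python3 scripts/regression.py --branches big-refactor-test --models gpt2,facebook/opt-125m --tasks boolq,piqa --limit 100

The script checks out each listed branch, shells out to main.py for every model, then prints Markdown tables comparing per-task scores and runtimes against the branch it started from.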
setup.py (modified)
@@ -19,7 +19,7 @@ setuptools.setup(
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
-    python_requires=">=3.6",
+    python_requires=">=3.9",
     install_requires=[
         "accelerate>=0.18.0",
         "datasets>=2.0.0",
@@ -47,5 +47,7 @@ setuptools.setup(
         "promptsource": [
             "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
         ],
+        "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
+        "anthropic": ["anthropic"],
     },
 )
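With these extras declared, the new optional backends can presumably be pulled into a local editable install with something like pip install -e '.[auto-gptq,anthropic]' (extras names taken from the keys added above).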