Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
e53eb332
Unverified
Commit
e53eb332
authored
May 19, 2023
by
Stella Biderman
Committed by
GitHub
May 19, 2023
Browse files
Merge pull request #477 from juletx/results
Add results of various models in json and md format
parents
d1327193
92a50856
Changes
189
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
595 additions
and
0 deletions
+595
-0
results/xglm/xglm-7.5B/xglm-7.5B_common_sense_reasoning_0-shot.json
...lm/xglm-7.5B/xglm-7.5B_common_sense_reasoning_0-shot.json
+91
-0
results/xglm/xglm-7.5B/xglm-7.5B_gsm8k_8-shot.json
results/xglm/xglm-7.5B/xglm-7.5B_gsm8k_8-shot.json
+22
-0
results/xglm/xglm-7.5B/xglm-7.5B_mathematical_reasoning_few_shot_5-shot.json
....5B/xglm-7.5B_mathematical_reasoning_few_shot_5-shot.json
+71
-0
results/xglm/xglm-7.5B/xglm-7.5B_pawsx_0-shot.json
results/xglm/xglm-7.5B/xglm-7.5B_pawsx_0-shot.json
+52
-0
results/xglm/xglm-7.5B/xglm-7.5B_xcopa_0-shot.json
results/xglm/xglm-7.5B/xglm-7.5B_xcopa_0-shot.json
+72
-0
results/xglm/xglm-7.5B/xglm-7.5B_xnli_0-shot.json
results/xglm/xglm-7.5B/xglm-7.5B_xnli_0-shot.json
+92
-0
results/xglm/xglm-7.5B/xglm-7.5B_xstory_cloze_0-shot.json
results/xglm/xglm-7.5B/xglm-7.5B_xstory_cloze_0-shot.json
+72
-0
results/xglm/xglm-7.5B/xglm-7.5B_xwinograd_0-shot.json
results/xglm/xglm-7.5B/xglm-7.5B_xwinograd_0-shot.json
+47
-0
scripts/make_table_results.py
scripts/make_table_results.py
+76
-0
No files found.
results/xglm/xglm-7.5B/xglm-7.5B_common_sense_reasoning_0-shot.json
0 → 100644
View file @
e53eb332
{
"results"
:
{
"openbookqa"
:
{
"acc"
:
0.254
,
"acc_stderr"
:
0.019486596801643385
,
"acc_norm"
:
0.358
,
"acc_norm_stderr"
:
0.02146143486285912
},
"winogrande"
:
{
"acc"
:
0.5785319652722968
,
"acc_stderr"
:
0.0138780723774976
},
"arc_easy"
:
{
"acc"
:
0.6237373737373737
,
"acc_stderr"
:
0.009940646221513789
,
"acc_norm"
:
0.5862794612794613
,
"acc_norm_stderr"
:
0.010105878530238135
},
"copa"
:
{
"acc"
:
0.79
,
"acc_stderr"
:
0.040936018074033256
},
"mc_taco"
:
{
"em"
:
0.13813813813813813
,
"f1"
:
0.479152974631639
},
"wsc273"
:
{
"acc"
:
0.7582417582417582
,
"acc_stderr"
:
0.02596031999685269
},
"hellaswag"
:
{
"acc"
:
0.45688109938259314
,
"acc_stderr"
:
0.004971192387202445
,
"acc_norm"
:
0.6123282214698267
,
"acc_norm_stderr"
:
0.004862232790041574
},
"boolq"
:
{
"acc"
:
0.6018348623853211
,
"acc_stderr"
:
0.008561755594317445
},
"swag"
:
{
"acc"
:
0.505148455463361
,
"acc_stderr"
:
0.003534904635576977
,
"acc_norm"
:
0.692292312306308
,
"acc_norm_stderr"
:
0.003263207195550976
},
"piqa"
:
{
"acc"
:
0.7393906420021763
,
"acc_stderr"
:
0.010241826155811627
,
"acc_norm"
:
0.749183895538629
,
"acc_norm_stderr"
:
0.010113869547069046
},
"prost"
:
{
"acc"
:
0.2588599487617421
,
"acc_stderr"
:
0.0032000423309913543
,
"acc_norm"
:
0.26361016225448336
,
"acc_norm_stderr"
:
0.0032189046983713983
},
"arc_challenge"
:
{
"acc"
:
0.28754266211604096
,
"acc_stderr"
:
0.013226719056266129
,
"acc_norm"
:
0.3191126279863481
,
"acc_norm_stderr"
:
0.013621696119173304
}
},
"versions"
:
{
"openbookqa"
:
0
,
"winogrande"
:
0
,
"arc_easy"
:
0
,
"copa"
:
0
,
"mc_taco"
:
0
,
"wsc273"
:
0
,
"hellaswag"
:
0
,
"boolq"
:
1
,
"swag"
:
0
,
"piqa"
:
0
,
"prost"
:
0
,
"arc_challenge"
:
0
},
"config"
:
{
"model"
:
"hf-causal-experimental"
,
"model_args"
:
"pretrained=facebook/xglm-7.5B,use_accelerate=True"
,
"num_fewshot"
:
0
,
"batch_size"
:
"auto"
,
"device"
:
"cuda:0"
,
"no_cache"
:
true
,
"limit"
:
null
,
"bootstrap_iters"
:
100000
,
"description_dict"
:
{}
}
}
results/xglm/xglm-7.5B/xglm-7.5B_gsm8k_8-shot.json
0 → 100644
View file @
e53eb332
{
"results"
:
{
"gsm8k"
:
{
"acc"
:
0.001516300227445034
,
"acc_stderr"
:
0.0010717793485492655
}
},
"versions"
:
{
"gsm8k"
:
0
},
"config"
:
{
"model"
:
"hf-causal-experimental"
,
"model_args"
:
"pretrained=facebook/xglm-7.5B,use_accelerate=True"
,
"num_fewshot"
:
8
,
"batch_size"
:
"auto"
,
"device"
:
"cuda"
,
"no_cache"
:
true
,
"limit"
:
null
,
"bootstrap_iters"
:
100000
,
"description_dict"
:
{}
}
}
results/xglm/xglm-7.5B/xglm-7.5B_mathematical_reasoning_few_shot_5-shot.json
0 → 100644
View file @
e53eb332
{
"results"
:
{
"math_num_theory"
:
{
"acc"
:
0.0
,
"acc_stderr"
:
0.0
},
"gsm8k"
:
{
"acc"
:
0.002274450341167551
,
"acc_stderr"
:
0.0013121578148674316
},
"math_geometry"
:
{
"acc"
:
0.0
,
"acc_stderr"
:
0.0
},
"drop"
:
{
"em"
:
0.05421560402684564
,
"em_stderr"
:
0.002318984649948223
,
"f1"
:
0.08962458053691245
,
"f1_stderr"
:
0.0026401926224488034
},
"math_prealgebra"
:
{
"acc"
:
0.0
,
"acc_stderr"
:
0.0
},
"math_counting_and_prob"
:
{
"acc"
:
0.0
,
"acc_stderr"
:
0.0
},
"math_precalc"
:
{
"acc"
:
0.0
,
"acc_stderr"
:
0.0
},
"math_intermediate_algebra"
:
{
"acc"
:
0.0
,
"acc_stderr"
:
0.0
},
"math_algebra"
:
{
"acc"
:
0.0
,
"acc_stderr"
:
0.0
},
"mathqa"
:
{
"acc"
:
0.23986599664991626
,
"acc_stderr"
:
0.007816818250028128
,
"acc_norm"
:
0.23517587939698492
,
"acc_norm_stderr"
:
0.0077638612776946255
}
},
"versions"
:
{
"math_num_theory"
:
1
,
"gsm8k"
:
0
,
"math_geometry"
:
1
,
"drop"
:
1
,
"math_prealgebra"
:
1
,
"math_counting_and_prob"
:
1
,
"math_precalc"
:
1
,
"math_intermediate_algebra"
:
1
,
"math_algebra"
:
1
,
"mathqa"
:
0
},
"config"
:
{
"model"
:
"hf-causal-experimental"
,
"model_args"
:
"pretrained=facebook/xglm-7.5B,use_accelerate=True"
,
"num_fewshot"
:
5
,
"batch_size"
:
"auto"
,
"device"
:
"cuda:0"
,
"no_cache"
:
true
,
"limit"
:
null
,
"bootstrap_iters"
:
100000
,
"description_dict"
:
{}
}
}
results/xglm/xglm-7.5B/xglm-7.5B_pawsx_0-shot.json
0 → 100644
View file @
e53eb332
{
"results"
:
{
"pawsx_en"
:
{
"acc"
:
0.5885
,
"acc_stderr"
:
0.011006563824537298
},
"pawsx_es"
:
{
"acc"
:
0.528
,
"acc_stderr"
:
0.011165587094621537
},
"pawsx_fr"
:
{
"acc"
:
0.518
,
"acc_stderr"
:
0.011175886999478619
},
"pawsx_zh"
:
{
"acc"
:
0.513
,
"acc_stderr"
:
0.01117935548207038
},
"pawsx_ja"
:
{
"acc"
:
0.52
,
"acc_stderr"
:
0.011174185930778312
},
"pawsx_de"
:
{
"acc"
:
0.559
,
"acc_stderr"
:
0.011105006104468736
},
"pawsx_ko"
:
{
"acc"
:
0.4595
,
"acc_stderr"
:
0.011146389370464362
}
},
"versions"
:
{
"pawsx_en"
:
0
,
"pawsx_es"
:
0
,
"pawsx_fr"
:
0
,
"pawsx_zh"
:
0
,
"pawsx_ja"
:
0
,
"pawsx_de"
:
0
,
"pawsx_ko"
:
0
},
"config"
:
{
"model"
:
"hf-causal-experimental"
,
"model_args"
:
"pretrained=facebook/xglm-7.5B,use_accelerate=True"
,
"num_fewshot"
:
0
,
"batch_size"
:
"auto"
,
"device"
:
"cuda"
,
"no_cache"
:
true
,
"limit"
:
null
,
"bootstrap_iters"
:
100000
,
"description_dict"
:
{}
}
}
results/xglm/xglm-7.5B/xglm-7.5B_xcopa_0-shot.json
0 → 100644
View file @
e53eb332
{
"results"
:
{
"xcopa_et"
:
{
"acc"
:
0.612
,
"acc_stderr"
:
0.021814300984787635
},
"xcopa_th"
:
{
"acc"
:
0.594
,
"acc_stderr"
:
0.02198396209008634
},
"xcopa_qu"
:
{
"acc"
:
0.488
,
"acc_stderr"
:
0.02237662679792717
},
"xcopa_ta"
:
{
"acc"
:
0.544
,
"acc_stderr"
:
0.02229623834840705
},
"xcopa_zh"
:
{
"acc"
:
0.638
,
"acc_stderr"
:
0.0215136625275824
},
"xcopa_vi"
:
{
"acc"
:
0.702
,
"acc_stderr"
:
0.02047511809298897
},
"xcopa_sw"
:
{
"acc"
:
0.6
,
"acc_stderr"
:
0.021930844120728505
},
"xcopa_it"
:
{
"acc"
:
0.636
,
"acc_stderr"
:
0.021539170637317685
},
"xcopa_tr"
:
{
"acc"
:
0.584
,
"acc_stderr"
:
0.022064943313928848
},
"xcopa_id"
:
{
"acc"
:
0.694
,
"acc_stderr"
:
0.0206295699983454
},
"xcopa_ht"
:
{
"acc"
:
0.574
,
"acc_stderr"
:
0.022136577335085637
}
},
"versions"
:
{
"xcopa_et"
:
0
,
"xcopa_th"
:
0
,
"xcopa_qu"
:
0
,
"xcopa_ta"
:
0
,
"xcopa_zh"
:
0
,
"xcopa_vi"
:
0
,
"xcopa_sw"
:
0
,
"xcopa_it"
:
0
,
"xcopa_tr"
:
0
,
"xcopa_id"
:
0
,
"xcopa_ht"
:
0
},
"config"
:
{
"model"
:
"hf-causal-experimental"
,
"model_args"
:
"pretrained=facebook/xglm-7.5B,use_accelerate=True"
,
"num_fewshot"
:
0
,
"batch_size"
:
"auto"
,
"device"
:
"cuda"
,
"no_cache"
:
true
,
"limit"
:
null
,
"bootstrap_iters"
:
100000
,
"description_dict"
:
{}
}
}
results/xglm/xglm-7.5B/xglm-7.5B_xnli_0-shot.json
0 → 100644
View file @
e53eb332
{
"results"
:
{
"xnli_ar"
:
{
"acc"
:
0.3337325349301397
,
"acc_stderr"
:
0.00666266628252267
},
"xnli_bg"
:
{
"acc"
:
0.4489021956087824
,
"acc_stderr"
:
0.007027723874210379
},
"xnli_de"
:
{
"acc"
:
0.48982035928143713
,
"acc_stderr"
:
0.0070632481147059134
},
"xnli_el"
:
{
"acc"
:
0.40658682634730536
,
"acc_stderr"
:
0.006940323712177368
},
"xnli_en"
:
{
"acc"
:
0.5385229540918164
,
"acc_stderr"
:
0.0070437128985425335
},
"xnli_es"
:
{
"acc"
:
0.47704590818363274
,
"acc_stderr"
:
0.007057263845316342
},
"xnli_fr"
:
{
"acc"
:
0.4694610778443114
,
"acc_stderr"
:
0.007051522651006734
},
"xnli_hi"
:
{
"acc"
:
0.4720558882235529
,
"acc_stderr"
:
0.007053670508441103
},
"xnli_ru"
:
{
"acc"
:
0.46327345309381235
,
"acc_stderr"
:
0.007045628330322907
},
"xnli_sw"
:
{
"acc"
:
0.45828343313373254
,
"acc_stderr"
:
0.007040080446339805
},
"xnli_th"
:
{
"acc"
:
0.437125748502994
,
"acc_stderr"
:
0.007008633817895695
},
"xnli_tr"
:
{
"acc"
:
0.4626746506986028
,
"acc_stderr"
:
0.007045000071900887
},
"xnli_ur"
:
{
"acc"
:
0.42095808383233535
,
"acc_stderr"
:
0.006975878576227385
},
"xnli_vi"
:
{
"acc"
:
0.46327345309381235
,
"acc_stderr"
:
0.007045628330322896
},
"xnli_zh"
:
{
"acc"
:
0.3536926147704591
,
"acc_stderr"
:
0.006755492859492898
}
},
"versions"
:
{
"xnli_ar"
:
0
,
"xnli_bg"
:
0
,
"xnli_de"
:
0
,
"xnli_el"
:
0
,
"xnli_en"
:
0
,
"xnli_es"
:
0
,
"xnli_fr"
:
0
,
"xnli_hi"
:
0
,
"xnli_ru"
:
0
,
"xnli_sw"
:
0
,
"xnli_th"
:
0
,
"xnli_tr"
:
0
,
"xnli_ur"
:
0
,
"xnli_vi"
:
0
,
"xnli_zh"
:
0
},
"config"
:
{
"model"
:
"hf-causal-experimental"
,
"model_args"
:
"pretrained=facebook/xglm-7.5B,use_accelerate=True"
,
"num_fewshot"
:
0
,
"batch_size"
:
"auto"
,
"device"
:
"cuda"
,
"no_cache"
:
true
,
"limit"
:
null
,
"bootstrap_iters"
:
100000
,
"description_dict"
:
{}
}
}
results/xglm/xglm-7.5B/xglm-7.5B_xstory_cloze_0-shot.json
0 → 100644
View file @
e53eb332
{
"results"
:
{
"xstory_cloze_es"
:
{
"acc"
:
0.6406353408338848
,
"acc_stderr"
:
0.012347659802101675
},
"xstory_cloze_zh"
:
{
"acc"
:
0.5890138980807412
,
"acc_stderr"
:
0.012661578894368948
},
"xstory_cloze_sw"
:
{
"acc"
:
0.5929847782925215
,
"acc_stderr"
:
0.012642664836816926
},
"xstory_cloze_en"
:
{
"acc"
:
0.6982131039046989
,
"acc_stderr"
:
0.011812877848905303
},
"xstory_cloze_hi"
:
{
"acc"
:
0.5876902713434812
,
"acc_stderr"
:
0.012667694122397068
},
"xstory_cloze_ar"
:
{
"acc"
:
0.5618795499669094
,
"acc_stderr"
:
0.012768206616277757
},
"xstory_cloze_eu"
:
{
"acc"
:
0.5771012574454004
,
"acc_stderr"
:
0.0127132250091262
},
"xstory_cloze_id"
:
{
"acc"
:
0.6293845135671741
,
"acc_stderr"
:
0.012428861084065903
},
"xstory_cloze_ru"
:
{
"acc"
:
0.6353408338848445
,
"acc_stderr"
:
0.012386781532906161
},
"xstory_cloze_te"
:
{
"acc"
:
0.6022501654533422
,
"acc_stderr"
:
0.012595197856703525
},
"xstory_cloze_my"
:
{
"acc"
:
0.57114493712773
,
"acc_stderr"
:
0.01273620271314778
}
},
"versions"
:
{
"xstory_cloze_es"
:
0
,
"xstory_cloze_zh"
:
0
,
"xstory_cloze_sw"
:
0
,
"xstory_cloze_en"
:
0
,
"xstory_cloze_hi"
:
0
,
"xstory_cloze_ar"
:
0
,
"xstory_cloze_eu"
:
0
,
"xstory_cloze_id"
:
0
,
"xstory_cloze_ru"
:
0
,
"xstory_cloze_te"
:
0
,
"xstory_cloze_my"
:
0
},
"config"
:
{
"model"
:
"hf-causal-experimental"
,
"model_args"
:
"pretrained=facebook/xglm-7.5B,use_accelerate=True"
,
"num_fewshot"
:
0
,
"batch_size"
:
"auto"
,
"device"
:
"cuda"
,
"no_cache"
:
true
,
"limit"
:
null
,
"bootstrap_iters"
:
100000
,
"description_dict"
:
{}
}
}
results/xglm/xglm-7.5B/xglm-7.5B_xwinograd_0-shot.json
0 → 100644
View file @
e53eb332
{
"results"
:
{
"xwinograd_zh"
:
{
"acc"
:
0.7281746031746031
,
"acc_stderr"
:
0.01983712759311063
},
"xwinograd_ru"
:
{
"acc"
:
0.6317460317460317
,
"acc_stderr"
:
0.027219500732466696
},
"xwinograd_pt"
:
{
"acc"
:
0.6730038022813688
,
"acc_stderr"
:
0.028982074243683254
},
"xwinograd_en"
:
{
"acc"
:
0.7948387096774193
,
"acc_stderr"
:
0.008376626547826555
},
"xwinograd_jp"
:
{
"acc"
:
0.6496350364963503
,
"acc_stderr"
:
0.01541389159576608
},
"xwinograd_fr"
:
{
"acc"
:
0.6506024096385542
,
"acc_stderr"
:
0.05265151356440471
}
},
"versions"
:
{
"xwinograd_zh"
:
0
,
"xwinograd_ru"
:
0
,
"xwinograd_pt"
:
0
,
"xwinograd_en"
:
0
,
"xwinograd_jp"
:
0
,
"xwinograd_fr"
:
0
},
"config"
:
{
"model"
:
"hf-causal-experimental"
,
"model_args"
:
"pretrained=facebook/xglm-7.5B,use_accelerate=True"
,
"num_fewshot"
:
0
,
"batch_size"
:
"auto"
,
"device"
:
"cuda"
,
"no_cache"
:
true
,
"limit"
:
null
,
"bootstrap_iters"
:
100000
,
"description_dict"
:
{}
}
}
scripts/make_table_results.py
0 → 100644
View file @
e53eb332
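"""
Usage:
   python make_table_results.py

Run it from the scripts/ directory: the script walks ../results and writes a
README.md containing one Markdown table per JSON result file into every
directory that holds files. It takes no command-line arguments.
"""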
import
logging
from
lm_eval
import
tasks
from
pytablewriter
import
MarkdownTableWriter
,
LatexTableWriter
import
os
import
json
# Configure the root logger so that INFO-level (and more severe) messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def make_table(result_dict):
    """Render one evaluation result dictionary as a Markdown table.

    Args:
        result_dict: Mapping with a "results" entry (task name -> metric
            name -> value) and a "versions" entry (task name -> version).

    Returns:
        The Markdown table produced by ``MarkdownTableWriter.dumps()``.

    Metrics whose name ends in "_stderr" are not given a row of their own;
    they are shown in the "Stderr" column of the metric they belong to.
    Values are multiplied by 100 before formatting, except for the "squad2"
    task and the "ppl" metric, which are printed unscaled. Only the first
    row of each task carries the task name and version.
    """
    column_names = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    # Each writer receives its own list object, as two separate literals would.
    md_writer.headers = list(column_names)
    latex_writer.headers = list(column_names)

    rows = []
    for task, metrics in sorted(result_dict["results"].items()):
        task_cell = task
        version_cell = result_dict["versions"][task]
        unscaled_task = task == "squad2"

        for metric, value in metrics.items():
            if metric.endswith("_stderr"):
                continue

            # "squad2" results and perplexities are shown as-is; every other
            # value is converted to a percentage.
            keep_raw = unscaled_task or metric == "ppl"
            shown_value = "%.2f" % (value if keep_raw else value * 100)

            stderr_key = metric + "_stderr"
            if stderr_key in metrics:
                stderr = metrics[stderr_key]
                separator = "±"
                shown_stderr = "%.2f" % (stderr if keep_raw else stderr * 100)
            else:
                separator = ""
                shown_stderr = ""

            rows.append(
                [task_cell, version_cell, metric, shown_value, separator, shown_stderr]
            )

            # Blank the task/version cells for the remaining rows of this task.
            task_cell = ""
            version_cell = ""

    md_writer.value_matrix = rows
    latex_writer.value_matrix = rows

    # TODO: make the LaTeX table look good before emitting it
    # (e.g. via latex_writer.dumps()).

    return md_writer.dumps()
if __name__ == "__main__":
    # Walk the results tree (the path is relative to the current working
    # directory) and regenerate one README.md per directory that holds files.
    for dirpath, dirnames, filenames in os.walk("../results"):
        # Skip directories that contain no files at all.
        if not filenames:
            continue

        path_readme = os.path.join(dirpath, "README.md")
        json_filenames = sorted(
            name for name in filenames if name.endswith(".json")
        )

        # The tables contain the non-ASCII "±" sign, so the encoding is set
        # explicitly instead of relying on the platform/locale default.
        # The README is opened once and rewritten from scratch.
        with open(path_readme, "w", encoding="utf-8") as readme:
            # Title is the last path component; basename copes with whatever
            # separator os.walk uses on the current platform.
            path_name = os.path.basename(dirpath)
            readme.write(f"# {path_name} \n\n")

            for filename in json_filenames:
                path = os.path.join(dirpath, filename)
                with open(path, "r", encoding="utf-8") as result_file:
                    result_dict = json.load(result_file)

                # One section per result file: heading, then its table.
                readme.write(f"## {filename} \n")
                readme.write(f"{make_table(result_dict)} \n")
Prev
1
…
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment