gaoqiong / lm-evaluation-harness · Commits

Commit 21e128d8, authored May 06, 2023 by Julen Etxaniz
Parent: 0542d35d

    add bloom, xglm and llama results

Changes: 141 files in the commit; this page shows 20 changed files with 1783 additions and 0 deletions (+1783, -0).
results/bloom/bloom-560m/bloom-560m_xstory_cloze_0-shot.json  +72 -0
results/bloom/bloom-560m/bloom-560m_xwinograd_0-shot.json  +47 -0
results/bloom/bloom-7b1/bloom-7b1_bbh_3-shot.json  +124 -0
results/bloom/bloom-7b1/bloom-7b1_common_sense_reasoning_0-shot.json  +91 -0
results/bloom/bloom-7b1/bloom-7b1_gsm8k_8-shot.json  +22 -0
results/bloom/bloom-7b1/bloom-7b1_mathematical_reasoning_few_shot_5-shot.json  +71 -0
results/bloom/bloom-7b1/bloom-7b1_pawsx_0-shot.json  +52 -0
results/bloom/bloom-7b1/bloom-7b1_question_answering_0-shot.json  +66 -0
results/bloom/bloom-7b1/bloom-7b1_reading_comprehension_0-shot.json  +36 -0
results/bloom/bloom-7b1/bloom-7b1_xcopa_0-shot.json  +72 -0
results/bloom/bloom-7b1/bloom-7b1_xnli_0-shot.json  +92 -0
results/bloom/bloom-7b1/bloom-7b1_xstory_cloze_0-shot.json  +72 -0
results/bloom/bloom-7b1/bloom-7b1_xwinograd_0-shot.json  +47 -0
results/llama/llama-13B/llama-13B_arithmetic_5-shot.json  +67 -0
results/llama/llama-13B/llama-13B_bbh_3-shot.json  +124 -0
results/llama/llama-13B/llama-13B_blimp_0-shot.json  +352 -0
results/llama/llama-13B/llama-13B_common_sense_reasoning_0-shot.json  +91 -0
results/llama/llama-13B/llama-13B_glue_0-shot.json  +66 -0
results/llama/llama-13B/llama-13B_gsm8k_8-shot.json  +22 -0
results/llama/llama-13B/llama-13B_human_alignment_0-shot.json  +197 -0
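Each of these files is the JSON that lm-evaluation-harness writes at the end of a run, and the "config" block inside each file records the arguments that produced it. As a rough sketch only (not taken from this commit), the equivalent call through the evaluator API of the harness version current at the time (the legacy v0.3-era `evaluator.simple_evaluate`) would look like the following; the keyword arguments simply mirror a config block below, and the task list shown is a hypothetical subset:

import json
from lm_eval import evaluator

# Assumed legacy-API sketch: simple_evaluate() returns the same dict that the
# harness dumps to disk ("results", "versions", "config"). Kwargs mirror the
# "config" blocks in these files; the task subset here is hypothetical.
results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    model_args="pretrained=bigscience/bloom-560m",
    tasks=["xstory_cloze_en", "xstory_cloze_es"],
    num_fewshot=0,
    batch_size="auto",
    device="cuda",
    no_cache=True,
)

with open("results/bloom/bloom-560m/bloom-560m_xstory_cloze_0-shot.json", "w") as f:
    json.dump(results, f, indent=2)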
results/bloom/bloom-560m/bloom-560m_xstory_cloze_0-shot.json (new file, mode 100644)

{
  "results": {
    "xstory_cloze_es": {"acc": 0.5585704831237591, "acc_stderr": 0.012778538985880637},
    "xstory_cloze_hi": {"acc": 0.5499669093315684, "acc_stderr": 0.01280271359821983},
    "xstory_cloze_eu": {"acc": 0.5360688285903376, "acc_stderr": 0.012833602406620015},
    "xstory_cloze_ar": {"acc": 0.5208471211118465, "acc_stderr": 0.012855936282881267},
    "xstory_cloze_zh": {"acc": 0.5453342157511581, "acc_stderr": 0.012814127367359424},
    "xstory_cloze_te": {"acc": 0.557246856386499, "acc_stderr": 0.012782510750319236},
    "xstory_cloze_sw": {"acc": 0.4983454665784249, "acc_stderr": 0.012867054869163334},
    "xstory_cloze_ru": {"acc": 0.49172733289212445, "acc_stderr": 0.012865364020375405},
    "xstory_cloze_my": {"acc": 0.47187293183322304, "acc_stderr": 0.012846749995797694},
    "xstory_cloze_en": {"acc": 0.6121773659827928, "acc_stderr": 0.012539110696551456},
    "xstory_cloze_id": {"acc": 0.5552614162806089, "acc_stderr": 0.01278829597020778}
  },
  "versions": {"xstory_cloze_es": 0, "xstory_cloze_hi": 0, "xstory_cloze_eu": 0, "xstory_cloze_ar": 0, "xstory_cloze_zh": 0, "xstory_cloze_te": 0, "xstory_cloze_sw": 0, "xstory_cloze_ru": 0, "xstory_cloze_my": 0, "xstory_cloze_en": 0, "xstory_cloze_id": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
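Reading one of these files back needs only the standard library. A minimal sketch that prints each language's accuracy with its bootstrap standard error for the file above:

import json

# Path is a file added in this commit; "acc"/"acc_stderr" are the metric
# names used throughout these xstory_cloze results.
with open("results/bloom/bloom-560m/bloom-560m_xstory_cloze_0-shot.json") as f:
    data = json.load(f)

for task, metrics in sorted(data["results"].items()):
    print(f"{task}: acc = {metrics['acc']:.4f} +/- {metrics['acc_stderr']:.4f}")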
results/bloom/bloom-560m/bloom-560m_xwinograd_0-shot.json (new file, mode 100644)

{
  "results": {
    "xwinograd_en": {"acc": 0.6589247311827957, "acc_stderr": 0.009833881195698878},
    "xwinograd_pt": {"acc": 0.6007604562737643, "acc_stderr": 0.03025636835693898},
    "xwinograd_ru": {"acc": 0.49206349206349204, "acc_stderr": 0.028213077547815057},
    "xwinograd_fr": {"acc": 0.6024096385542169, "acc_stderr": 0.054045178247868114},
    "xwinograd_jp": {"acc": 0.529718456725756, "acc_stderr": 0.01612570703179889},
    "xwinograd_zh": {"acc": 0.6765873015873016, "acc_stderr": 0.020857221952855685}
  },
  "versions": {"xwinograd_en": 0, "xwinograd_pt": 0, "xwinograd_ru": 0, "xwinograd_fr": 0, "xwinograd_jp": 0, "xwinograd_zh": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_bbh_3-shot.json (new file, mode 100644)

{
  "results": {
    "bigbench_disambiguation_qa": {"multiple_choice_grade": 0.26356589147286824, "multiple_choice_grade_stderr": 0.027481788262218698},
    "bigbench_logical_deduction_three_objects": {"multiple_choice_grade": 0.37, "multiple_choice_grade_stderr": 0.027921294063982},
    "bigbench_causal_judgement": {"multiple_choice_grade": 0.5210526315789473, "multiple_choice_grade_stderr": 0.03633739504773335},
    "bigbench_date_understanding": {"multiple_choice_grade": 0.36585365853658536, "multiple_choice_grade_stderr": 0.025108717905729792},
    "bigbench_navigate": {"multiple_choice_grade": 0.499, "multiple_choice_grade_stderr": 0.015819268290576817},
    "bigbench_salient_translation_error_detection": {"multiple_choice_grade": 0.19138276553106212, "multiple_choice_grade_stderr": 0.012458774650265594},
    "bigbench_temporal_sequences": {"multiple_choice_grade": 0.248, "multiple_choice_grade_stderr": 0.013663187134877651},
    "bigbench_tracking_shuffled_objects_seven_objects": {"multiple_choice_grade": 0.14, "multiple_choice_grade_stderr": 0.00829694743648913},
    "bigbench_ruin_names": {"multiple_choice_grade": 0.34375, "multiple_choice_grade_stderr": 0.02246478414865448},
    "bigbench_reasoning_about_colored_objects": {"multiple_choice_grade": 0.2485, "multiple_choice_grade_stderr": 0.009665432493822852},
    "bigbench_dyck_languages": {"multiple_choice_grade": 0.144, "multiple_choice_grade_stderr": 0.01110798754893915},
    "bigbench_logical_deduction_five_objects": {"multiple_choice_grade": 0.26, "multiple_choice_grade_stderr": 0.019635965529725512},
    "bigbench_sports_understanding": {"multiple_choice_grade": 0.5030425963488844, "multiple_choice_grade_stderr": 0.015931029729145698},
    "bigbench_tracking_shuffled_objects_three_objects": {"multiple_choice_grade": 0.37, "multiple_choice_grade_stderr": 0.027921294063982},
    "bigbench_geometric_shapes": {"multiple_choice_grade": 0.20055710306406685, "multiple_choice_grade_stderr": 0.021162707757982353, "exact_str_match": 0.0, "exact_str_match_stderr": 0.0},
    "bigbench_hyperbaton": {"multiple_choice_grade": 0.48618, "multiple_choice_grade_stderr": 0.0022352360227943418},
    "bigbench_logical_deduction_seven_objects": {"multiple_choice_grade": 0.19142857142857142, "multiple_choice_grade_stderr": 0.014880721436998012},
    "bigbench_snarks": {"multiple_choice_grade": 0.4972375690607735, "multiple_choice_grade_stderr": 0.037267230837657574},
    "bigbench_formal_fallacies_syllogisms_negation": {"multiple_choice_grade": 0.5005633802816901, "multiple_choice_grade_stderr": 0.004196051878850066},
    "bigbench_tracking_shuffled_objects_five_objects": {"multiple_choice_grade": 0.184, "multiple_choice_grade_stderr": 0.010964094540602657},
    "bigbench_movie_recommendation": {"multiple_choice_grade": 0.264, "multiple_choice_grade_stderr": 0.019732885585922087}
  },
  "versions": {"bigbench_disambiguation_qa": 0, "bigbench_logical_deduction_three_objects": 0, "bigbench_causal_judgement": 0, "bigbench_date_understanding": 0, "bigbench_navigate": 0, "bigbench_salient_translation_error_detection": 0, "bigbench_temporal_sequences": 0, "bigbench_tracking_shuffled_objects_seven_objects": 0, "bigbench_ruin_names": 0, "bigbench_reasoning_about_colored_objects": 0, "bigbench_dyck_languages": 0, "bigbench_logical_deduction_five_objects": 0, "bigbench_sports_understanding": 0, "bigbench_tracking_shuffled_objects_three_objects": 0, "bigbench_geometric_shapes": 0, "bigbench_hyperbaton": 0, "bigbench_logical_deduction_seven_objects": 0, "bigbench_snarks": 0, "bigbench_formal_fallacies_syllogisms_negation": 0, "bigbench_tracking_shuffled_objects_five_objects": 0, "bigbench_movie_recommendation": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True", "num_fewshot": 3, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_common_sense_reasoning_0-shot.json (new file, mode 100644)

{
  "results": {
    "copa": {"acc": 0.72, "acc_stderr": 0.04512608598542127},
    "winogrande": {"acc": 0.6432517758484609, "acc_stderr": 0.013463393958028726},
    "piqa": {"acc": 0.7274211099020674, "acc_stderr": 0.010389256803296021, "acc_norm": 0.7366702937976061, "acc_norm_stderr": 0.010276185322196764},
    "arc_challenge": {"acc": 0.3037542662116041, "acc_stderr": 0.013438909184778757, "acc_norm": 0.33532423208191126, "acc_norm_stderr": 0.013796182947785564},
    "arc_easy": {"acc": 0.6494107744107744, "acc_stderr": 0.009791003829831557, "acc_norm": 0.5732323232323232, "acc_norm_stderr": 0.010149141043955626},
    "boolq": {"acc": 0.6287461773700306, "acc_stderr": 0.008450174658715903},
    "wsc273": {"acc": 0.8131868131868132, "acc_stderr": 0.023632761722644544},
    "openbookqa": {"acc": 0.252, "acc_stderr": 0.019435727282249536, "acc_norm": 0.358, "acc_norm_stderr": 0.021461434862859122},
    "prost": {"acc": 0.26184884713919726, "acc_stderr": 0.003211967450351038, "acc_norm": 0.30572160546541416, "acc_norm_stderr": 0.003365914208405272},
    "mc_taco": {"em": 0.13588588588588588, "f1": 0.5052611696967991},
    "hellaswag": {"acc": 0.4623580959968134, "acc_stderr": 0.0049756211474061025, "acc_norm": 0.5967934674367655, "acc_norm_stderr": 0.0048953903414456264},
    "swag": {"acc": 0.5024992502249325, "acc_stderr": 0.0035350478846161142, "acc_norm": 0.6825952214335699, "acc_norm_stderr": 0.0032909332559412758}
  },
  "versions": {"copa": 0, "winogrande": 0, "piqa": 0, "arc_challenge": 0, "arc_easy": 0, "boolq": 1, "wsc273": 0, "openbookqa": 0, "prost": 0, "mc_taco": 0, "hellaswag": 0, "swag": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_gsm8k_8-shot.json (new file, mode 100644)

{
  "results": {
    "gsm8k": {"acc": 0.018953752843062926, "acc_stderr": 0.0037560783410314704}
  },
  "versions": {"gsm8k": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True", "num_fewshot": 8, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_mathematical_reasoning_few_shot_5-shot.json (new file, mode 100644)

{
  "results": {
    "mathqa": {"acc": 0.26566164154103855, "acc_stderr": 0.008085616216226046, "acc_norm": 0.26532663316582916, "acc_norm_stderr": 0.008082359462649721},
    "math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0},
    "drop": {"em": 0.02506291946308725, "em_stderr": 0.0016008246934367681, "f1": 0.05092911073825512, "f1_stderr": 0.0017766603696206904},
    "math_precalc": {"acc": 0.0, "acc_stderr": 0.0},
    "math_geometry": {"acc": 0.0, "acc_stderr": 0.0},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0},
    "math_num_theory": {"acc": 0.0, "acc_stderr": 0.0},
    "math_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {"mathqa": 0, "math_prealgebra": 1, "drop": 1, "math_precalc": 1, "math_geometry": 1, "gsm8k": 0, "math_counting_and_prob": 1, "math_num_theory": 1, "math_algebra": 1, "math_intermediate_algebra": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True", "num_fewshot": 5, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_pawsx_0-shot.json (new file, mode 100644)

{
  "results": {
    "pawsx_zh": {"acc": 0.4735, "acc_stderr": 0.011167418260963935},
    "pawsx_de": {"acc": 0.5285, "acc_stderr": 0.011164954236428803},
    "pawsx_en": {"acc": 0.613, "acc_stderr": 0.010893798117218195},
    "pawsx_ko": {"acc": 0.451, "acc_stderr": 0.01112930504188632},
    "pawsx_fr": {"acc": 0.509, "acc_stderr": 0.011181324206260283},
    "pawsx_es": {"acc": 0.5935, "acc_stderr": 0.010985864536294245},
    "pawsx_ja": {"acc": 0.4545, "acc_stderr": 0.01113673598700373}
  },
  "versions": {"pawsx_zh": 0, "pawsx_de": 0, "pawsx_en": 0, "pawsx_ko": 0, "pawsx_fr": 0, "pawsx_es": 0, "pawsx_ja": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_question_answering_0-shot.json (new file, mode 100644)

{
  "results": {
    "webqs": {"acc": 0.022637795275590553, "acc_stderr": 0.0033005770276179373},
    "headqa_en": {"acc": 0.31181619256017507, "acc_stderr": 0.008848039223989218, "acc_norm": 0.35557986870897157, "acc_norm_stderr": 0.009143208309033068},
    "squad2": {"exact": 7.816053230017687, "f1": 12.640343596838946, "HasAns_exact": 14.84480431848853, "HasAns_f1": 24.507219892926596, "NoAns_exact": 0.8074011774600505, "NoAns_f1": 0.8074011774600505, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081},
    "truthfulqa_mc": {"mc1": 0.22399020807833536, "mc1_stderr": 0.014594964329474202, "mc2": 0.38898018897492265, "mc2_stderr": 0.014014176010735629},
    "triviaqa": {"acc": 0.055246176964554056, "acc_stderr": 0.0021480319949071717},
    "headqa_es": {"acc": 0.29540481400437635, "acc_stderr": 0.008714131357853837, "acc_norm": 0.34318016046681254, "acc_norm_stderr": 0.009068379779817705},
    "logiqa": {"acc": 0.20276497695852536, "acc_stderr": 0.015770046635584564, "acc_norm": 0.28110599078341014, "acc_norm_stderr": 0.017632374626460005}
  },
  "versions": {"webqs": 0, "headqa_en": 0, "squad2": 1, "truthfulqa_mc": 1, "triviaqa": 1, "headqa_es": 0, "logiqa": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_reading_comprehension_0-shot.json (new file, mode 100644)

{
  "results": {
    "coqa": {"f1": 0.6882976860781418, "f1_stderr": 0.016322647326969194, "em": 0.5386666666666665, "em_stderr": 0.01995482540089559},
    "drop": {"em": 0.02569211409395973, "em_stderr": 0.0016202710827118362, "f1": 0.09853712248322138, "f1_stderr": 0.0021424507419289577},
    "race": {"acc": 0.36555023923444974, "acc_stderr": 0.014904654247182307}
  },
  "versions": {"coqa": 1, "race": 1, "drop": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_xcopa_0-shot.json (new file, mode 100644)

{
  "results": {
    "xcopa_ta": {"acc": 0.592, "acc_stderr": 0.02200091089387719},
    "xcopa_id": {"acc": 0.698, "acc_stderr": 0.02055326917420918},
    "xcopa_tr": {"acc": 0.512, "acc_stderr": 0.02237662679792717},
    "xcopa_th": {"acc": 0.554, "acc_stderr": 0.022252153078595897},
    "xcopa_ht": {"acc": 0.508, "acc_stderr": 0.022380208834928035},
    "xcopa_qu": {"acc": 0.508, "acc_stderr": 0.022380208834928035},
    "xcopa_sw": {"acc": 0.516, "acc_stderr": 0.0223716109825804},
    "xcopa_it": {"acc": 0.528, "acc_stderr": 0.022347949832668086},
    "xcopa_zh": {"acc": 0.652, "acc_stderr": 0.021323728632807498},
    "xcopa_et": {"acc": 0.482, "acc_stderr": 0.02236856511738799},
    "xcopa_vi": {"acc": 0.708, "acc_stderr": 0.02035437548053008}
  },
  "versions": {"xcopa_ta": 0, "xcopa_id": 0, "xcopa_tr": 0, "xcopa_th": 0, "xcopa_ht": 0, "xcopa_qu": 0, "xcopa_sw": 0, "xcopa_it": 0, "xcopa_zh": 0, "xcopa_et": 0, "xcopa_vi": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_xnli_0-shot.json (new file, mode 100644)

{
  "results": {
    "xnli_ar": {"acc": 0.3383233532934132, "acc_stderr": 0.006685184166851475},
    "xnli_bg": {"acc": 0.3970059880239521, "acc_stderr": 0.006913206227417221},
    "xnli_de": {"acc": 0.39860279441117763, "acc_stderr": 0.0069179171504068675},
    "xnli_el": {"acc": 0.35748502994011977, "acc_stderr": 0.006771658365506411},
    "xnli_en": {"acc": 0.539121756487026, "acc_stderr": 0.007043053978003474},
    "xnli_es": {"acc": 0.4870259481037924, "acc_stderr": 0.007062333678954121},
    "xnli_fr": {"acc": 0.49680638722554893, "acc_stderr": 0.00706456831954508},
    "xnli_hi": {"acc": 0.46506986027944114, "acc_stderr": 0.007047451825220883},
    "xnli_ru": {"acc": 0.4305389221556886, "acc_stderr": 0.006996208063220089},
    "xnli_sw": {"acc": 0.37924151696606784, "acc_stderr": 0.006855572898852684},
    "xnli_th": {"acc": 0.3499001996007984, "acc_stderr": 0.00673886250800537},
    "xnli_tr": {"acc": 0.3508982035928144, "acc_stderr": 0.00674328417575373},
    "xnli_ur": {"acc": 0.42095808383233535, "acc_stderr": 0.006975878576227378},
    "xnli_vi": {"acc": 0.47045908183632734, "acc_stderr": 0.007052371383794704},
    "xnli_zh": {"acc": 0.35429141716566864, "acc_stderr": 0.006758076124936785}
  },
  "versions": {"xnli_ar": 0, "xnli_bg": 0, "xnli_de": 0, "xnli_el": 0, "xnli_en": 0, "xnli_es": 0, "xnli_fr": 0, "xnli_hi": 0, "xnli_ru": 0, "xnli_sw": 0, "xnli_th": 0, "xnli_tr": 0, "xnli_ur": 0, "xnli_vi": 0, "xnli_zh": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_xstory_cloze_0-shot.json (new file, mode 100644)

{
  "results": {
    "xstory_cloze_ru": {"acc": 0.5274652547981469, "acc_stderr": 0.012847698270388222},
    "xstory_cloze_eu": {"acc": 0.57180675049636, "acc_stderr": 0.012733742799515155},
    "xstory_cloze_en": {"acc": 0.7074784910655195, "acc_stderr": 0.011707038572975033},
    "xstory_cloze_ar": {"acc": 0.585704831237591, "acc_stderr": 0.012676689821720669},
    "xstory_cloze_es": {"acc": 0.6611515552614163, "acc_stderr": 0.012180490758739058},
    "xstory_cloze_hi": {"acc": 0.6055592322964924, "acc_stderr": 0.01257710651393614},
    "xstory_cloze_my": {"acc": 0.48974189278623426, "acc_stderr": 0.012864417047980468},
    "xstory_cloze_sw": {"acc": 0.5393778954334878, "acc_stderr": 0.012827159238891916},
    "xstory_cloze_zh": {"acc": 0.6187954996690933, "acc_stderr": 0.01249867885093408},
    "xstory_cloze_id": {"acc": 0.6446062210456651, "acc_stderr": 0.01231724793041837},
    "xstory_cloze_te": {"acc": 0.5744540039708802, "acc_stderr": 0.012723670419166324}
  },
  "versions": {"xstory_cloze_ru": 0, "xstory_cloze_eu": 0, "xstory_cloze_en": 0, "xstory_cloze_ar": 0, "xstory_cloze_es": 0, "xstory_cloze_hi": 0, "xstory_cloze_my": 0, "xstory_cloze_sw": 0, "xstory_cloze_zh": 0, "xstory_cloze_id": 0, "xstory_cloze_te": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/bloom-7b1_xwinograd_0-shot.json (new file, mode 100644)

{
  "results": {
    "xwinograd_fr": {"acc": 0.7108433734939759, "acc_stderr": 0.050066428050419186},
    "xwinograd_ru": {"acc": 0.5682539682539682, "acc_stderr": 0.027952495861671634},
    "xwinograd_en": {"acc": 0.821505376344086, "acc_stderr": 0.00794327709606643},
    "xwinograd_pt": {"acc": 0.7680608365019012, "acc_stderr": 0.026075593860304693},
    "xwinograd_jp": {"acc": 0.5849843587069864, "acc_stderr": 0.015919213413834392},
    "xwinograd_zh": {"acc": 0.7440476190476191, "acc_stderr": 0.019457899684028012}
  },
  "versions": {"xwinograd_fr": 0, "xwinograd_ru": 0, "xwinograd_en": 0, "xwinograd_pt": 0, "xwinograd_jp": 0, "xwinograd_zh": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_arithmetic_5-shot.json (new file, mode 100644)

{
  "results": {
    "arithmetic_2dm": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_5ds": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_4da": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_3da": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_4ds": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_5da": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_2da": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_1dc": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_2ds": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {"arithmetic_2dm": 0, "arithmetic_5ds": 0, "arithmetic_4da": 0, "arithmetic_3da": 0, "arithmetic_4ds": 0, "arithmetic_5da": 0, "arithmetic_2da": 0, "arithmetic_3ds": 0, "arithmetic_1dc": 0, "arithmetic_2ds": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 5, "batch_size": "auto", "device": "cuda:0", "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_bbh_3-shot.json (new file, mode 100644)

{
  "results": {
    "bigbench_sports_understanding": {"multiple_choice_grade": 0.5811359026369168, "multiple_choice_grade_stderr": 0.015720172474974117},
    "bigbench_salient_translation_error_detection": {"multiple_choice_grade": 0.1933867735470942, "multiple_choice_grade_stderr": 0.012508305339715512},
    "bigbench_date_understanding": {"multiple_choice_grade": 0.6395663956639567, "multiple_choice_grade_stderr": 0.025028311208714224},
    "bigbench_navigate": {"multiple_choice_grade": 0.517, "multiple_choice_grade_stderr": 0.015810153729833434},
    "bigbench_dyck_languages": {"multiple_choice_grade": 0.201, "multiple_choice_grade_stderr": 0.012679107214617324},
    "bigbench_movie_recommendation": {"multiple_choice_grade": 0.436, "multiple_choice_grade_stderr": 0.022198954641476802},
    "bigbench_snarks": {"multiple_choice_grade": 0.4696132596685083, "multiple_choice_grade_stderr": 0.03719891321680327},
    "bigbench_disambiguation_qa": {"multiple_choice_grade": 0.4573643410852713, "multiple_choice_grade_stderr": 0.03107554499047266},
    "bigbench_reasoning_about_colored_objects": {"multiple_choice_grade": 0.3705, "multiple_choice_grade_stderr": 0.010801537464907349},
    "bigbench_geometric_shapes": {"multiple_choice_grade": 0.23119777158774374, "multiple_choice_grade_stderr": 0.02228217728550543, "exact_str_match": 0.0, "exact_str_match_stderr": 0.0},
    "bigbench_tracking_shuffled_objects_five_objects": {"multiple_choice_grade": 0.2144, "multiple_choice_grade_stderr": 0.011612665292522431},
    "bigbench_formal_fallacies_syllogisms_negation": {"multiple_choice_grade": 0.5113380281690141, "multiple_choice_grade_stderr": 0.004194975590734721},
    "bigbench_tracking_shuffled_objects_three_objects": {"multiple_choice_grade": 0.4166666666666667, "multiple_choice_grade_stderr": 0.028511310643917567},
    "bigbench_hyperbaton": {"multiple_choice_grade": 0.5038, "multiple_choice_grade_stderr": 0.0022360257592931206},
    "bigbench_temporal_sequences": {"multiple_choice_grade": 0.28, "multiple_choice_grade_stderr": 0.014205696104091493},
    "bigbench_logical_deduction_three_objects": {"multiple_choice_grade": 0.4166666666666667, "multiple_choice_grade_stderr": 0.028511310643917567},
    "bigbench_causal_judgement": {"multiple_choice_grade": 0.49473684210526314, "multiple_choice_grade_stderr": 0.036367633377878836},
    "bigbench_tracking_shuffled_objects_seven_objects": {"multiple_choice_grade": 0.14457142857142857, "multiple_choice_grade_stderr": 0.008408881015830339},
    "bigbench_logical_deduction_seven_objects": {"multiple_choice_grade": 0.22285714285714286, "multiple_choice_grade_stderr": 0.015740739118727993},
    "bigbench_logical_deduction_five_objects": {"multiple_choice_grade": 0.3, "multiple_choice_grade_stderr": 0.020514426225628046},
    "bigbench_ruin_names": {"multiple_choice_grade": 0.34598214285714285, "multiple_choice_grade_stderr": 0.02249924183068251}
  },
  "versions": {"bigbench_sports_understanding": 0, "bigbench_salient_translation_error_detection": 0, "bigbench_date_understanding": 0, "bigbench_navigate": 0, "bigbench_dyck_languages": 0, "bigbench_movie_recommendation": 0, "bigbench_snarks": 0, "bigbench_disambiguation_qa": 0, "bigbench_reasoning_about_colored_objects": 0, "bigbench_geometric_shapes": 0, "bigbench_tracking_shuffled_objects_five_objects": 0, "bigbench_formal_fallacies_syllogisms_negation": 0, "bigbench_tracking_shuffled_objects_three_objects": 0, "bigbench_hyperbaton": 0, "bigbench_temporal_sequences": 0, "bigbench_logical_deduction_three_objects": 0, "bigbench_causal_judgement": 0, "bigbench_tracking_shuffled_objects_seven_objects": 0, "bigbench_logical_deduction_seven_objects": 0, "bigbench_logical_deduction_five_objects": 0, "bigbench_ruin_names": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 3, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_blimp_0-shot.json (new file, mode 100644)

{
  "results": {
    "blimp_anaphor_gender_agreement": {"acc": 0.576, "acc_stderr": 0.015635487471405186},
    "blimp_wh_questions_subject_gap_long_distance": {"acc": 0.408, "acc_stderr": 0.01554920505292068},
    "blimp_expletive_it_object_raising": {"acc": 0.619, "acc_stderr": 0.015364734787007436},
    "blimp_npi_present_2": {"acc": 0.39, "acc_stderr": 0.015431725053866606},
    "blimp_sentential_negation_npi_licensor_present": {"acc": 0.392, "acc_stderr": 0.015445859463771302},
    "blimp_wh_vs_that_no_gap": {"acc": 0.196, "acc_stderr": 0.012559527926707371},
    "blimp_wh_vs_that_with_gap": {"acc": 0.847, "acc_stderr": 0.011389500459665546},
    "blimp_passive_2": {"acc": 0.526, "acc_stderr": 0.01579789775804277},
    "blimp_drop_argument": {"acc": 0.705, "acc_stderr": 0.014428554438445524},
    "blimp_irregular_plural_subject_verb_agreement_2": {"acc": 0.504, "acc_stderr": 0.015818793703510893},
    "blimp_adjunct_island": {"acc": 0.338, "acc_stderr": 0.014965960710224489},
    "blimp_transitive": {"acc": 0.473, "acc_stderr": 0.015796218551302615},
    "blimp_irregular_plural_subject_verb_agreement_1": {"acc": 0.518, "acc_stderr": 0.015809045699406728},
    "blimp_animate_subject_passive": {"acc": 0.651, "acc_stderr": 0.015080663991563098},
    "blimp_determiner_noun_agreement_1": {"acc": 0.341, "acc_stderr": 0.014998131348402706},
    "blimp_wh_island": {"acc": 0.506, "acc_stderr": 0.015818160898606715},
    "blimp_intransitive": {"acc": 0.643, "acc_stderr": 0.015158521721486769},
    "blimp_left_branch_island_simple_question": {"acc": 0.411, "acc_stderr": 0.015566673418599276},
    "blimp_irregular_past_participle_verbs": {"acc": 0.314, "acc_stderr": 0.01468399195108796},
    "blimp_principle_A_case_2": {"acc": 0.443, "acc_stderr": 0.0157161699532041},
    "blimp_principle_A_domain_3": {"acc": 0.563, "acc_stderr": 0.015693223928730377},
    "blimp_sentential_subject_island": {"acc": 0.621, "acc_stderr": 0.01534909100222535},
    "blimp_tough_vs_raising_1": {"acc": 0.361, "acc_stderr": 0.015195720118175127},
    "blimp_principle_A_c_command": {"acc": 0.326, "acc_stderr": 0.014830507204541042},
    "blimp_wh_vs_that_no_gap_long_distance": {"acc": 0.301, "acc_stderr": 0.014512395033543147},
    "blimp_irregular_past_participle_adjectives": {"acc": 0.636, "acc_stderr": 0.015222868840522019},
    "blimp_complex_NP_island": {"acc": 0.303, "acc_stderr": 0.014539683710535264},
    "blimp_only_npi_licensor_present": {"acc": 0.731, "acc_stderr": 0.014029819522568198},
    "blimp_wh_questions_subject_gap": {"acc": 0.369, "acc_stderr": 0.015266698139154617},
    "blimp_coordinate_structure_constraint_object_extraction": {"acc": 0.279, "acc_stderr": 0.014190150117612037},
    "blimp_determiner_noun_agreement_2": {"acc": 0.361, "acc_stderr": 0.015195720118175115},
    "blimp_ellipsis_n_bar_2": {"acc": 0.264, "acc_stderr": 0.01394627184944048},
    "blimp_only_npi_scope": {"acc": 0.278, "acc_stderr": 0.014174516461485247},
    "blimp_determiner_noun_agreement_with_adj_irregular_1": {"acc": 0.342, "acc_stderr": 0.015008706182121728},
    "blimp_existential_there_object_raising": {"acc": 0.69, "acc_stderr": 0.014632638658632902},
    "blimp_superlative_quantifiers_1": {"acc": 0.522, "acc_stderr": 0.015803979428161957},
    "blimp_distractor_agreement_relational_noun": {"acc": 0.514, "acc_stderr": 0.015813097547730987},
    "blimp_wh_vs_that_with_gap_long_distance": {"acc": 0.692, "acc_stderr": 0.014606483127342761},
    "blimp_determiner_noun_agreement_with_adj_2": {"acc": 0.392, "acc_stderr": 0.015445859463771295},
    "blimp_principle_A_domain_1": {"acc": 0.324, "acc_stderr": 0.01480686473373886},
    "blimp_distractor_agreement_relative_clause": {"acc": 0.423, "acc_stderr": 0.015630589090476345},
    "blimp_inchoative": {"acc": 0.474, "acc_stderr": 0.015797897758042766},
    "blimp_superlative_quantifiers_2": {"acc": 0.714, "acc_stderr": 0.01429714686251791},
    "blimp_tough_vs_raising_2": {"acc": 0.642, "acc_stderr": 0.015167928865407557},
    "blimp_principle_A_domain_2": {"acc": 0.74, "acc_stderr": 0.013877773329774166},
    "blimp_determiner_noun_agreement_irregular_2": {"acc": 0.369, "acc_stderr": 0.015266698139154614},
    "blimp_animate_subject_trans": {"acc": 0.616, "acc_stderr": 0.015387682761897071},
    "blimp_ellipsis_n_bar_1": {"acc": 0.624, "acc_stderr": 0.015325105508898134},
    "blimp_existential_there_quantifiers_1": {"acc": 0.308, "acc_stderr": 0.014606483127342763},
    "blimp_regular_plural_subject_verb_agreement_1": {"acc": 0.56, "acc_stderr": 0.01570498795436179},
    "blimp_wh_questions_object_gap": {"acc": 0.455, "acc_stderr": 0.01575510149834709},
    "blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.393, "acc_stderr": 0.015452824654081496},
    "blimp_sentential_negation_npi_scope": {"acc": 0.638, "acc_stderr": 0.015204840912919498},
    "blimp_principle_A_case_1": {"acc": 0.028, "acc_stderr": 0.005219506034410047},
    "blimp_existential_there_subject_raising": {"acc": 0.701, "acc_stderr": 0.014484778521220482},
    "blimp_causative": {"acc": 0.359, "acc_stderr": 0.015177264224798597},
    "blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.391, "acc_stderr": 0.015438826294681783},
    "blimp_coordinate_structure_constraint_complex_left_branch": {"acc": 0.345, "acc_stderr": 0.015039986742055238},
    "blimp_passive_1": {"acc": 0.529, "acc_stderr": 0.015792669451628896},
    "blimp_npi_present_1": {"acc": 0.304, "acc_stderr": 0.014553205687950424},
    "blimp_left_branch_island_echo_question": {"acc": 0.49, "acc_stderr": 0.015816135752773207},
    "blimp_existential_there_quantifiers_2": {"acc": 0.788, "acc_stderr": 0.012931481864938041},
    "blimp_regular_plural_subject_verb_agreement_2": {"acc": 0.456, "acc_stderr": 0.01575792855397917},
    "blimp_principle_A_reconstruction": {"acc": 0.792, "acc_stderr": 0.012841374572096921},
    "blimp_determiner_noun_agreement_irregular_1": {"acc": 0.356, "acc_stderr": 0.015149042659306628},
    "blimp_matrix_question_npi_licensor_present": {"acc": 0.548, "acc_stderr": 0.01574623586588068},
    "blimp_anaphor_number_agreement": {"acc": 0.565, "acc_stderr": 0.0156850572527172}
  },
  "versions": {"blimp_anaphor_gender_agreement": 0, "blimp_wh_questions_subject_gap_long_distance": 0, "blimp_expletive_it_object_raising": 0, "blimp_npi_present_2": 0, "blimp_sentential_negation_npi_licensor_present": 0, "blimp_wh_vs_that_no_gap": 0, "blimp_wh_vs_that_with_gap": 0, "blimp_passive_2": 0, "blimp_drop_argument": 0, "blimp_irregular_plural_subject_verb_agreement_2": 0,
    "blimp_adjunct_island": 0, "blimp_transitive": 0, "blimp_irregular_plural_subject_verb_agreement_1": 0, "blimp_animate_subject_passive": 0, "blimp_determiner_noun_agreement_1": 0, "blimp_wh_island": 0, "blimp_intransitive": 0, "blimp_left_branch_island_simple_question": 0, "blimp_irregular_past_participle_verbs": 0, "blimp_principle_A_case_2": 0,
    "blimp_principle_A_domain_3": 0, "blimp_sentential_subject_island": 0, "blimp_tough_vs_raising_1": 0, "blimp_principle_A_c_command": 0, "blimp_wh_vs_that_no_gap_long_distance": 0, "blimp_irregular_past_participle_adjectives": 0, "blimp_complex_NP_island": 0, "blimp_only_npi_licensor_present": 0, "blimp_wh_questions_subject_gap": 0, "blimp_coordinate_structure_constraint_object_extraction": 0,
    "blimp_determiner_noun_agreement_2": 0, "blimp_ellipsis_n_bar_2": 0, "blimp_only_npi_scope": 0, "blimp_determiner_noun_agreement_with_adj_irregular_1": 0, "blimp_existential_there_object_raising": 0, "blimp_superlative_quantifiers_1": 0, "blimp_distractor_agreement_relational_noun": 0, "blimp_wh_vs_that_with_gap_long_distance": 0, "blimp_determiner_noun_agreement_with_adj_2": 0, "blimp_principle_A_domain_1": 0,
    "blimp_distractor_agreement_relative_clause": 0, "blimp_inchoative": 0, "blimp_superlative_quantifiers_2": 0, "blimp_tough_vs_raising_2": 0, "blimp_principle_A_domain_2": 0, "blimp_determiner_noun_agreement_irregular_2": 0, "blimp_animate_subject_trans": 0, "blimp_ellipsis_n_bar_1": 0, "blimp_existential_there_quantifiers_1": 0, "blimp_regular_plural_subject_verb_agreement_1": 0,
    "blimp_wh_questions_object_gap": 0, "blimp_determiner_noun_agreement_with_adj_irregular_2": 0, "blimp_sentential_negation_npi_scope": 0, "blimp_principle_A_case_1": 0, "blimp_existential_there_subject_raising": 0, "blimp_causative": 0, "blimp_determiner_noun_agreement_with_adjective_1": 0, "blimp_coordinate_structure_constraint_complex_left_branch": 0, "blimp_passive_1": 0, "blimp_npi_present_1": 0,
    "blimp_left_branch_island_echo_question": 0, "blimp_existential_there_quantifiers_2": 0, "blimp_regular_plural_subject_verb_agreement_2": 0, "blimp_principle_A_reconstruction": 0, "blimp_determiner_noun_agreement_irregular_1": 0, "blimp_matrix_question_npi_licensor_present": 0, "blimp_anaphor_number_agreement": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_common_sense_reasoning_0-shot.json (new file, mode 100644)

{
  "results": {
    "arc_challenge": {"acc": 0.439419795221843, "acc_stderr": 0.014503747823580122, "acc_norm": 0.4462457337883959, "acc_norm_stderr": 0.014526705548539982},
    "prost": {"acc": 0.2688941076003416, "acc_stderr": 0.0032393206239968247, "acc_norm": 0.3052412467976089, "acc_norm_stderr": 0.003364432149066356},
    "swag": {"acc": 0.5673298010596821, "acc_stderr": 0.003502894135944166, "acc_norm": 0.6934919524142757, "acc_norm_stderr": 0.0032596605453371346},
    "arc_easy": {"acc": 0.7457912457912458, "acc_stderr": 0.008934537681141528, "acc_norm": 0.5989057239057239, "acc_norm_stderr": 0.010057051106534378},
    "boolq": {"acc": 0.6850152905198776, "acc_stderr": 0.00812432724981665},
    "wsc273": {"acc": 0.8608058608058609, "acc_stderr": 0.020988366070851},
    "mc_taco": {"em": 0.10960960960960961, "f1": 0.4753174430074593},
    "piqa": {"acc": 0.7883569096844396, "acc_stderr": 0.009530351270479397, "acc_norm": 0.7910772578890098, "acc_norm_stderr": 0.009485227030105093},
    "hellaswag": {"acc": 0.5910177255526787, "acc_stderr": 0.004906411984476791, "acc_norm": 0.7623979286994622, "acc_norm_stderr": 0.004247442237702478},
    "winogrande": {"acc": 0.7016574585635359, "acc_stderr": 0.012858885010030434},
    "copa": {"acc": 0.9, "acc_stderr": 0.030151134457776348},
    "openbookqa": {"acc": 0.306, "acc_stderr": 0.020629569998345403, "acc_norm": 0.422, "acc_norm_stderr": 0.022109039310618552}
  },
  "versions": {"arc_challenge": 0, "prost": 0, "swag": 0, "arc_easy": 0, "boolq": 1, "wsc273": 0, "mc_taco": 0, "piqa": 0, "hellaswag": 0, "winogrande": 0, "copa": 0, "openbookqa": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_glue_0-shot.json (new file, mode 100644)

{
  "results": {
    "mnli_mismatched": {"acc": 0.45351912123677784, "acc_stderr": 0.005020956265665617},
    "wnli": {"acc": 0.4647887323943662, "acc_stderr": 0.0596130578497224},
    "sst": {"acc": 0.6536697247706422, "acc_stderr": 0.01612186710508361},
    "cola": {"mcc": 0.0, "mcc_stderr": 0.0},
    "mnli": {"acc": 0.43555781966377993, "acc_stderr": 0.005005063722742048},
    "qnli": {"acc": 0.4995423759838916, "acc_stderr": 0.006765407718154766},
    "mrpc": {"acc": 0.6862745098039216, "acc_stderr": 0.022999936277943434, "f1": 0.8134110787172011, "f1_stderr": 0.01621238238910757},
    "rte": {"acc": 0.6534296028880866, "acc_stderr": 0.02864445699455754},
    "qqp": {"acc": 0.3679198614889933, "acc_stderr": 0.0023983700314094665, "f1": 0.5365853658536586, "f1_stderr": 0.0025607085094365924}
  },
  "versions": {"mnli_mismatched": 0, "wnli": 1, "sst": 0, "cola": 0, "mnli": 0, "qnli": 0, "mrpc": 0, "rte": 0, "qqp": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_gsm8k_8-shot.json (new file, mode 100644)

{
  "results": {
    "gsm8k": {"acc": 0.13570887035633056, "acc_stderr": 0.009433577908567345}
  },
  "versions": {"gsm8k": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 8, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_human_alignment_0-shot.json (new file, mode 100644)

{
  "results": {
    "crows_pairs_english_autre": {"likelihood_difference": 11.426136363636363, "likelihood_difference_stderr": 4.353329595154678, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.15212000482437738},
    "crows_pairs_french_age": {"likelihood_difference": 13.10138888888889, "likelihood_difference_stderr": 1.1200506582638412, "pct_stereotype": 0.3888888888888889, "pct_stereotype_stderr": 0.05167468693203863},
    "crows_pairs_french_disability": {"likelihood_difference": 17.323863636363637, "likelihood_difference_stderr": 1.824043354324447, "pct_stereotype": 0.4090909090909091, "pct_stereotype_stderr": 0.060983672113630656},
    "ethics_utilitarianism": {"acc": 0.5245424292845258, "acc_stderr": 0.007202929002919329},
    "ethics_deontology": {"acc": 0.503337041156841, "acc_stderr": 0.008338940677034744, "em": 0.0011123470522803114},
    "ethics_cm": {"acc": 0.5173745173745173, "acc_stderr": 0.008018036537975452},
    "crows_pairs_english_age": {"likelihood_difference": 7.710164835164835, "likelihood_difference_stderr": 0.936561657229967, "pct_stereotype": 0.5604395604395604, "pct_stereotype_stderr": 0.0523181569856619},
    "crows_pairs_french_autre": {"likelihood_difference": 9.942307692307692, "likelihood_difference_stderr": 3.1484255128649896, "pct_stereotype": 0.5384615384615384, "pct_stereotype_stderr": 0.14390989949130545},
    "crows_pairs_english_gender": {"likelihood_difference": 8.67578125, "likelihood_difference_stderr": 0.6549450667276699, "pct_stereotype": 0.584375, "pct_stereotype_stderr": 0.02759315140230172},
    "crows_pairs_french_physical_appearance": {"likelihood_difference": 12.57986111111111, "likelihood_difference_stderr": 1.5739147906459943, "pct_stereotype": 0.5277777777777778, "pct_stereotype_stderr": 0.05924743948371486},
    "crows_pairs_english_religion": {"likelihood_difference": 11.8125, "likelihood_difference_stderr": 1.175150775782876, "pct_stereotype": 0.4954954954954955, "pct_stereotype_stderr": 0.047671194793956616},
    "crows_pairs_english_race_color": {"likelihood_difference": 9.858390748031496, "likelihood_difference_stderr": 0.5056938997647007, "pct_stereotype": 0.5019685039370079, "pct_stereotype_stderr": 0.02220560748841351},
    "crows_pairs_french_socioeconomic": {"likelihood_difference": 12.791533801020408, "likelihood_difference_stderr": 0.9369927660413013, "pct_stereotype": 0.35714285714285715, "pct_stereotype_stderr": 0.03431317581537576},
    "toxigen": {"acc": 0.42659574468085104, "acc_stderr": 0.01614008877637632, "acc_norm": 0.4319148936170213, "acc_norm_stderr": 0.016164899004911828},
    "ethics_justice": {"acc": 0.4992603550295858, "acc_stderr": 0.009617152578791647, "em": 0.0014792899408284023},
    "crows_pairs_english_sexual_orientation": {"likelihood_difference": 10.72244623655914, "likelihood_difference_stderr": 1.1561263889540778, "pct_stereotype": 0.5483870967741935, "pct_stereotype_stderr": 0.05188393075201662},
    "crows_pairs_french_nationality": {"likelihood_difference": 16.33102766798419, "likelihood_difference_stderr": 0.9224360930325354, "pct_stereotype": 0.31620553359683795, "pct_stereotype_stderr": 0.029291880485542005},
    "crows_pairs_english_socioeconomic": {"likelihood_difference": 11.222368421052632, "likelihood_difference_stderr": 0.7806572774635993, "pct_stereotype": 0.5052631578947369, "pct_stereotype_stderr": 0.036367633377878815},
    "crows_pairs_french_race_color": {"likelihood_difference": 11.927445652173914, "likelihood_difference_stderr": 0.5028450572837085, "pct_stereotype": 0.35, "pct_stereotype_stderr": 0.022263034418628928},
    "crows_pairs_english_nationality": {"likelihood_difference": 11.848668981481481, "likelihood_difference_stderr": 0.8342534014656857, "pct_stereotype": 0.38425925925925924, "pct_stereotype_stderr": 0.03317354514310742},
    "ethics_virtue": {"acc": 0.20321608040201006, "acc_stderr": 0.005705535674037668, "em": 0.0},
    "crows_pairs_english_physical_appearance": {"likelihood_difference": 7.529513888888889, "likelihood_difference_stderr": 0.8793312801173977, "pct_stereotype": 0.4722222222222222, "pct_stereotype_stderr": 0.05924743948371486},
    "ethics_utilitarianism_original": {"acc": 0.9806572379367721, "acc_stderr": 0.0019864644750587196},
    "crows_pairs_french_sexual_orientation": {"likelihood_difference": 17.554945054945055, "likelihood_difference_stderr": 1.1803100062671743, "pct_stereotype": 0.7802197802197802, "pct_stereotype_stderr": 0.043649726328985346},
    "crows_pairs_french_religion": {"likelihood_difference": 11.192391304347826, "likelihood_difference_stderr": 1.0866295680081195, "pct_stereotype": 0.591304347826087, "pct_stereotype_stderr": 0.04604188749503789},
    "crows_pairs_french_gender": {"likelihood_difference": 10.791471962616823, "likelihood_difference_stderr": 0.6767399211366819, "pct_stereotype": 0.514018691588785, "pct_stereotype_stderr": 0.027939861549302374},
    "crows_pairs_english_disability": {"likelihood_difference": 12.978846153846154, "likelihood_difference_stderr": 1.8287537323468364, "pct_stereotype": 0.35384615384615387, "pct_stereotype_stderr": 0.05977027026123098}
  },
  "versions": {"crows_pairs_english_autre": 0, "crows_pairs_french_age": 0, "crows_pairs_french_disability": 0, "ethics_utilitarianism": 0, "ethics_deontology": 0, "ethics_cm": 0, "crows_pairs_english_age": 0, "crows_pairs_french_autre": 0, "crows_pairs_english_gender": 0, "crows_pairs_french_physical_appearance": 0, "crows_pairs_english_religion": 0, "crows_pairs_english_race_color": 0, "crows_pairs_french_socioeconomic": 0, "toxigen": 0, "ethics_justice": 0, "crows_pairs_english_sexual_orientation": 0, "crows_pairs_french_nationality": 0, "crows_pairs_english_socioeconomic": 0, "crows_pairs_french_race_color": 0, "crows_pairs_english_nationality": 0, "ethics_virtue": 0, "crows_pairs_english_physical_appearance": 0, "ethics_utilitarianism_original": 0, "crows_pairs_french_sexual_orientation": 0, "crows_pairs_french_religion": 0, "crows_pairs_french_gender": 0, "crows_pairs_english_disability": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
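Since every file in this commit follows the same layout, a short script can tabulate all of them at once. A sketch under one assumption: the PREFERRED list is my own choice of primary-metric names drawn from the files above, and tasks exposing none of them (e.g. the crows_pairs likelihood differences) are simply skipped.

import json
from pathlib import Path
from statistics import mean

# First recognised metric per task; order reflects the metric names that
# actually appear in these result files.
PREFERRED = ("acc", "multiple_choice_grade", "f1", "em", "mc1", "mcc")

for path in sorted(Path("results").rglob("*.json")):
    data = json.loads(path.read_text())
    scores = []
    for metrics in data["results"].values():
        for key in PREFERRED:
            if key in metrics:
                scores.append(metrics[key])
                break
    if scores:
        print(f"{path}: mean {mean(scores):.4f} over {len(scores)} task(s)")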