gaoqiong / lm-evaluation-harness · Commits · e53eb332

Unverified commit e53eb332, authored May 19, 2023 by Stella Biderman, committed by GitHub on May 19, 2023

Merge pull request #477 from juletx/results

Add results of various models in json and md format

Parents: d1327193, 92a50856
Changes: 189 · Showing 20 changed files with 2410 additions and 0 deletions (+2410, -0)
results/llama/llama-13B/llama-13B_blimp_0-shot.json  +352 -0
results/llama/llama-13B/llama-13B_common_sense_reasoning_0-shot.json  +91 -0
results/llama/llama-13B/llama-13B_glue_0-shot.json  +66 -0
results/llama/llama-13B/llama-13B_gsm8k_8-shot.json  +22 -0
results/llama/llama-13B/llama-13B_human_alignment_0-shot.json  +197 -0
results/llama/llama-13B/llama-13B_lambada_0-shot.json  +80 -0
results/llama/llama-13B/llama-13B_mathematical_reasoning_0-shot.json  +76 -0
results/llama/llama-13B/llama-13B_mathematical_reasoning_few_shot_5-shot.json  +71 -0
results/llama/llama-13B/llama-13B_mmlu_5-shot.json  +416 -0
results/llama/llama-13B/llama-13B_pawsx_0-shot.json  +52 -0
results/llama/llama-13B/llama-13B_question_answering_0-shot.json  +66 -0
results/llama/llama-13B/llama-13B_reading_comprehension_0-shot.json  +36 -0
results/llama/llama-13B/llama-13B_superglue_0-shot.json  +55 -0
results/llama/llama-13B/llama-13B_xcopa_0-shot.json  +72 -0
results/llama/llama-13B/llama-13B_xnli_0-shot.json  +92 -0
results/llama/llama-13B/llama-13B_xstory_cloze_0-shot.json  +72 -0
results/llama/llama-13B/llama-13B_xwinograd_0-shot.json  +47 -0
results/llama/llama-30B/README.md  +332 -0
results/llama/llama-30B/llama-30B_bbh_3-shot.json  +124 -0
results/llama/llama-30B/llama-30B_common_sense_reasoning_0-shot.json  +91 -0
results/llama/llama-13B/llama-13B_blimp_0-shot.json (new file, mode 100644)

{
  "results": {
    "blimp_anaphor_gender_agreement": {"acc": 0.576, "acc_stderr": 0.015635487471405186},
    "blimp_wh_questions_subject_gap_long_distance": {"acc": 0.408, "acc_stderr": 0.01554920505292068},
    "blimp_expletive_it_object_raising": {"acc": 0.619, "acc_stderr": 0.015364734787007436},
    "blimp_npi_present_2": {"acc": 0.39, "acc_stderr": 0.015431725053866606},
    "blimp_sentential_negation_npi_licensor_present": {"acc": 0.392, "acc_stderr": 0.015445859463771302},
    "blimp_wh_vs_that_no_gap": {"acc": 0.196, "acc_stderr": 0.012559527926707371},
    "blimp_wh_vs_that_with_gap": {"acc": 0.847, "acc_stderr": 0.011389500459665546},
    "blimp_passive_2": {"acc": 0.526, "acc_stderr": 0.01579789775804277},
    "blimp_drop_argument": {"acc": 0.705, "acc_stderr": 0.014428554438445524},
    "blimp_irregular_plural_subject_verb_agreement_2": {"acc": 0.504, "acc_stderr": 0.015818793703510893},
    "blimp_adjunct_island": {"acc": 0.338, "acc_stderr": 0.014965960710224489},
    "blimp_transitive": {"acc": 0.473, "acc_stderr": 0.015796218551302615},
    "blimp_irregular_plural_subject_verb_agreement_1": {"acc": 0.518, "acc_stderr": 0.015809045699406728},
    "blimp_animate_subject_passive": {"acc": 0.651, "acc_stderr": 0.015080663991563098},
    "blimp_determiner_noun_agreement_1": {"acc": 0.341, "acc_stderr": 0.014998131348402706},
    "blimp_wh_island": {"acc": 0.506, "acc_stderr": 0.015818160898606715},
    "blimp_intransitive": {"acc": 0.643, "acc_stderr": 0.015158521721486769},
    "blimp_left_branch_island_simple_question": {"acc": 0.411, "acc_stderr": 0.015566673418599276},
    "blimp_irregular_past_participle_verbs": {"acc": 0.314, "acc_stderr": 0.01468399195108796},
    "blimp_principle_A_case_2": {"acc": 0.443, "acc_stderr": 0.0157161699532041},
    "blimp_principle_A_domain_3": {"acc": 0.563, "acc_stderr": 0.015693223928730377},
    "blimp_sentential_subject_island": {"acc": 0.621, "acc_stderr": 0.01534909100222535},
    "blimp_tough_vs_raising_1": {"acc": 0.361, "acc_stderr": 0.015195720118175127},
    "blimp_principle_A_c_command": {"acc": 0.326, "acc_stderr": 0.014830507204541042},
    "blimp_wh_vs_that_no_gap_long_distance": {"acc": 0.301, "acc_stderr": 0.014512395033543147},
    "blimp_irregular_past_participle_adjectives": {"acc": 0.636, "acc_stderr": 0.015222868840522019},
    "blimp_complex_NP_island": {"acc": 0.303, "acc_stderr": 0.014539683710535264},
    "blimp_only_npi_licensor_present": {"acc": 0.731, "acc_stderr": 0.014029819522568198},
    "blimp_wh_questions_subject_gap": {"acc": 0.369, "acc_stderr": 0.015266698139154617},
    "blimp_coordinate_structure_constraint_object_extraction": {"acc": 0.279, "acc_stderr": 0.014190150117612037},
    "blimp_determiner_noun_agreement_2": {"acc": 0.361, "acc_stderr": 0.015195720118175115},
    "blimp_ellipsis_n_bar_2": {"acc": 0.264, "acc_stderr": 0.01394627184944048},
    "blimp_only_npi_scope": {"acc": 0.278, "acc_stderr": 0.014174516461485247},
    "blimp_determiner_noun_agreement_with_adj_irregular_1": {"acc": 0.342, "acc_stderr": 0.015008706182121728},
    "blimp_existential_there_object_raising": {"acc": 0.69, "acc_stderr": 0.014632638658632902},
    "blimp_superlative_quantifiers_1": {"acc": 0.522, "acc_stderr": 0.015803979428161957},
    "blimp_distractor_agreement_relational_noun": {"acc": 0.514, "acc_stderr": 0.015813097547730987},
    "blimp_wh_vs_that_with_gap_long_distance": {"acc": 0.692, "acc_stderr": 0.014606483127342761},
    "blimp_determiner_noun_agreement_with_adj_2": {"acc": 0.392, "acc_stderr": 0.015445859463771295},
    "blimp_principle_A_domain_1": {"acc": 0.324, "acc_stderr": 0.01480686473373886},
    "blimp_distractor_agreement_relative_clause": {"acc": 0.423, "acc_stderr": 0.015630589090476345},
    "blimp_inchoative": {"acc": 0.474, "acc_stderr": 0.015797897758042766},
    "blimp_superlative_quantifiers_2": {"acc": 0.714, "acc_stderr": 0.01429714686251791},
    "blimp_tough_vs_raising_2": {"acc": 0.642, "acc_stderr": 0.015167928865407557},
    "blimp_principle_A_domain_2": {"acc": 0.74, "acc_stderr": 0.013877773329774166},
    "blimp_determiner_noun_agreement_irregular_2": {"acc": 0.369, "acc_stderr": 0.015266698139154614},
    "blimp_animate_subject_trans": {"acc": 0.616, "acc_stderr": 0.015387682761897071},
    "blimp_ellipsis_n_bar_1": {"acc": 0.624, "acc_stderr": 0.015325105508898134},
    "blimp_existential_there_quantifiers_1": {"acc": 0.308, "acc_stderr": 0.014606483127342763},
    "blimp_regular_plural_subject_verb_agreement_1": {"acc": 0.56, "acc_stderr": 0.01570498795436179},
    "blimp_wh_questions_object_gap": {"acc": 0.455, "acc_stderr": 0.01575510149834709},
    "blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.393, "acc_stderr": 0.015452824654081496},
    "blimp_sentential_negation_npi_scope": {"acc": 0.638, "acc_stderr": 0.015204840912919498},
    "blimp_principle_A_case_1": {"acc": 0.028, "acc_stderr": 0.005219506034410047},
    "blimp_existential_there_subject_raising": {"acc": 0.701, "acc_stderr": 0.014484778521220482},
    "blimp_causative": {"acc": 0.359, "acc_stderr": 0.015177264224798597},
    "blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.391, "acc_stderr": 0.015438826294681783},
    "blimp_coordinate_structure_constraint_complex_left_branch": {"acc": 0.345, "acc_stderr": 0.015039986742055238},
    "blimp_passive_1": {"acc": 0.529, "acc_stderr": 0.015792669451628896},
    "blimp_npi_present_1": {"acc": 0.304, "acc_stderr": 0.014553205687950424},
    "blimp_left_branch_island_echo_question": {"acc": 0.49, "acc_stderr": 0.015816135752773207},
    "blimp_existential_there_quantifiers_2": {"acc": 0.788, "acc_stderr": 0.012931481864938041},
    "blimp_regular_plural_subject_verb_agreement_2": {"acc": 0.456, "acc_stderr": 0.01575792855397917},
    "blimp_principle_A_reconstruction": {"acc": 0.792, "acc_stderr": 0.012841374572096921},
    "blimp_determiner_noun_agreement_irregular_1": {"acc": 0.356, "acc_stderr": 0.015149042659306628},
    "blimp_matrix_question_npi_licensor_present": {"acc": 0.548, "acc_stderr": 0.01574623586588068},
    "blimp_anaphor_number_agreement": {"acc": 0.565, "acc_stderr": 0.0156850572527172}
  },
  "versions": {
    "blimp_anaphor_gender_agreement": 0, "blimp_wh_questions_subject_gap_long_distance": 0,
    "blimp_expletive_it_object_raising": 0, "blimp_npi_present_2": 0,
    "blimp_sentential_negation_npi_licensor_present": 0, "blimp_wh_vs_that_no_gap": 0,
    "blimp_wh_vs_that_with_gap": 0, "blimp_passive_2": 0, "blimp_drop_argument": 0,
    "blimp_irregular_plural_subject_verb_agreement_2": 0, "blimp_adjunct_island": 0,
    "blimp_transitive": 0, "blimp_irregular_plural_subject_verb_agreement_1": 0,
    "blimp_animate_subject_passive": 0, "blimp_determiner_noun_agreement_1": 0,
    "blimp_wh_island": 0, "blimp_intransitive": 0, "blimp_left_branch_island_simple_question": 0,
    "blimp_irregular_past_participle_verbs": 0, "blimp_principle_A_case_2": 0,
    "blimp_principle_A_domain_3": 0, "blimp_sentential_subject_island": 0,
    "blimp_tough_vs_raising_1": 0, "blimp_principle_A_c_command": 0,
    "blimp_wh_vs_that_no_gap_long_distance": 0, "blimp_irregular_past_participle_adjectives": 0,
    "blimp_complex_NP_island": 0, "blimp_only_npi_licensor_present": 0,
    "blimp_wh_questions_subject_gap": 0, "blimp_coordinate_structure_constraint_object_extraction": 0,
    "blimp_determiner_noun_agreement_2": 0, "blimp_ellipsis_n_bar_2": 0, "blimp_only_npi_scope": 0,
    "blimp_determiner_noun_agreement_with_adj_irregular_1": 0, "blimp_existential_there_object_raising": 0,
    "blimp_superlative_quantifiers_1": 0, "blimp_distractor_agreement_relational_noun": 0,
    "blimp_wh_vs_that_with_gap_long_distance": 0, "blimp_determiner_noun_agreement_with_adj_2": 0,
    "blimp_principle_A_domain_1": 0, "blimp_distractor_agreement_relative_clause": 0,
    "blimp_inchoative": 0, "blimp_superlative_quantifiers_2": 0, "blimp_tough_vs_raising_2": 0,
    "blimp_principle_A_domain_2": 0, "blimp_determiner_noun_agreement_irregular_2": 0,
    "blimp_animate_subject_trans": 0, "blimp_ellipsis_n_bar_1": 0,
    "blimp_existential_there_quantifiers_1": 0, "blimp_regular_plural_subject_verb_agreement_1": 0,
    "blimp_wh_questions_object_gap": 0, "blimp_determiner_noun_agreement_with_adj_irregular_2": 0,
    "blimp_sentential_negation_npi_scope": 0, "blimp_principle_A_case_1": 0,
    "blimp_existential_there_subject_raising": 0, "blimp_causative": 0,
    "blimp_determiner_noun_agreement_with_adjective_1": 0,
    "blimp_coordinate_structure_constraint_complex_left_branch": 0, "blimp_passive_1": 0,
    "blimp_npi_present_1": 0, "blimp_left_branch_island_echo_question": 0,
    "blimp_existential_there_quantifiers_2": 0, "blimp_regular_plural_subject_verb_agreement_2": 0,
    "blimp_principle_A_reconstruction": 0, "blimp_determiner_noun_agreement_irregular_1": 0,
    "blimp_matrix_question_npi_licensor_present": 0, "blimp_anaphor_number_agreement": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
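
Each of these result files follows the same three-part layout: a "results" map from task name to metric values, a "versions" map recording the task version used, and a "config" block capturing the evaluation settings. A minimal sketch of reading one of these files and averaging the per-task accuracies; the chosen file path and the rounding are illustrative assumptions, not part of the commit:

```python
import json

# Illustrative path; any of the *_0-shot.json files added in this commit has the same layout.
path = "results/llama/llama-13B/llama-13B_blimp_0-shot.json"

with open(path) as f:
    data = json.load(f)

accs = []
for task, metrics in sorted(data["results"].items()):
    # Every BLiMP subtask reports "acc" and "acc_stderr"; other suites may use em/f1/ppl instead.
    if "acc" in metrics:
        accs.append(metrics["acc"])
        print(f"{task:60s} acc={metrics['acc']:.3f} ± {metrics['acc_stderr']:.3f} (v{data['versions'][task]})")

print(f"\nmacro-average acc over {len(accs)} subtasks: {sum(accs) / len(accs):.4f}")
print("evaluated with:", data["config"]["model"], "|", data["config"]["model_args"])
```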
results/llama/llama-13B/llama-13B_common_sense_reasoning_0-shot.json (new file, mode 100644)

{
  "results": {
    "arc_challenge": {"acc": 0.439419795221843, "acc_stderr": 0.014503747823580122, "acc_norm": 0.4462457337883959, "acc_norm_stderr": 0.014526705548539982},
    "prost": {"acc": 0.2688941076003416, "acc_stderr": 0.0032393206239968247, "acc_norm": 0.3052412467976089, "acc_norm_stderr": 0.003364432149066356},
    "swag": {"acc": 0.5673298010596821, "acc_stderr": 0.003502894135944166, "acc_norm": 0.6934919524142757, "acc_norm_stderr": 0.0032596605453371346},
    "arc_easy": {"acc": 0.7457912457912458, "acc_stderr": 0.008934537681141528, "acc_norm": 0.5989057239057239, "acc_norm_stderr": 0.010057051106534378},
    "boolq": {"acc": 0.6850152905198776, "acc_stderr": 0.00812432724981665},
    "wsc273": {"acc": 0.8608058608058609, "acc_stderr": 0.020988366070851},
    "mc_taco": {"em": 0.10960960960960961, "f1": 0.4753174430074593},
    "piqa": {"acc": 0.7883569096844396, "acc_stderr": 0.009530351270479397, "acc_norm": 0.7910772578890098, "acc_norm_stderr": 0.009485227030105093},
    "hellaswag": {"acc": 0.5910177255526787, "acc_stderr": 0.004906411984476791, "acc_norm": 0.7623979286994622, "acc_norm_stderr": 0.004247442237702478},
    "winogrande": {"acc": 0.7016574585635359, "acc_stderr": 0.012858885010030434},
    "copa": {"acc": 0.9, "acc_stderr": 0.030151134457776348},
    "openbookqa": {"acc": 0.306, "acc_stderr": 0.020629569998345403, "acc_norm": 0.422, "acc_norm_stderr": 0.022109039310618552}
  },
  "versions": {
    "arc_challenge": 0, "prost": 0, "swag": 0, "arc_easy": 0, "boolq": 1, "wsc273": 0,
    "mc_taco": 0, "piqa": 0, "hellaswag": 0, "winogrande": 0, "copa": 0, "openbookqa": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_glue_0-shot.json (new file, mode 100644)

{
  "results": {
    "mnli_mismatched": {"acc": 0.45351912123677784, "acc_stderr": 0.005020956265665617},
    "wnli": {"acc": 0.4647887323943662, "acc_stderr": 0.0596130578497224},
    "sst": {"acc": 0.6536697247706422, "acc_stderr": 0.01612186710508361},
    "cola": {"mcc": 0.0, "mcc_stderr": 0.0},
    "mnli": {"acc": 0.43555781966377993, "acc_stderr": 0.005005063722742048},
    "qnli": {"acc": 0.4995423759838916, "acc_stderr": 0.006765407718154766},
    "mrpc": {"acc": 0.6862745098039216, "acc_stderr": 0.022999936277943434, "f1": 0.8134110787172011, "f1_stderr": 0.01621238238910757},
    "rte": {"acc": 0.6534296028880866, "acc_stderr": 0.02864445699455754},
    "qqp": {"acc": 0.3679198614889933, "acc_stderr": 0.0023983700314094665, "f1": 0.5365853658536586, "f1_stderr": 0.0025607085094365924}
  },
  "versions": {
    "mnli_mismatched": 0, "wnli": 1, "sst": 0, "cola": 0, "mnli": 0, "qnli": 0, "mrpc": 0, "rte": 0, "qqp": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_gsm8k_8-shot.json (new file, mode 100644)

{
  "results": {
    "gsm8k": {"acc": 0.13570887035633056, "acc_stderr": 0.009433577908567345}
  },
  "versions": {
    "gsm8k": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 8, "batch_size": "auto", "device": "cuda",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
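
The "config" block in each file records the settings of the run, so a result can in principle be reproduced by feeding the same values back to the harness. A hedged sketch of such an invocation for the 8-shot GSM8K run above; the flag names follow the pre-refactor lm-evaluation-harness CLI and, together with the --tasks and --output_path choices, are assumptions to be checked against the harness revision actually used:

```python
import subprocess

# Reconstructed from the "config" block of llama-13B_gsm8k_8-shot.json.
# Flag names are assumed from the old main.py CLI; verify against your checkout.
cmd = [
    "python", "main.py",
    "--model", "hf-causal-experimental",
    "--model_args", "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "--tasks", "gsm8k",
    "--num_fewshot", "8",
    "--batch_size", "auto",
    "--device", "cuda",
    "--no_cache",
    "--output_path", "results/llama/llama-13B/llama-13B_gsm8k_8-shot.json",
]
subprocess.run(cmd, check=True)
```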
results/llama/llama-13B/llama-13B_human_alignment_0-shot.json (new file, mode 100644)

{
  "results": {
    "crows_pairs_english_autre": {"likelihood_difference": 11.426136363636363, "likelihood_difference_stderr": 4.353329595154678, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.15212000482437738},
    "crows_pairs_french_age": {"likelihood_difference": 13.10138888888889, "likelihood_difference_stderr": 1.1200506582638412, "pct_stereotype": 0.3888888888888889, "pct_stereotype_stderr": 0.05167468693203863},
    "crows_pairs_french_disability": {"likelihood_difference": 17.323863636363637, "likelihood_difference_stderr": 1.824043354324447, "pct_stereotype": 0.4090909090909091, "pct_stereotype_stderr": 0.060983672113630656},
    "ethics_utilitarianism": {"acc": 0.5245424292845258, "acc_stderr": 0.007202929002919329},
    "ethics_deontology": {"acc": 0.503337041156841, "acc_stderr": 0.008338940677034744, "em": 0.0011123470522803114},
    "ethics_cm": {"acc": 0.5173745173745173, "acc_stderr": 0.008018036537975452},
    "crows_pairs_english_age": {"likelihood_difference": 7.710164835164835, "likelihood_difference_stderr": 0.936561657229967, "pct_stereotype": 0.5604395604395604, "pct_stereotype_stderr": 0.0523181569856619},
    "crows_pairs_french_autre": {"likelihood_difference": 9.942307692307692, "likelihood_difference_stderr": 3.1484255128649896, "pct_stereotype": 0.5384615384615384, "pct_stereotype_stderr": 0.14390989949130545},
    "crows_pairs_english_gender": {"likelihood_difference": 8.67578125, "likelihood_difference_stderr": 0.6549450667276699, "pct_stereotype": 0.584375, "pct_stereotype_stderr": 0.02759315140230172},
    "crows_pairs_french_physical_appearance": {"likelihood_difference": 12.57986111111111, "likelihood_difference_stderr": 1.5739147906459943, "pct_stereotype": 0.5277777777777778, "pct_stereotype_stderr": 0.05924743948371486},
    "crows_pairs_english_religion": {"likelihood_difference": 11.8125, "likelihood_difference_stderr": 1.175150775782876, "pct_stereotype": 0.4954954954954955, "pct_stereotype_stderr": 0.047671194793956616},
    "crows_pairs_english_race_color": {"likelihood_difference": 9.858390748031496, "likelihood_difference_stderr": 0.5056938997647007, "pct_stereotype": 0.5019685039370079, "pct_stereotype_stderr": 0.02220560748841351},
    "crows_pairs_french_socioeconomic": {"likelihood_difference": 12.791533801020408, "likelihood_difference_stderr": 0.9369927660413013, "pct_stereotype": 0.35714285714285715, "pct_stereotype_stderr": 0.03431317581537576},
    "toxigen": {"acc": 0.42659574468085104, "acc_stderr": 0.01614008877637632, "acc_norm": 0.4319148936170213, "acc_norm_stderr": 0.016164899004911828},
    "ethics_justice": {"acc": 0.4992603550295858, "acc_stderr": 0.009617152578791647, "em": 0.0014792899408284023},
    "crows_pairs_english_sexual_orientation": {"likelihood_difference": 10.72244623655914, "likelihood_difference_stderr": 1.1561263889540778, "pct_stereotype": 0.5483870967741935, "pct_stereotype_stderr": 0.05188393075201662},
    "crows_pairs_french_nationality": {"likelihood_difference": 16.33102766798419, "likelihood_difference_stderr": 0.9224360930325354, "pct_stereotype": 0.31620553359683795, "pct_stereotype_stderr": 0.029291880485542005},
    "crows_pairs_english_socioeconomic": {"likelihood_difference": 11.222368421052632, "likelihood_difference_stderr": 0.7806572774635993, "pct_stereotype": 0.5052631578947369, "pct_stereotype_stderr": 0.036367633377878815},
    "crows_pairs_french_race_color": {"likelihood_difference": 11.927445652173914, "likelihood_difference_stderr": 0.5028450572837085, "pct_stereotype": 0.35, "pct_stereotype_stderr": 0.022263034418628928},
    "crows_pairs_english_nationality": {"likelihood_difference": 11.848668981481481, "likelihood_difference_stderr": 0.8342534014656857, "pct_stereotype": 0.38425925925925924, "pct_stereotype_stderr": 0.03317354514310742},
    "ethics_virtue": {"acc": 0.20321608040201006, "acc_stderr": 0.005705535674037668, "em": 0.0},
    "crows_pairs_english_physical_appearance": {"likelihood_difference": 7.529513888888889, "likelihood_difference_stderr": 0.8793312801173977, "pct_stereotype": 0.4722222222222222, "pct_stereotype_stderr": 0.05924743948371486},
    "ethics_utilitarianism_original": {"acc": 0.9806572379367721, "acc_stderr": 0.0019864644750587196},
    "crows_pairs_french_sexual_orientation": {"likelihood_difference": 17.554945054945055, "likelihood_difference_stderr": 1.1803100062671743, "pct_stereotype": 0.7802197802197802, "pct_stereotype_stderr": 0.043649726328985346},
    "crows_pairs_french_religion": {"likelihood_difference": 11.192391304347826, "likelihood_difference_stderr": 1.0866295680081195, "pct_stereotype": 0.591304347826087, "pct_stereotype_stderr": 0.04604188749503789},
    "crows_pairs_french_gender": {"likelihood_difference": 10.791471962616823, "likelihood_difference_stderr": 0.6767399211366819, "pct_stereotype": 0.514018691588785, "pct_stereotype_stderr": 0.027939861549302374},
    "crows_pairs_english_disability": {"likelihood_difference": 12.978846153846154, "likelihood_difference_stderr": 1.8287537323468364, "pct_stereotype": 0.35384615384615387, "pct_stereotype_stderr": 0.05977027026123098}
  },
  "versions": {
    "crows_pairs_english_autre": 0, "crows_pairs_french_age": 0, "crows_pairs_french_disability": 0,
    "ethics_utilitarianism": 0, "ethics_deontology": 0, "ethics_cm": 0,
    "crows_pairs_english_age": 0, "crows_pairs_french_autre": 0, "crows_pairs_english_gender": 0,
    "crows_pairs_french_physical_appearance": 0, "crows_pairs_english_religion": 0,
    "crows_pairs_english_race_color": 0, "crows_pairs_french_socioeconomic": 0, "toxigen": 0,
    "ethics_justice": 0, "crows_pairs_english_sexual_orientation": 0, "crows_pairs_french_nationality": 0,
    "crows_pairs_english_socioeconomic": 0, "crows_pairs_french_race_color": 0,
    "crows_pairs_english_nationality": 0, "ethics_virtue": 0, "crows_pairs_english_physical_appearance": 0,
    "ethics_utilitarianism_original": 0, "crows_pairs_french_sexual_orientation": 0,
    "crows_pairs_french_religion": 0, "crows_pairs_french_gender": 0, "crows_pairs_english_disability": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_lambada_0-shot.json (new file, mode 100644)

{
  "results": {
    "lambada_openai": {"ppl": 1279051.053451683, "ppl_stderr": 60995.62964377304, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_mt_de": {"ppl": 1310285.4433720284, "ppl_stderr": 71395.90633942866, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_mt_it": {"ppl": 4091504.352954044, "ppl_stderr": 218020.965277226, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_standard": {"ppl": 1409047.9981006894, "ppl_stderr": 47832.883755899915, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_standard_cloze": {"ppl": 4235345.031433833, "ppl_stderr": 132892.5654001927, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_mt_fr": {"ppl": 2461448.491005768, "ppl_stderr": 128013.98724687536, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_cloze": {"ppl": 204515.38946166556, "ppl_stderr": 9705.341358126625, "acc": 0.00019406171162429653, "acc_stderr": 0.00019406171162430135},
    "lambada_openai_mt_en": {"ppl": 1279051.053451683, "ppl_stderr": 60995.62964377304, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_mt_es": {"ppl": 1980241.7718905837, "ppl_stderr": 101614.2034914904, "acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {
    "lambada_openai": 0, "lambada_openai_mt_de": 0, "lambada_openai_mt_it": 0,
    "lambada_standard": 0, "lambada_standard_cloze": 0, "lambada_openai_mt_fr": 0,
    "lambada_openai_cloze": 0, "lambada_openai_mt_en": 0, "lambada_openai_mt_es": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_mathematical_reasoning_0-shot.json (new file, mode 100644)

{
  "results": {
    "math_prealgebra": {"acc": 0.02870264064293915, "acc_stderr": 0.0056607946989983855},
    "math_num_theory": {"acc": 0.014814814814814815, "acc_stderr": 0.005203704987512651},
    "drop": {"em": 0.0388003355704698, "em_stderr": 0.0019777172311177993, "f1": 0.13990771812080444, "f1_stderr": 0.002512880034517493},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_intermediate_algebra": {"acc": 0.012181616832779624, "acc_stderr": 0.0036524791938863576},
    "math_algebra": {"acc": 0.018534119629317607, "acc_stderr": 0.003916347676363957},
    "math_counting_and_prob": {"acc": 0.014767932489451477, "acc_stderr": 0.0055462385896684775},
    "math_geometry": {"acc": 0.012526096033402923, "acc_stderr": 0.005086941389677977},
    "math_precalc": {"acc": 0.01098901098901099, "acc_stderr": 0.004465618427331416},
    "mathqa": {"acc": 0.28442211055276384, "acc_stderr": 0.008258681628795297, "acc_norm": 0.28676716917922945, "acc_norm_stderr": 0.00827905882129993},
    "math_asdiv": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {
    "math_prealgebra": 1, "math_num_theory": 1, "drop": 1, "mathqa": 0, "gsm8k": 0,
    "math_intermediate_algebra": 1, "math_algebra": 1, "math_counting_and_prob": 1,
    "math_geometry": 1, "math_precalc": 1, "math_asdiv": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_mathematical_reasoning_few_shot_5-shot.json (new file, mode 100644)

{
  "results": {
    "math_prealgebra": {"acc": 0.001148105625717566, "acc_stderr": 0.0011481056257175704},
    "drop": {"em": 0.01709312080536913, "em_stderr": 0.001327414384722433, "f1": 0.024450503355704672, "f1_stderr": 0.001413124400630544},
    "math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_counting_and_prob": {"acc": 0.002109704641350211, "acc_stderr": 0.0021097046413502104},
    "math_num_theory": {"acc": 0.001851851851851852, "acc_stderr": 0.0018518518518518502},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_geometry": {"acc": 0.0, "acc_stderr": 0.0},
    "math_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_precalc": {"acc": 0.0, "acc_stderr": 0.0},
    "mathqa": {"acc": 0.2998324958123953, "acc_stderr": 0.008387661895516162, "acc_norm": 0.3035175879396985, "acc_norm_stderr": 0.008416811454701563}
  },
  "versions": {
    "math_prealgebra": 1, "drop": 1, "mathqa": 0, "math_intermediate_algebra": 1,
    "math_counting_and_prob": 1, "math_num_theory": 1, "gsm8k": 0,
    "math_geometry": 1, "math_algebra": 1, "math_precalc": 1
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 5, "batch_size": "auto", "device": "cuda:0",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_mmlu_5-shot.json (new file, mode 100644)

{
  "results": {
    "hendrycksTest-college_biology": {"acc": 0.4583333333333333, "acc_stderr": 0.04166666666666665, "acc_norm": 0.3263888888888889, "acc_norm_stderr": 0.03921067198982266},
    "hendrycksTest-clinical_knowledge": {"acc": 0.46037735849056605, "acc_stderr": 0.030676096599389188, "acc_norm": 0.3849056603773585, "acc_norm_stderr": 0.029946498567699948},
    "hendrycksTest-high_school_european_history": {"acc": 0.5272727272727272, "acc_stderr": 0.03898531605579418, "acc_norm": 0.49696969696969695, "acc_norm_stderr": 0.03904272341431855},
    "hendrycksTest-high_school_psychology": {"acc": 0.6073394495412844, "acc_stderr": 0.02093750516120109, "acc_norm": 0.3688073394495413, "acc_norm_stderr": 0.020686227560729537},
    "hendrycksTest-business_ethics": {"acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589},
    "hendrycksTest-high_school_government_and_politics": {"acc": 0.5854922279792746, "acc_stderr": 0.035553003195576686, "acc_norm": 0.38860103626943004, "acc_norm_stderr": 0.03517739796373132},
    "hendrycksTest-security_studies": {"acc": 0.45714285714285713, "acc_stderr": 0.03189141832421396, "acc_norm": 0.37551020408163266, "acc_norm_stderr": 0.03100120903989484},
    "hendrycksTest-high_school_macroeconomics": {"acc": 0.3769230769230769, "acc_stderr": 0.024570975364225995, "acc_norm": 0.31794871794871793, "acc_norm_stderr": 0.02361088430892786},
    "hendrycksTest-sociology": {"acc": 0.582089552238806, "acc_stderr": 0.034875586404620636, "acc_norm": 0.4577114427860697, "acc_norm_stderr": 0.035228658640995975},
    "hendrycksTest-college_mathematics": {"acc": 0.29, "acc_stderr": 0.04560480215720683, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235},
    "hendrycksTest-professional_accounting": {"acc": 0.2978723404255319, "acc_stderr": 0.02728160834446941, "acc_norm": 0.2801418439716312, "acc_norm_stderr": 0.02678917235114023},
    "hendrycksTest-anatomy": {"acc": 0.42962962962962964, "acc_stderr": 0.04276349494376599, "acc_norm": 0.2962962962962963, "acc_norm_stderr": 0.03944624162501116},
    "hendrycksTest-professional_psychology": {"acc": 0.42320261437908496, "acc_stderr": 0.019987809769482067, "acc_norm": 0.3300653594771242, "acc_norm_stderr": 0.01902372616072456},
    "hendrycksTest-moral_scenarios": {"acc": 0.28268156424581004, "acc_stderr": 0.015060381730018082, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588},
    "hendrycksTest-conceptual_physics": {"acc": 0.42127659574468085, "acc_stderr": 0.03227834510146268, "acc_norm": 0.2425531914893617, "acc_norm_stderr": 0.028020226271200217},
    "hendrycksTest-virology": {"acc": 0.40963855421686746, "acc_stderr": 0.03828401115079021, "acc_norm": 0.30120481927710846, "acc_norm_stderr": 0.035716092300534796},
    "hendrycksTest-world_religions": {"acc": 0.7426900584795322, "acc_stderr": 0.03352799844161865, "acc_norm": 0.6491228070175439, "acc_norm_stderr": 0.03660298834049162},
    "hendrycksTest-high_school_computer_science": {"acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025},
    "hendrycksTest-abstract_algebra": {"acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814},
    "hendrycksTest-medical_genetics": {"acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795},
    "hendrycksTest-nutrition": {"acc": 0.45098039215686275, "acc_stderr": 0.02849199358617156, "acc_norm": 0.4673202614379085, "acc_norm_stderr": 0.02856869975222588},
    "hendrycksTest-elementary_mathematics": {"acc": 0.36772486772486773, "acc_stderr": 0.024833839825562424, "acc_norm": 0.328042328042328, "acc_norm_stderr": 0.024180497164376907},
    "hendrycksTest-philosophy": {"acc": 0.45980707395498394, "acc_stderr": 0.028306190403305696, "acc_norm": 0.3858520900321543, "acc_norm_stderr": 0.02764814959975146},
    "hendrycksTest-high_school_microeconomics": {"acc": 0.42016806722689076, "acc_stderr": 0.03206183783236152, "acc_norm": 0.40756302521008403, "acc_norm_stderr": 0.031918633744784645},
    "hendrycksTest-management": {"acc": 0.6407766990291263, "acc_stderr": 0.04750458399041696, "acc_norm": 0.4174757281553398, "acc_norm_stderr": 0.048828405482122375},
    "hendrycksTest-us_foreign_policy": {"acc": 0.68, "acc_stderr": 0.046882617226215034, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795},
    "hendrycksTest-international_law": {"acc": 0.5619834710743802, "acc_stderr": 0.04529146804435792, "acc_norm": 0.6033057851239669, "acc_norm_stderr": 0.044658697805310094},
    "hendrycksTest-college_chemistry": {"acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814},
    "hendrycksTest-high_school_mathematics": {"acc": 0.26666666666666666, "acc_stderr": 0.026962424325073817, "acc_norm": 0.31851851851851853, "acc_norm_stderr": 0.028406533090608463},
    "hendrycksTest-high_school_world_history": {"acc": 0.4978902953586498, "acc_stderr": 0.032546938018020076, "acc_norm": 0.42616033755274263, "acc_norm_stderr": 0.03219035703131774},
    "hendrycksTest-human_sexuality": {"acc": 0.549618320610687, "acc_stderr": 0.04363643698524779, "acc_norm": 0.3969465648854962, "acc_norm_stderr": 0.04291135671009224},
    "hendrycksTest-college_computer_science": {"acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276},
    "hendrycksTest-college_medicine": {"acc": 0.4277456647398844, "acc_stderr": 0.037724468575180255, "acc_norm": 0.30057803468208094, "acc_norm_stderr": 0.0349610148119118},
    "hendrycksTest-formal_logic": {"acc": 0.3253968253968254, "acc_stderr": 0.041905964388711366, "acc_norm": 0.3412698412698413, "acc_norm_stderr": 0.04240799327574925},
    "hendrycksTest-high_school_physics": {"acc": 0.271523178807947, "acc_stderr": 0.03631329803969653, "acc_norm": 0.25165562913907286, "acc_norm_stderr": 0.035433042343899844},
    "hendrycksTest-marketing": {"acc": 0.7264957264957265, "acc_stderr": 0.029202540153431173, "acc_norm": 0.6153846153846154, "acc_norm_stderr": 0.03187195347942466},
    "hendrycksTest-jurisprudence": {"acc": 0.48148148148148145, "acc_stderr": 0.04830366024635331, "acc_norm": 0.5, "acc_norm_stderr": 0.04833682445228318},
    "hendrycksTest-computer_security": {"acc": 0.57, "acc_stderr": 0.049756985195624284, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589},
    "hendrycksTest-high_school_chemistry": {"acc": 0.3103448275862069, "acc_stderr": 0.03255086769970103, "acc_norm": 0.32019704433497537, "acc_norm_stderr": 0.032826493853041504},
    "hendrycksTest-prehistory": {"acc": 0.49691358024691357, "acc_stderr": 0.02782021415859437, "acc_norm": 0.345679012345679, "acc_norm_stderr": 0.026462487777001876},
    "hendrycksTest-machine_learning": {"acc": 0.2857142857142857, "acc_stderr": 0.04287858751340455, "acc_norm": 0.29464285714285715, "acc_norm_stderr": 0.043270409325787296},
    "hendrycksTest-professional_medicine": {"acc": 0.39338235294117646, "acc_stderr": 0.02967428828131118, "acc_norm": 0.33088235294117646, "acc_norm_stderr": 0.028582709753898452},
    "hendrycksTest-global_facts": {"acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684},
    "hendrycksTest-high_school_us_history": {"acc": 0.5245098039215687, "acc_stderr": 0.03505093194348798, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.033933885849584046},
    "hendrycksTest-high_school_geography": {"acc": 0.5757575757575758, "acc_stderr": 0.03521224908841586, "acc_norm": 0.42424242424242425, "acc_norm_stderr": 0.03521224908841583},
    "hendrycksTest-human_aging": {"acc": 0.5739910313901345, "acc_stderr": 0.033188332862172806, "acc_norm": 0.336322869955157, "acc_norm_stderr": 0.03170882426845501},
    "hendrycksTest-high_school_biology": {"acc": 0.4967741935483871, "acc_stderr": 0.028443414226438316, "acc_norm": 0.36129032258064514, "acc_norm_stderr": 0.027327548447957553},
    "hendrycksTest-public_relations": {"acc": 0.5454545454545454, "acc_stderr": 0.04769300568972744, "acc_norm": 0.2909090909090909, "acc_norm_stderr": 0.04350271442923243},
    "hendrycksTest-professional_law": {"acc": 0.30378096479791394, "acc_stderr": 0.011745787720472483, "acc_norm": 0.3089960886571056, "acc_norm_stderr": 0.011801729777239246},
    "hendrycksTest-electrical_engineering": {"acc": 0.41379310344827586, "acc_stderr": 0.041042692118062316, "acc_norm": 0.3448275862068966, "acc_norm_stderr": 0.039609335494512087},
    "hendrycksTest-logical_fallacies": {"acc": 0.4539877300613497, "acc_stderr": 0.0391170190467718, "acc_norm": 0.36809815950920244, "acc_norm_stderr": 0.03789213935838396},
    "hendrycksTest-moral_disputes": {"acc": 0.4479768786127168, "acc_stderr": 0.026772990653361816, "acc_norm": 0.3815028901734104, "acc_norm_stderr": 0.0261521986197268},
    "hendrycksTest-high_school_statistics": {"acc": 0.38425925925925924, "acc_stderr": 0.03317354514310742, "acc_norm": 0.375, "acc_norm_stderr": 0.033016908987210894},
    "hendrycksTest-college_physics": {"acc": 0.28431372549019607, "acc_stderr": 0.04488482852329017, "acc_norm": 0.35294117647058826, "acc_norm_stderr": 0.04755129616062947},
    "hendrycksTest-econometrics": {"acc": 0.2719298245614035, "acc_stderr": 0.04185774424022056, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.041424397194893624},
    "hendrycksTest-miscellaneous": {"acc": 0.6960408684546615, "acc_stderr": 0.016448321686769043, "acc_norm": 0.48531289910600256, "acc_norm_stderr": 0.01787224802442912},
    "hendrycksTest-astronomy": {"acc": 0.48026315789473684, "acc_stderr": 0.04065771002562603, "acc_norm": 0.48026315789473684, "acc_norm_stderr": 0.040657710025626036}
  },
  "versions": {
    "hendrycksTest-college_biology": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-high_school_european_history": 0,
    "hendrycksTest-high_school_psychology": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-high_school_government_and_politics": 0,
    "hendrycksTest-security_studies": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-sociology": 0,
    "hendrycksTest-college_mathematics": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-anatomy": 0,
    "hendrycksTest-professional_psychology": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-conceptual_physics": 0,
    "hendrycksTest-virology": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-high_school_computer_science": 0,
    "hendrycksTest-abstract_algebra": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-nutrition": 0,
    "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-high_school_microeconomics": 0,
    "hendrycksTest-management": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-international_law": 0,
    "hendrycksTest-college_chemistry": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-high_school_world_history": 0,
    "hendrycksTest-human_sexuality": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-college_medicine": 0,
    "hendrycksTest-formal_logic": 0, "hendrycksTest-high_school_physics": 0, "hendrycksTest-marketing": 0,
    "hendrycksTest-jurisprudence": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-high_school_chemistry": 0,
    "hendrycksTest-prehistory": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-professional_medicine": 0,
    "hendrycksTest-global_facts": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_geography": 0,
    "hendrycksTest-human_aging": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-public_relations": 0,
    "hendrycksTest-professional_law": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-logical_fallacies": 0,
    "hendrycksTest-moral_disputes": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-college_physics": 0,
    "hendrycksTest-econometrics": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-astronomy": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 5, "batch_size": "auto", "device": "cuda:0",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_pawsx_0-shot.json (new file, mode 100644)

{
  "results": {
    "pawsx_fr": {"acc": 0.545, "acc_stderr": 0.011137752231145222},
    "pawsx_en": {"acc": 0.537, "acc_stderr": 0.011152474561478174},
    "pawsx_ko": {"acc": 0.4705, "acc_stderr": 0.011163654804511664},
    "pawsx_ja": {"acc": 0.45, "acc_stderr": 0.011127079848413735},
    "pawsx_es": {"acc": 0.521, "acc_stderr": 0.011173268141438304},
    "pawsx_de": {"acc": 0.5295, "acc_stderr": 0.011163654804511655},
    "pawsx_zh": {"acc": 0.452, "acc_stderr": 0.01113148485052578}
  },
  "versions": {
    "pawsx_fr": 0, "pawsx_en": 0, "pawsx_ko": 0, "pawsx_ja": 0, "pawsx_es": 0, "pawsx_de": 0, "pawsx_zh": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_question_answering_0-shot.json (new file, mode 100644)

{
  "results": {
    "triviaqa": {"acc": 0.0, "acc_stderr": 0.0},
    "headqa_es": {"acc": 0.3056163384390955, "acc_stderr": 0.008799003959214539, "acc_norm": 0.3515681983953319, "acc_norm_stderr": 0.009119739372039878},
    "logiqa": {"acc": 0.2642089093701997, "acc_stderr": 0.017293954549744514, "acc_norm": 0.3210445468509985, "acc_norm_stderr": 0.018312456701476108},
    "headqa_en": {"acc": 0.34427425237053244, "acc_stderr": 0.009075255747504299, "acc_norm": 0.38584974471188915, "acc_norm_stderr": 0.009298050684004381},
    "truthfulqa_mc": {"mc1": 0.2582619339045288, "mc1_stderr": 0.0153218216884762, "mc2": 0.39884734031519786, "mc2_stderr": 0.013703865869126058},
    "squad2": {"exact": 16.440663690726858, "f1": 24.060945088960178, "HasAns_exact": 21.086369770580298, "HasAns_f1": 36.34878560074651,
               "NoAns_exact": 11.808242220353238, "NoAns_f1": 11.808242220353238, "best_exact": 50.07159100480081, "best_f1": 50.073888042388},
    "webqs": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {
    "triviaqa": 1, "headqa_es": 0, "logiqa": 0, "headqa_en": 0, "truthfulqa_mc": 1, "squad2": 1, "webqs": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_reading_comprehension_0-shot.json (new file, mode 100644)

{
  "results": {
    "coqa": {"f1": 0.7704068983762044, "f1_stderr": 0.014191975492335083, "em": 0.637, "em_stderr": 0.01847461201879917},
    "drop": {"em": 0.035864093959731544, "em_stderr": 0.0019043146639119552, "f1": 0.13376153523489834, "f1_stderr": 0.002439665460318613},
    "race": {"acc": 0.39330143540669854, "acc_stderr": 0.01511816218614914}
  },
  "versions": {
    "coqa": 1, "drop": 1, "race": 1
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_superglue_0-shot.json (new file, mode 100644)

{
  "results": {
    "boolq": {"acc": 0.6844036697247706, "acc_stderr": 0.008128579858785895},
    "wic": {"acc": 0.49843260188087773, "acc_stderr": 0.019810623954060382},
    "copa": {"acc": 0.9, "acc_stderr": 0.030151134457776348},
    "wsc": {"acc": 0.3557692307692308, "acc_stderr": 0.04717221961050337},
    "cb": {"acc": 0.48214285714285715, "acc_stderr": 0.0673769750864465, "f1": 0.3881876266167991},
    "record": {"f1": 0.9231828571428571, "f1_stderr": 0.0026119602574627677, "em": 0.9154, "em_stderr": 0.002782994521347745},
    "multirc": {"acc": 0.015739769150052464, "acc_stderr": 0.00403399795659578}
  },
  "versions": {
    "boolq": 1, "wic": 0, "copa": 0, "wsc": 0, "cb": 1, "record": 0, "multirc": 1
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0",
    "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_xcopa_0-shot.json (new file, mode 100644)

{
  "results": {
    "xcopa_it": {"acc": 0.672, "acc_stderr": 0.021017027165175485},
    "xcopa_vi": {"acc": 0.538, "acc_stderr": 0.02231833811987053},
    "xcopa_zh": {"acc": 0.584, "acc_stderr": 0.02206494331392886},
    "xcopa_ta": {"acc": 0.544, "acc_stderr": 0.022296238348407053},
    "xcopa_sw": {"acc": 0.512, "acc_stderr": 0.02237662679792717},
    "xcopa_id": {"acc": 0.578, "acc_stderr": 0.022109039310618552},
    "xcopa_tr": {"acc": 0.53, "acc_stderr": 0.02234274819250285},
    "xcopa_ht": {"acc": 0.528, "acc_stderr": 0.02234794983266809},
    "xcopa_qu": {"acc": 0.502, "acc_stderr": 0.02238289498648353},
    "xcopa_th": {"acc": 0.546, "acc_stderr": 0.022288147591176945},
    "xcopa_et": {"acc": 0.482, "acc_stderr": 0.02236856511738799}
  },
  "versions": {
    "xcopa_it": 0, "xcopa_vi": 0, "xcopa_zh": 0, "xcopa_ta": 0, "xcopa_sw": 0, "xcopa_id": 0,
    "xcopa_tr": 0, "xcopa_ht": 0, "xcopa_qu": 0, "xcopa_th": 0, "xcopa_et": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_xnli_0-shot.json (new file, mode 100644)

{
  "results": {
    "xnli_ru": {"acc": 0.3379241516966068, "acc_stderr": 0.006683254094065008},
    "xnli_vi": {"acc": 0.34211576846307384, "acc_stderr": 0.006703255428996599},
    "xnli_zh": {"acc": 0.3447105788423154, "acc_stderr": 0.006715345603576115},
    "xnli_bg": {"acc": 0.34211576846307384, "acc_stderr": 0.0067032554289965995},
    "xnli_el": {"acc": 0.3469061876247505, "acc_stderr": 0.0067254026681375706},
    "xnli_fr": {"acc": 0.3349301397205589, "acc_stderr": 0.006668608672768922},
    "xnli_ur": {"acc": 0.34211576846307384, "acc_stderr": 0.006703255428996604},
    "xnli_hi": {"acc": 0.35588822355289423, "acc_stderr": 0.00676490827777005},
    "xnli_es": {"acc": 0.3349301397205589, "acc_stderr": 0.006668608672768919},
    "xnli_sw": {"acc": 0.3315369261477046, "acc_stderr": 0.006651646309907708},
    "xnli_th": {"acc": 0.34830339321357284, "acc_stderr": 0.006731720358995404},
    "xnli_ar": {"acc": 0.3407185628742515, "acc_stderr": 0.006696653153866837},
    "xnli_en": {"acc": 0.3562874251497006, "acc_stderr": 0.006766603483662201},
    "xnli_de": {"acc": 0.3524950099800399, "acc_stderr": 0.006750291549188483},
    "xnli_tr": {"acc": 0.3399201596806387, "acc_stderr": 0.006692851356332768}
  },
  "versions": {
    "xnli_ru": 0, "xnli_vi": 0, "xnli_zh": 0, "xnli_bg": 0, "xnli_el": 0, "xnli_fr": 0, "xnli_ur": 0,
    "xnli_hi": 0, "xnli_es": 0, "xnli_sw": 0, "xnli_th": 0, "xnli_ar": 0, "xnli_en": 0, "xnli_de": 0, "xnli_tr": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_xstory_cloze_0-shot.json (new file, mode 100644)

{
  "results": {
    "xstory_cloze_zh": {"acc": 0.5645268034414295, "acc_stderr": 0.012759525506489228},
    "xstory_cloze_my": {"acc": 0.47782925215089345, "acc_stderr": 0.012854469625936085},
    "xstory_cloze_id": {"acc": 0.5526141628060887, "acc_stderr": 0.012795688167385315},
    "xstory_cloze_te": {"acc": 0.5334215751158173, "acc_stderr": 0.012838347934731667},
    "xstory_cloze_ar": {"acc": 0.49702183984116477, "acc_stderr": 0.012866897066011233},
    "xstory_cloze_sw": {"acc": 0.4990072799470549, "acc_stderr": 0.01286709995542293},
    "xstory_cloze_hi": {"acc": 0.5234943745863666, "acc_stderr": 0.012852912530051748},
    "xstory_cloze_eu": {"acc": 0.5069490403706155, "acc_stderr": 0.012865882570960722},
    "xstory_cloze_en": {"acc": 0.7729980145598941, "acc_stderr": 0.010779920137756025},
    "xstory_cloze_es": {"acc": 0.6942422236929185, "acc_stderr": 0.011856480568871262},
    "xstory_cloze_ru": {"acc": 0.6340172071475844, "acc_stderr": 0.012396308684399372}
  },
  "versions": {
    "xstory_cloze_zh": 0, "xstory_cloze_my": 0, "xstory_cloze_id": 0, "xstory_cloze_te": 0,
    "xstory_cloze_ar": 0, "xstory_cloze_sw": 0, "xstory_cloze_hi": 0, "xstory_cloze_eu": 0,
    "xstory_cloze_en": 0, "xstory_cloze_es": 0, "xstory_cloze_ru": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_xwinograd_0-shot.json (new file, mode 100644)

{
  "results": {
    "xwinograd_pt": {"acc": 0.714828897338403, "acc_stderr": 0.02789350966043832},
    "xwinograd_jp": {"acc": 0.5985401459854015, "acc_stderr": 0.01583743878453324},
    "xwinograd_en": {"acc": 0.8675268817204301, "acc_stderr": 0.007032136436579812},
    "xwinograd_ru": {"acc": 0.707936507936508, "acc_stderr": 0.02566084582577463},
    "xwinograd_zh": {"acc": 0.7003968253968254, "acc_stderr": 0.020424963888406065},
    "xwinograd_fr": {"acc": 0.6867469879518072, "acc_stderr": 0.051219942106581456}
  },
  "versions": {
    "xwinograd_pt": 0, "xwinograd_jp": 0, "xwinograd_en": 0, "xwinograd_ru": 0, "xwinograd_zh": 0, "xwinograd_fr": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
    "num_fewshot": 0, "batch_size": "auto", "device": "cuda",
    "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}
  }
}
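
The llama-30B README that follows presents the same kind of numbers as the JSON files above, converted to percentages in markdown tables. A small sketch of how such a table could be rendered from one of the JSON files; the harness ships its own table writer, so this standalone version, and the choice of input file, are illustrative only:

```python
import json

path = "results/llama/llama-13B/llama-13B_xwinograd_0-shot.json"  # illustrative choice
with open(path) as f:
    data = json.load(f)

print("| Task |Version| Metric |Value| |Stderr|")
print("|------|------:|--------|----:|---|-----:|")
for task in sorted(data["results"]):
    version = data["versions"][task]
    metrics = data["results"][task]
    for name, value in metrics.items():
        if name.endswith("_stderr"):
            continue
        stderr = metrics.get(f"{name}_stderr")
        stderr_cell = f"{100 * stderr:.2f}" if stderr is not None else ""
        # The README tables report percentages rounded to two decimals.
        print(f"|{task}| {version}|{name} |{100 * value:.2f}|± |{stderr_cell}|")
```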
results/llama/llama-30B/README.md (new file, mode 100644)
# llama-30B
## llama-30B_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|57.37|± | 3.60|
|bigbench_date_understanding | 0|multiple_choice_grade|69.92|± | 2.39|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|54.26|± | 3.11|
|bigbench_dyck_languages | 0|multiple_choice_grade|21.20|± | 1.29|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.58|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|27.86|± | 2.37|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|51.52|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|36.80|± | 2.16|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|25.29|± | 1.64|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|53.00|± | 2.89|
|bigbench_movie_recommendation | 0|multiple_choice_grade|63.20|± | 2.16|
|bigbench_navigate | 0|multiple_choice_grade|49.00|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|55.65|± | 1.11|
|bigbench_ruin_names | 0|multiple_choice_grade|39.73|± | 2.31|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.84|± | 1.26|
|bigbench_snarks | 0|multiple_choice_grade|46.96|± | 3.72|
|bigbench_sports_understanding | 0|multiple_choice_grade|62.37|± | 1.54|
|bigbench_temporal_sequences | 0|multiple_choice_grade|14.60|± | 1.12|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|21.28|± | 1.16|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|15.49|± | 0.87|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|53.00|± | 2.89|
## llama-30B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |46.76|± | 1.46|
| | |acc_norm|45.48|± | 1.46|
|arc_easy | 0|acc |75.34|± | 0.88|
| | |acc_norm|58.96|± | 1.01|
|boolq | 1|acc |68.41|± | 0.81|
|copa | 0|acc |90.00|± | 3.02|
|hellaswag | 0|acc |62.65|± | 0.48|
| | |acc_norm|79.24|± | 0.40|
|mc_taco | 0|em |11.41| | |
| | |f1 |48.36| | |
|openbookqa | 0|acc |29.40|± | 2.04|
| | |acc_norm|42.00|± | 2.21|
|piqa | 0|acc |80.96|± | 0.92|
| | |acc_norm|80.09|± | 0.93|
|prost | 0|acc |25.99|± | 0.32|
| | |acc_norm|29.11|± | 0.33|
|swag | 0|acc |58.61|± | 0.35|
| | |acc_norm|70.36|± | 0.32|
|winogrande | 0|acc |72.77|± | 1.25|
|wsc273 | 0|acc |86.81|± | 2.05|
## llama-30B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc |30.48|± | 1.27|
## llama-30B_human_alignment_0-shot.json
| Task |Version| Metric | Value | |Stderr|
|---------------------------------------|------:|---------------------|------:|---|-----:|
|crows_pairs_english_age | 0|likelihood_difference| 512.91|± | 58.13|
| | |pct_stereotype | 58.24|± | 5.20|
|crows_pairs_english_autre | 0|likelihood_difference|1138.07|± |348.77|
| | |pct_stereotype | 63.64|± | 15.21|
|crows_pairs_english_disability | 0|likelihood_difference| 888.65|± |103.42|
| | |pct_stereotype | 53.85|± | 6.23|
|crows_pairs_english_gender | 0|likelihood_difference| 666.15|± | 42.85|
| | |pct_stereotype | 54.06|± | 2.79|
|crows_pairs_english_nationality | 0|likelihood_difference| 587.28|± | 39.94|
| | |pct_stereotype | 53.24|± | 3.40|
|crows_pairs_english_physical_appearance| 0|likelihood_difference| 540.10|± | 59.14|
| | |pct_stereotype | 52.78|± | 5.92|
|crows_pairs_english_race_color | 0|likelihood_difference| 768.21|± | 39.14|
| | |pct_stereotype | 56.10|± | 2.20|
|crows_pairs_english_religion | 0|likelihood_difference| 807.57|± | 94.38|
| | |pct_stereotype | 62.16|± | 4.62|
|crows_pairs_english_sexual_orientation | 0|likelihood_difference| 754.77|± | 76.83|
| | |pct_stereotype | 63.44|± | 5.02|
|crows_pairs_english_socioeconomic | 0|likelihood_difference| 730.39|± | 54.63|
| | |pct_stereotype | 53.68|± | 3.63|
|crows_pairs_french_age | 0|likelihood_difference| 892.50|± |101.09|
| | |pct_stereotype | 40.00|± | 5.19|
|crows_pairs_french_autre | 0|likelihood_difference| 637.98|± |165.68|
| | |pct_stereotype | 61.54|± | 14.04|
|crows_pairs_french_disability | 0|likelihood_difference|1020.27|± |126.17|
| | |pct_stereotype | 56.06|± | 6.16|
|crows_pairs_french_gender | 0|likelihood_difference|1373.28|± |110.30|
| | |pct_stereotype | 50.16|± | 2.80|
|crows_pairs_french_nationality | 0|likelihood_difference| 985.10|± | 89.08|
| | |pct_stereotype | 38.74|± | 3.07|
|crows_pairs_french_physical_appearance | 0|likelihood_difference| 821.79|± |132.68|
| | |pct_stereotype | 56.94|± | 5.88|
|crows_pairs_french_race_color | 0|likelihood_difference|1061.17|± | 76.68|
| | |pct_stereotype | 41.74|± | 2.30|
|crows_pairs_french_religion | 0|likelihood_difference| 794.02|± | 93.89|
| | |pct_stereotype | 56.52|± | 4.64|
|crows_pairs_french_sexual_orientation | 0|likelihood_difference| 989.08|± |161.13|
| | |pct_stereotype | 71.43|± | 4.76|
|crows_pairs_french_socioeconomic | 0|likelihood_difference| 831.29|± | 87.37|
| | |pct_stereotype | 52.55|± | 3.58|
|ethics_cm | 0|acc | 57.50|± | 0.79|
|ethics_deontology | 0|acc | 54.17|± | 0.83|
| | |em | 6.12| | |
|ethics_justice | 0|acc | 51.70|± | 0.96|
| | |em | 1.33| | |
|ethics_utilitarianism | 0|acc | 50.12|± | 0.72|
|ethics_utilitarianism_original | 0|acc | 93.97|± | 0.34|
|ethics_virtue | 0|acc | 51.82|± | 0.71|
| | |em | 8.14| | |
|toxigen | 0|acc | 42.66|± | 1.61|
| | |acc_norm | 43.19|± | 1.62|
## llama-30B_mathematical_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 3.83|± | 0.20|
| | |f1 |13.91|± | 0.25|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 2.95|± | 0.49|
|math_asdiv | 0|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 4.01|± | 0.90|
|math_geometry | 1|acc | 1.46|± | 0.55|
|math_intermediate_algebra| 1|acc | 0.89|± | 0.31|
|math_num_theory | 1|acc | 2.96|± | 0.73|
|math_prealgebra | 1|acc | 4.13|± | 0.67|
|math_precalc | 1|acc | 1.83|± | 0.57|
|mathqa | 0|acc |30.59|± | 0.84|
| | |acc_norm|30.89|± | 0.85|
## llama-30B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 0.84|± | 0.09|
| | |f1 | 1.65|± | 0.10|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |34.74|± | 0.87|
| | |acc_norm|34.54|± | 0.87|
## llama-30B_mmlu_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|hendrycksTest-abstract_algebra | 0|acc |26.00|± | 4.41|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-anatomy | 0|acc |51.85|± | 4.32|
| | |acc_norm|40.74|± | 4.24|
|hendrycksTest-astronomy | 0|acc |57.24|± | 4.03|
| | |acc_norm|56.58|± | 4.03|
|hendrycksTest-business_ethics | 0|acc |67.00|± | 4.73|
| | |acc_norm|48.00|± | 5.02|
|hendrycksTest-clinical_knowledge | 0|acc |53.21|± | 3.07|
| | |acc_norm|46.42|± | 3.07|
|hendrycksTest-college_biology | 0|acc |61.11|± | 4.08|
| | |acc_norm|42.36|± | 4.13|
|hendrycksTest-college_chemistry | 0|acc |31.00|± | 4.65|
| | |acc_norm|32.00|± | 4.69|
|hendrycksTest-college_computer_science | 0|acc |43.00|± | 4.98|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-college_mathematics | 0|acc |37.00|± | 4.85|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-college_medicine | 0|acc |51.45|± | 3.81|
| | |acc_norm|43.35|± | 3.78|
|hendrycksTest-college_physics | 0|acc |23.53|± | 4.22|
| | |acc_norm|29.41|± | 4.53|
|hendrycksTest-computer_security | 0|acc |66.00|± | 4.76|
| | |acc_norm|58.00|± | 4.96|
|hendrycksTest-conceptual_physics | 0|acc |51.06|± | 3.27|
| | |acc_norm|32.77|± | 3.07|
|hendrycksTest-econometrics | 0|acc |35.09|± | 4.49|
| | |acc_norm|31.58|± | 4.37|
|hendrycksTest-electrical_engineering | 0|acc |51.72|± | 4.16|
| | |acc_norm|38.62|± | 4.06|
|hendrycksTest-elementary_mathematics | 0|acc |44.18|± | 2.56|
| | |acc_norm|37.04|± | 2.49|
|hendrycksTest-formal_logic | 0|acc |42.06|± | 4.42|
| | |acc_norm|39.68|± | 4.38|
|hendrycksTest-global_facts | 0|acc |47.00|± | 5.02|
| | |acc_norm|37.00|± | 4.85|
|hendrycksTest-high_school_biology | 0|acc |67.10|± | 2.67|
| | |acc_norm|54.52|± | 2.83|
|hendrycksTest-high_school_chemistry | 0|acc |39.90|± | 3.45|
| | |acc_norm|36.95|± | 3.40|
|hendrycksTest-high_school_computer_science | 0|acc |61.00|± | 4.90|
| | |acc_norm|47.00|± | 5.02|
|hendrycksTest-high_school_european_history | 0|acc |69.70|± | 3.59|
| | |acc_norm|56.36|± | 3.87|
|hendrycksTest-high_school_geography | 0|acc |75.76|± | 3.05|
| | |acc_norm|55.05|± | 3.54|
|hendrycksTest-high_school_government_and_politics| 0|acc |80.83|± | 2.84|
| | |acc_norm|61.14|± | 3.52|
|hendrycksTest-high_school_macroeconomics | 0|acc |51.54|± | 2.53|
| | |acc_norm|41.54|± | 2.50|
|hendrycksTest-high_school_mathematics | 0|acc |25.93|± | 2.67|
| | |acc_norm|31.48|± | 2.83|
|hendrycksTest-high_school_microeconomics | 0|acc |58.40|± | 3.20|
| | |acc_norm|48.32|± | 3.25|
|hendrycksTest-high_school_physics | 0|acc |31.79|± | 3.80|
| | |acc_norm|31.13|± | 3.78|
|hendrycksTest-high_school_psychology | 0|acc |77.06|± | 1.80|
| | |acc_norm|55.41|± | 2.13|
|hendrycksTest-high_school_statistics | 0|acc |43.52|± | 3.38|
| | |acc_norm|35.65|± | 3.27|
|hendrycksTest-high_school_us_history | 0|acc |72.06|± | 3.15|
| | |acc_norm|55.39|± | 3.49|
|hendrycksTest-high_school_world_history | 0|acc |69.62|± | 2.99|
| | |acc_norm|56.96|± | 3.22|
|hendrycksTest-human_aging | 0|acc |67.26|± | 3.15|
| | |acc_norm|36.32|± | 3.23|
|hendrycksTest-human_sexuality | 0|acc |70.23|± | 4.01|
| | |acc_norm|46.56|± | 4.37|
|hendrycksTest-international_law | 0|acc |70.25|± | 4.17|
| | |acc_norm|76.86|± | 3.85|
|hendrycksTest-jurisprudence | 0|acc |66.67|± | 4.56|
| | |acc_norm|55.56|± | 4.80|
|hendrycksTest-logical_fallacies | 0|acc |69.94|± | 3.60|
| | |acc_norm|53.99|± | 3.92|
|hendrycksTest-machine_learning | 0|acc |40.18|± | 4.65|
| | |acc_norm|30.36|± | 4.36|
|hendrycksTest-management | 0|acc |71.84|± | 4.45|
| | |acc_norm|55.34|± | 4.92|
|hendrycksTest-marketing | 0|acc |84.62|± | 2.36|
| | |acc_norm|76.50|± | 2.78|
|hendrycksTest-medical_genetics | 0|acc |60.00|± | 4.92|
| | |acc_norm|54.00|± | 5.01|
|hendrycksTest-miscellaneous | 0|acc |81.86|± | 1.38|
| | |acc_norm|61.43|± | 1.74|
|hendrycksTest-moral_disputes | 0|acc |61.85|± | 2.62|
| | |acc_norm|45.95|± | 2.68|
|hendrycksTest-moral_scenarios | 0|acc |34.30|± | 1.59|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |61.11|± | 2.79|
| | |acc_norm|50.33|± | 2.86|
|hendrycksTest-philosophy | 0|acc |67.52|± | 2.66|
| | |acc_norm|50.16|± | 2.84|
|hendrycksTest-prehistory | 0|acc |66.36|± | 2.63|
| | |acc_norm|42.90|± | 2.75|
|hendrycksTest-professional_accounting | 0|acc |39.72|± | 2.92|
| | |acc_norm|33.69|± | 2.82|
|hendrycksTest-professional_law | 0|acc |40.03|± | 1.25|
| | |acc_norm|34.35|± | 1.21|
|hendrycksTest-professional_medicine | 0|acc |55.51|± | 3.02|
| | |acc_norm|35.66|± | 2.91|
|hendrycksTest-professional_psychology | 0|acc |58.82|± | 1.99|
| | |acc_norm|43.30|± | 2.00|
|hendrycksTest-public_relations | 0|acc |64.55|± | 4.58|
| | |acc_norm|40.91|± | 4.71|
|hendrycksTest-security_studies | 0|acc |57.14|± | 3.17|
| | |acc_norm|40.41|± | 3.14|
|hendrycksTest-sociology | 0|acc |76.12|± | 3.01|
| | |acc_norm|66.17|± | 3.35|
|hendrycksTest-us_foreign_policy | 0|acc |79.00|± | 4.09|
| | |acc_norm|59.00|± | 4.94|
|hendrycksTest-virology | 0|acc |49.40|± | 3.89|
| | |acc_norm|34.34|± | 3.70|
|hendrycksTest-world_religions | 0|acc |81.29|± | 2.99|
| | |acc_norm|76.61|± | 3.25|
## llama-30B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |58.20|± | 1.10|
|pawsx_en| 0|acc |58.75|± | 1.10|
|pawsx_es| 0|acc |55.80|± | 1.11|
|pawsx_fr| 0|acc |52.85|± | 1.12|
|pawsx_ja| 0|acc |46.75|± | 1.12|
|pawsx_ko| 0|acc |45.70|± | 1.11|
|pawsx_zh| 0|acc |45.90|± | 1.11|
## llama-30B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 47.2|± | 2.23|
|xcopa_ht| 0|acc | 51.8|± | 2.24|
|xcopa_id| 0|acc | 60.6|± | 2.19|
|xcopa_it| 0|acc | 71.4|± | 2.02|
|xcopa_qu| 0|acc | 49.4|± | 2.24|
|xcopa_sw| 0|acc | 52.4|± | 2.24|
|xcopa_ta| 0|acc | 53.2|± | 2.23|
|xcopa_th| 0|acc | 54.6|± | 2.23|
|xcopa_tr| 0|acc | 52.2|± | 2.24|
|xcopa_vi| 0|acc | 52.4|± | 2.24|
|xcopa_zh| 0|acc | 62.2|± | 2.17|
## llama-30B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |34.49|± | 0.67|
|xnli_bg| 0|acc |38.52|± | 0.69|
|xnli_de| 0|acc |43.87|± | 0.70|
|xnli_el| 0|acc |34.91|± | 0.67|
|xnli_en| 0|acc |48.18|± | 0.71|
|xnli_es| 0|acc |40.24|± | 0.69|
|xnli_fr| 0|acc |42.95|± | 0.70|
|xnli_hi| 0|acc |36.47|± | 0.68|
|xnli_ru| 0|acc |38.12|± | 0.69|
|xnli_sw| 0|acc |34.09|± | 0.67|
|xnli_th| 0|acc |33.97|± | 0.67|
|xnli_tr| 0|acc |36.53|± | 0.68|
|xnli_ur| 0|acc |34.31|± | 0.67|
|xnli_vi| 0|acc |35.67|± | 0.68|
|xnli_zh| 0|acc |33.51|± | 0.67|
## llama-30B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |50.89|± | 1.29|
|xstory_cloze_en| 0|acc |78.16|± | 1.06|
|xstory_cloze_es| 0|acc |70.81|± | 1.17|
|xstory_cloze_eu| 0|acc |51.36|± | 1.29|
|xstory_cloze_hi| 0|acc |56.65|± | 1.28|
|xstory_cloze_id| 0|acc |59.23|± | 1.26|
|xstory_cloze_my| 0|acc |48.78|± | 1.29|
|xstory_cloze_ru| 0|acc |66.71|± | 1.21|
|xstory_cloze_sw| 0|acc |50.63|± | 1.29|
|xstory_cloze_te| 0|acc |53.21|± | 1.28|
|xstory_cloze_zh| 0|acc |58.57|± | 1.27|
## llama-30B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |87.40|± | 0.69|
|xwinograd_fr| 0|acc |73.49|± | 4.87|
|xwinograd_jp| 0|acc |67.36|± | 1.51|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |66.98|± | 2.65|
|xwinograd_zh| 0|acc |71.23|± | 2.02|
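The tables above are rendered from the per-task JSON files added in this commit: the raw metric values (fractions in [0, 1]) are scaled to percentages and rounded to two decimals, with each metric's stderr shown beside it. The snippet below is a minimal, illustrative sketch of that conversion using only the standard library; it is not the script the repository itself uses to generate these READMEs.

```python
import json


def results_to_markdown(path):
    """Render a results JSON in the same shape as the tables above:
    values scaled to percent, stderr printed beside its metric."""
    with open(path) as f:
        data = json.load(f)

    lines = [
        "| Task |Version| Metric |Value| |Stderr|",
        "|------|------:|--------|----:|---|-----:|",
    ]
    for task in sorted(data["results"]):
        metrics = data["results"][task]
        version = str(data["versions"].get(task, ""))
        first = True
        for name, value in metrics.items():
            if name.endswith("_stderr"):
                continue  # stderr appears next to its metric, not as its own row
            stderr = metrics.get(name + "_stderr")
            cells = [
                task if first else "",
                version if first else "",
                name,
                f"{value * 100:.2f}",
                "±" if stderr is not None else "",
                f"{stderr * 100:.2f}" if stderr is not None else "",
            ]
            lines.append("|" + "|".join(cells) + "|")
            first = False
    return "\n".join(lines)


if __name__ == "__main__":
    print(results_to_markdown(
        "results/llama/llama-30B/llama-30B_common_sense_reasoning_0-shot.json"
    ))
```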
results/llama/llama-30B/llama-30B_bbh_3-shot.json
0 → 100644
{
  "results": {
    "bigbench_hyperbaton": {
      "multiple_choice_grade": 0.51524,
      "multiple_choice_grade_stderr": 0.0022350513992069
    },
    "bigbench_salient_translation_error_detection": {
      "multiple_choice_grade": 0.19839679358717435,
      "multiple_choice_grade_stderr": 0.012629887094728112
    },
    "bigbench_geometric_shapes": {
      "multiple_choice_grade": 0.2785515320334262,
      "multiple_choice_grade_stderr": 0.023692665345206258,
      "exact_str_match": 0.0,
      "exact_str_match_stderr": 0.0
    },
    "bigbench_navigate": {
      "multiple_choice_grade": 0.49,
      "multiple_choice_grade_stderr": 0.015816135752773193
    },
    "bigbench_date_understanding": {
      "multiple_choice_grade": 0.6991869918699187,
      "multiple_choice_grade_stderr": 0.023906779002093273
    },
    "bigbench_disambiguation_qa": {
      "multiple_choice_grade": 0.5426356589147286,
      "multiple_choice_grade_stderr": 0.031075544990472662
    },
    "bigbench_tracking_shuffled_objects_three_objects": {
      "multiple_choice_grade": 0.53,
      "multiple_choice_grade_stderr": 0.02886365132641709
    },
    "bigbench_dyck_languages": {
      "multiple_choice_grade": 0.212,
      "multiple_choice_grade_stderr": 0.01293148186493804
    },
    "bigbench_formal_fallacies_syllogisms_negation": {
      "multiple_choice_grade": 0.5058450704225352,
      "multiple_choice_grade_stderr": 0.004195767817554208
    },
    "bigbench_tracking_shuffled_objects_seven_objects": {
      "multiple_choice_grade": 0.15485714285714286,
      "multiple_choice_grade_stderr": 0.00865039181414196
    },
    "bigbench_causal_judgement": {
      "multiple_choice_grade": 0.5736842105263158,
      "multiple_choice_grade_stderr": 0.03597255252302466
    },
    "bigbench_movie_recommendation": {
      "multiple_choice_grade": 0.632,
      "multiple_choice_grade_stderr": 0.02158898256835354
    },
    "bigbench_tracking_shuffled_objects_five_objects": {
      "multiple_choice_grade": 0.2128,
      "multiple_choice_grade_stderr": 0.01158102863217863
    },
    "bigbench_snarks": {
      "multiple_choice_grade": 0.4696132596685083,
      "multiple_choice_grade_stderr": 0.03719891321680327
    },
    "bigbench_sports_understanding": {
      "multiple_choice_grade": 0.6237322515212982,
      "multiple_choice_grade_stderr": 0.01543581207286162
    },
    "bigbench_logical_deduction_seven_objects": {
      "multiple_choice_grade": 0.25285714285714284,
      "multiple_choice_grade_stderr": 0.01643996352811702
    },
    "bigbench_temporal_sequences": {
      "multiple_choice_grade": 0.146,
      "multiple_choice_grade_stderr": 0.011171786285496496
    },
    "bigbench_logical_deduction_five_objects": {
      "multiple_choice_grade": 0.368,
      "multiple_choice_grade_stderr": 0.021588982568353548
    },
    "bigbench_ruin_names": {
      "multiple_choice_grade": 0.39732142857142855,
      "multiple_choice_grade_stderr": 0.023145155753004788
    },
    "bigbench_logical_deduction_three_objects": {
      "multiple_choice_grade": 0.53,
      "multiple_choice_grade_stderr": 0.02886365132641709
    },
    "bigbench_reasoning_about_colored_objects": {
      "multiple_choice_grade": 0.5565,
      "multiple_choice_grade_stderr": 0.011111507899646487
    }
  },
  "versions": {
    "bigbench_hyperbaton": 0,
    "bigbench_salient_translation_error_detection": 0,
    "bigbench_geometric_shapes": 0,
    "bigbench_navigate": 0,
    "bigbench_date_understanding": 0,
    "bigbench_disambiguation_qa": 0,
    "bigbench_tracking_shuffled_objects_three_objects": 0,
    "bigbench_dyck_languages": 0,
    "bigbench_formal_fallacies_syllogisms_negation": 0,
    "bigbench_tracking_shuffled_objects_seven_objects": 0,
    "bigbench_causal_judgement": 0,
    "bigbench_movie_recommendation": 0,
    "bigbench_tracking_shuffled_objects_five_objects": 0,
    "bigbench_snarks": 0,
    "bigbench_sports_understanding": 0,
    "bigbench_logical_deduction_seven_objects": 0,
    "bigbench_temporal_sequences": 0,
    "bigbench_logical_deduction_five_objects": 0,
    "bigbench_ruin_names": 0,
    "bigbench_logical_deduction_three_objects": 0,
    "bigbench_reasoning_about_colored_objects": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
    "num_fewshot": 3,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
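Each results file also records the exact run configuration under "config". As a hedged sketch only: assuming the harness's `lm_eval.evaluator.simple_evaluate` entry point and keyword names at this commit, and using the local pretrained path recorded above (which will differ on other machines), a file of this shape could be regenerated programmatically.

```python
import json

from lm_eval import evaluator

# Values mirror the "config" block above; the keyword names are assumed to
# match evaluator.simple_evaluate in the harness at this commit.
results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    model_args="pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
    tasks=["bigbench_causal_judgement", "bigbench_navigate"],  # subset shown for illustration
    num_fewshot=3,
    batch_size="auto",
    device="cuda",
    no_cache=True,
)

# Persist the results (including the "config" block) like the files in this commit.
with open("llama-30B_bbh_3-shot.json", "w") as f:
    json.dump(results, f, indent=2)
```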
results/llama/llama-30B/llama-30B_common_sense_reasoning_0-shot.json
0 → 100644
{
  "results": {
    "hellaswag": {
      "acc": 0.6264688309101772,
      "acc_stderr": 0.00482752658488968,
      "acc_norm": 0.7923720374427405,
      "acc_norm_stderr": 0.00404779964623464
    },
    "copa": {
      "acc": 0.9,
      "acc_stderr": 0.030151134457776348
    },
    "prost": {
      "acc": 0.2598740392826644,
      "acc_stderr": 0.003204110008963041,
      "acc_norm": 0.2910973526900085,
      "acc_norm_stderr": 0.003318834364612203
    },
    "boolq": {
      "acc": 0.6840978593272171,
      "acc_stderr": 0.008130700051380873
    },
    "mc_taco": {
      "em": 0.11411411411411411,
      "f1": 0.48361974757894227
    },
    "winogrande": {
      "acc": 0.7277032359905288,
      "acc_stderr": 0.012510697991453936
    },
    "arc_challenge": {
      "acc": 0.46757679180887374,
      "acc_stderr": 0.014580637569995423,
      "acc_norm": 0.454778156996587,
      "acc_norm_stderr": 0.014551507060836352
    },
    "wsc273": {
      "acc": 0.8681318681318682,
      "acc_stderr": 0.020515321360773595
    },
    "openbookqa": {
      "acc": 0.294,
      "acc_stderr": 0.020395095484936603,
      "acc_norm": 0.42,
      "acc_norm_stderr": 0.02209471322976178
    },
    "swag": {
      "acc": 0.5861241627511746,
      "acc_stderr": 0.0034822550028030703,
      "acc_norm": 0.7036389083275018,
      "acc_norm_stderr": 0.0032286148364766096
    },
    "arc_easy": {
      "acc": 0.7533670033670034,
      "acc_stderr": 0.008844984581934908,
      "acc_norm": 0.5896464646464646,
      "acc_norm_stderr": 0.01009353125576545
    },
    "piqa": {
      "acc": 0.809575625680087,
      "acc_stderr": 0.009160842206469637,
      "acc_norm": 0.8008705114254625,
      "acc_norm_stderr": 0.009317391893706834
    }
  },
  "versions": {
    "hellaswag": 0,
    "copa": 0,
    "prost": 0,
    "boolq": 1,
    "mc_taco": 0,
    "winogrande": 0,
    "arc_challenge": 0,
    "wsc273": 0,
    "openbookqa": 0,
    "swag": 0,
    "arc_easy": 0,
    "piqa": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}