gaoqiong / lm-evaluation-harness

Commit afbf8e66
Authored May 08, 2023 by Julen Etxaniz
Parent: 0e849758

    add more mpt and llama results

Showing 14 changed files with 1800 additions and 0 deletions.
results/llama/llama-7B/README.md (+9, -0)
results/llama/llama-7B/llama-7B_unscramble_0-shot.json (+42, -0)
results/mpt/mpt-7b/README.md (+339, -0)
results/mpt/mpt-7b/mpt-7b_anli_0-shot.json (+32, -0)
results/mpt/mpt-7b/mpt-7b_blimp_0-shot.json (+352, -0)
results/mpt/mpt-7b/mpt-7b_human_alignment_0-shot.json (+197, -0)
results/mpt/mpt-7b/mpt-7b_mmlu_5-shot.json (+416, -0)
results/mpt/mpt-7b/mpt-7b_pawsx_0-shot.json (+52, -0)
results/mpt/mpt-7b/mpt-7b_reading_comprehension_0-shot.json (+36, -0)
results/mpt/mpt-7b/mpt-7b_unscramble_0-shot.json (+42, -0)
results/mpt/mpt-7b/mpt-7b_xcopa_0-shot.json (+72, -0)
results/mpt/mpt-7b/mpt-7b_xnli_0-shot.json (+92, -0)
results/mpt/mpt-7b/mpt-7b_xstory_cloze_0-shot.json (+72, -0)
results/mpt/mpt-7b/mpt-7b_xwinograd_0-shot.json (+47, -0)
results/llama/llama-7B/README.md

@@ -432,6 +432,15 @@
...
| | |f1 |11.35|± | 0.23|
|race| 1|acc |39.90|± | 1.52|
## llama-7B_unscramble_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|----------------|------:|------|----:|---|-----:|
|anagrams1 | 0|acc | 0|± | 0|
|anagrams2 | 0|acc | 0|± | 0|
|cycle_letters | 0|acc | 0|± | 0|
|random_insertion| 0|acc | 0|± | 0|
|reversed_words | 0|acc | 0|± | 0|
## llama-7B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
...
results/llama/llama-7B/llama-7B_unscramble_0-shot.json
new file mode 100644

{
  "results": {
    "anagrams1": {"acc": 0.0, "acc_stderr": 0.0},
    "anagrams2": {"acc": 0.0, "acc_stderr": 0.0},
    "cycle_letters": {"acc": 0.0, "acc_stderr": 0.0},
    "random_insertion": {"acc": 0.0, "acc_stderr": 0.0},
    "reversed_words": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {
    "anagrams1": 0,
    "anagrams2": 0,
    "cycle_letters": 0,
    "random_insertion": 0,
    "reversed_words": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/7B",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
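For context, a result file like the one above is produced by running the evaluation harness with the arguments recorded in its "config" block. The sketch below is a hedged reconstruction of that invocation through the harness's Python entry point (`lm_eval.evaluator.simple_evaluate`); the exact keyword names and output handling should be checked against this repository's `main.py`, and the local model path is copied verbatim from the config.

```python
# Hedged sketch: reproducing a result file such as llama-7B_unscramble_0-shot.json.
# The keyword arguments mirror the "config" block recorded in the JSON; check them
# against this repository's main.py / lm_eval.evaluator.simple_evaluate, since the
# exact signature may differ between harness versions.
import json

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    model_args="pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/7B",
    tasks=["anagrams1", "anagrams2", "cycle_letters",
           "random_insertion", "reversed_words"],
    num_fewshot=0,
    batch_size="auto",
    device="cuda",
    no_cache=True,
)

# The returned dict uses the same "results" / "versions" / "config" layout
# as the committed files.
with open("results/llama/llama-7B/llama-7B_unscramble_0-shot.json", "w") as f:
    json.dump(results, f, indent=2)
```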
results/mpt/mpt-7b/README.md
Diff collapsed; the 339 added lines of this README are not shown here.
results/mpt/mpt-7b/mpt-7b_anli_0-shot.json
new file mode 100644

{
  "results": {
    "anli_r1": {"acc": 0.332, "acc_stderr": 0.014899597242811485},
    "anli_r2": {"acc": 0.336, "acc_stderr": 0.014944140233795027},
    "anli_r3": {"acc": 0.345, "acc_stderr": 0.013728421539454881}
  },
  "versions": {
    "anli_r1": 0,
    "anli_r2": 0,
    "anli_r3": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
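The markdown tables in the README files of this commit summarize these JSON results. The snippet below is an approximate, hypothetical converter from one result file to table rows in that style; the harness itself also provides `lm_eval.evaluator.make_table(results)`, which produces a similar layout.

```python
# Hedged sketch: rendering one of these result JSONs as markdown table rows
# in the style of the README files in this commit. Values are shown as
# percentages with two decimals, matching the README tables.
import json

with open("results/mpt/mpt-7b/mpt-7b_anli_0-shot.json") as f:
    data = json.load(f)

print("| Task |Version|Metric|Value| |Stderr|")
print("|------|------:|------|----:|---|-----:|")
for task, metrics in data["results"].items():
    version = data["versions"][task]
    for metric, value in metrics.items():
        if metric.endswith("_stderr"):
            continue  # stderr is emitted alongside its metric below
        stderr = metrics.get(metric + "_stderr")
        stderr_cell = f"{100 * stderr:.2f}" if stderr is not None else ""
        print(f"|{task}| {version}|{metric}|{100 * value:.2f}|± |{stderr_cell}|")
```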
results/mpt/mpt-7b/mpt-7b_blimp_0-shot.json
new file mode 100644

{
  "results": {
    "blimp_adjunct_island": {"acc": 0.878, "acc_stderr": 0.010354864712936724},
    "blimp_anaphor_gender_agreement": {"acc": 0.995, "acc_stderr": 0.002231586874844882},
    "blimp_anaphor_number_agreement": {"acc": 0.995, "acc_stderr": 0.0022315868748448817},
    "blimp_animate_subject_passive": {"acc": 0.775, "acc_stderr": 0.013211720158614753},
    "blimp_animate_subject_trans": {"acc": 0.884, "acc_stderr": 0.010131468138756997},
    "blimp_causative": {"acc": 0.747, "acc_stderr": 0.01375427861358708},
    "blimp_complex_NP_island": {"acc": 0.528, "acc_stderr": 0.01579447578951147},
    "blimp_coordinate_structure_constraint_complex_left_branch": {"acc": 0.779, "acc_stderr": 0.013127502859696239},
    "blimp_coordinate_structure_constraint_object_extraction": {"acc": 0.848, "acc_stderr": 0.011358918303475293},
    "blimp_determiner_noun_agreement_1": {"acc": 0.992, "acc_stderr": 0.002818500300504507},
    "blimp_determiner_noun_agreement_2": {"acc": 0.974, "acc_stderr": 0.0050348137353182195},
    "blimp_determiner_noun_agreement_irregular_1": {"acc": 0.936, "acc_stderr": 0.007743640226919308},
    "blimp_determiner_noun_agreement_irregular_2": {"acc": 0.928, "acc_stderr": 0.008178195576218681},
    "blimp_determiner_noun_agreement_with_adj_2": {"acc": 0.935, "acc_stderr": 0.007799733061832037},
    "blimp_determiner_noun_agreement_with_adj_irregular_1": {"acc": 0.883, "acc_stderr": 0.010169287802713327},
    "blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.919, "acc_stderr": 0.008632121032139973},
    "blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.972, "acc_stderr": 0.005219506034410047},
    "blimp_distractor_agreement_relational_noun": {"acc": 0.889, "acc_stderr": 0.009938701010583726},
    "blimp_distractor_agreement_relative_clause": {"acc": 0.743, "acc_stderr": 0.013825416526895042},
    "blimp_drop_argument": {"acc": 0.787, "acc_stderr": 0.012953717566737234},
    "blimp_ellipsis_n_bar_1": {"acc": 0.79, "acc_stderr": 0.012886662332274522},
    "blimp_ellipsis_n_bar_2": {"acc": 0.928, "acc_stderr": 0.008178195576218681},
    "blimp_existential_there_object_raising": {"acc": 0.834, "acc_stderr": 0.01177211037081219},
    "blimp_existential_there_quantifiers_1": {"acc": 0.988, "acc_stderr": 0.003444977194099841},
    "blimp_existential_there_quantifiers_2": {"acc": 0.27, "acc_stderr": 0.014046255632633908},
    "blimp_existential_there_subject_raising": {"acc": 0.888, "acc_stderr": 0.00997775303139724},
    "blimp_expletive_it_object_raising": {"acc": 0.8, "acc_stderr": 0.012655439943366646},
    "blimp_inchoative": {"acc": 0.673, "acc_stderr": 0.01484221315341125},
    "blimp_intransitive": {"acc": 0.832, "acc_stderr": 0.01182860583145426},
    "blimp_irregular_past_participle_adjectives": {"acc": 0.972, "acc_stderr": 0.005219506034410053},
    "blimp_irregular_past_participle_verbs": {"acc": 0.885, "acc_stderr": 0.010093407594904614},
    "blimp_irregular_plural_subject_verb_agreement_1": {"acc": 0.92, "acc_stderr": 0.008583336977753651},
    "blimp_irregular_plural_subject_verb_agreement_2": {"acc": 0.908, "acc_stderr": 0.009144376393151112},
    "blimp_left_branch_island_echo_question": {"acc": 0.432, "acc_stderr": 0.01567232023733621},
    "blimp_left_branch_island_simple_question": {"acc": 0.897, "acc_stderr": 0.009616833339695798},
    "blimp_matrix_question_npi_licensor_present": {"acc": 0.705, "acc_stderr": 0.014428554438445514},
    "blimp_npi_present_1": {"acc": 0.579, "acc_stderr": 0.015620595475301318},
    "blimp_npi_present_2": {"acc": 0.688, "acc_stderr": 0.01465847437050901},
    "blimp_only_npi_licensor_present": {"acc": 0.916, "acc_stderr": 0.008776162089491096},
    "blimp_only_npi_scope": {"acc": 0.732, "acc_stderr": 0.014013292702729491},
    "blimp_passive_1": {"acc": 0.887, "acc_stderr": 0.010016552866696863},
    "blimp_passive_2": {"acc": 0.895, "acc_stderr": 0.009698921026024947},
    "blimp_principle_A_c_command": {"acc": 0.75, "acc_stderr": 0.013699915608779773},
    "blimp_principle_A_case_1": {"acc": 1.0, "acc_stderr": 0.0},
    "blimp_principle_A_case_2": {"acc": 0.94, "acc_stderr": 0.007513751157474928},
    "blimp_principle_A_domain_1": {"acc": 0.997, "acc_stderr": 0.0017303161543469417},
    "blimp_principle_A_domain_2": {"acc": 0.828, "acc_stderr": 0.011939788882495321},
    "blimp_principle_A_domain_3": {"acc": 0.761, "acc_stderr": 0.013493000446937591},
    "blimp_principle_A_reconstruction": {"acc": 0.41, "acc_stderr": 0.015560917136921646},
    "blimp_regular_plural_subject_verb_agreement_1": {"acc": 0.971, "acc_stderr": 0.005309160685756986},
    "blimp_regular_plural_subject_verb_agreement_2": {"acc": 0.907, "acc_stderr": 0.009188875634996659},
    "blimp_sentential_negation_npi_licensor_present": {"acc": 0.989, "acc_stderr": 0.0032999833166078166},
    "blimp_sentential_negation_npi_scope": {"acc": 0.733, "acc_stderr": 0.013996674851796264},
    "blimp_sentential_subject_island": {"acc": 0.399, "acc_stderr": 0.015493193313162906},
    "blimp_superlative_quantifiers_1": {"acc": 0.822, "acc_stderr": 0.012102167676183596},
    "blimp_superlative_quantifiers_2": {"acc": 0.897, "acc_stderr": 0.009616833339695796},
    "blimp_tough_vs_raising_1": {"acc": 0.69, "acc_stderr": 0.014632638658632898},
    "blimp_tough_vs_raising_2": {"acc": 0.829, "acc_stderr": 0.011912216456264606},
    "blimp_transitive": {"acc": 0.872, "acc_stderr": 0.010570133761108658},
    "blimp_wh_island": {"acc": 0.813, "acc_stderr": 0.012336254828074126},
    "blimp_wh_questions_object_gap": {"acc": 0.763, "acc_stderr": 0.01345407046257794},
    "blimp_wh_questions_subject_gap": {"acc": 0.89, "acc_stderr": 0.009899393819724435},
    "blimp_wh_questions_subject_gap_long_distance": {"acc": 0.893, "acc_stderr": 0.009779910359847169},
    "blimp_wh_vs_that_no_gap": {"acc": 0.946, "acc_stderr": 0.007150883521295428},
    "blimp_wh_vs_that_no_gap_long_distance": {"acc": 0.951, "acc_stderr": 0.006829761756140921},
    "blimp_wh_vs_that_with_gap": {"acc": 0.321, "acc_stderr": 0.01477082181793466},
    "blimp_wh_vs_that_with_gap_long_distance": {"acc": 0.292, "acc_stderr": 0.014385511563477347}
  },
  "versions": {
    "blimp_adjunct_island": 0, "blimp_anaphor_gender_agreement": 0, "blimp_anaphor_number_agreement": 0,
    "blimp_animate_subject_passive": 0, "blimp_animate_subject_trans": 0, "blimp_causative": 0,
    "blimp_complex_NP_island": 0, "blimp_coordinate_structure_constraint_complex_left_branch": 0, "blimp_coordinate_structure_constraint_object_extraction": 0,
    "blimp_determiner_noun_agreement_1": 0, "blimp_determiner_noun_agreement_2": 0, "blimp_determiner_noun_agreement_irregular_1": 0,
    "blimp_determiner_noun_agreement_irregular_2": 0, "blimp_determiner_noun_agreement_with_adj_2": 0, "blimp_determiner_noun_agreement_with_adj_irregular_1": 0,
    "blimp_determiner_noun_agreement_with_adj_irregular_2": 0, "blimp_determiner_noun_agreement_with_adjective_1": 0, "blimp_distractor_agreement_relational_noun": 0,
    "blimp_distractor_agreement_relative_clause": 0, "blimp_drop_argument": 0, "blimp_ellipsis_n_bar_1": 0,
    "blimp_ellipsis_n_bar_2": 0, "blimp_existential_there_object_raising": 0, "blimp_existential_there_quantifiers_1": 0,
    "blimp_existential_there_quantifiers_2": 0, "blimp_existential_there_subject_raising": 0, "blimp_expletive_it_object_raising": 0,
    "blimp_inchoative": 0, "blimp_intransitive": 0, "blimp_irregular_past_participle_adjectives": 0,
    "blimp_irregular_past_participle_verbs": 0, "blimp_irregular_plural_subject_verb_agreement_1": 0, "blimp_irregular_plural_subject_verb_agreement_2": 0,
    "blimp_left_branch_island_echo_question": 0, "blimp_left_branch_island_simple_question": 0, "blimp_matrix_question_npi_licensor_present": 0,
    "blimp_npi_present_1": 0, "blimp_npi_present_2": 0, "blimp_only_npi_licensor_present": 0,
    "blimp_only_npi_scope": 0, "blimp_passive_1": 0, "blimp_passive_2": 0,
    "blimp_principle_A_c_command": 0, "blimp_principle_A_case_1": 0, "blimp_principle_A_case_2": 0,
    "blimp_principle_A_domain_1": 0, "blimp_principle_A_domain_2": 0, "blimp_principle_A_domain_3": 0,
    "blimp_principle_A_reconstruction": 0, "blimp_regular_plural_subject_verb_agreement_1": 0, "blimp_regular_plural_subject_verb_agreement_2": 0,
    "blimp_sentential_negation_npi_licensor_present": 0, "blimp_sentential_negation_npi_scope": 0, "blimp_sentential_subject_island": 0,
    "blimp_superlative_quantifiers_1": 0, "blimp_superlative_quantifiers_2": 0, "blimp_tough_vs_raising_1": 0,
    "blimp_tough_vs_raising_2": 0, "blimp_transitive": 0, "blimp_wh_island": 0,
    "blimp_wh_questions_object_gap": 0, "blimp_wh_questions_subject_gap": 0, "blimp_wh_questions_subject_gap_long_distance": 0,
    "blimp_wh_vs_that_no_gap": 0, "blimp_wh_vs_that_no_gap_long_distance": 0, "blimp_wh_vs_that_with_gap": 0,
    "blimp_wh_vs_that_with_gap_long_distance": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_human_alignment_0-shot.json
new file mode 100644

{
  "results": {
    "crows_pairs_english_age": {"likelihood_difference": 4.1510989010989015, "likelihood_difference_stderr": 0.38322042294913006, "pct_stereotype": 0.7362637362637363, "pct_stereotype_stderr": 0.04644942852497395},
    "crows_pairs_english_autre": {"likelihood_difference": 5.056818181818182, "likelihood_difference_stderr": 1.7702853227564594, "pct_stereotype": 0.7272727272727273, "pct_stereotype_stderr": 0.14083575804390605},
    "crows_pairs_english_disability": {"likelihood_difference": 6.019230769230769, "likelihood_difference_stderr": 0.6330801651940536, "pct_stereotype": 0.7692307692307693, "pct_stereotype_stderr": 0.05266563052934291},
    "crows_pairs_english_gender": {"likelihood_difference": 2.682421875, "likelihood_difference_stderr": 0.17011984636779479, "pct_stereotype": 0.6375, "pct_stereotype_stderr": 0.02691527109619775},
    "crows_pairs_english_nationality": {"likelihood_difference": 3.498263888888889, "likelihood_difference_stderr": 0.21512426671182808, "pct_stereotype": 0.6157407407407407, "pct_stereotype_stderr": 0.03317354514310742},
    "crows_pairs_english_physical_appearance": {"likelihood_difference": 3.7378472222222223, "likelihood_difference_stderr": 0.3384578916047944, "pct_stereotype": 0.7222222222222222, "pct_stereotype_stderr": 0.053156331218399945},
    "crows_pairs_english_race_color": {"likelihood_difference": 3.3619586614173227, "likelihood_difference_stderr": 0.14103384370541044, "pct_stereotype": 0.5728346456692913, "pct_stereotype_stderr": 0.021968918082519016},
    "crows_pairs_english_religion": {"likelihood_difference": 3.6644144144144146, "likelihood_difference_stderr": 0.33863382204528913, "pct_stereotype": 0.7297297297297297, "pct_stereotype_stderr": 0.04234321361084538},
    "crows_pairs_english_sexual_orientation": {"likelihood_difference": 4.630376344086022, "likelihood_difference_stderr": 0.4574502494208299, "pct_stereotype": 0.8279569892473119, "pct_stereotype_stderr": 0.03934852812061865},
    "crows_pairs_english_socioeconomic": {"likelihood_difference": 4.065131578947368, "likelihood_difference_stderr": 0.23523235884669547, "pct_stereotype": 0.6789473684210526, "pct_stereotype_stderr": 0.03396059335824887},
    "crows_pairs_french_age": {"likelihood_difference": 3.609722222222222, "likelihood_difference_stderr": 0.3615396777446631, "pct_stereotype": 0.4222222222222222, "pct_stereotype_stderr": 0.05235473399540658},
    "crows_pairs_french_autre": {"likelihood_difference": 2.6923076923076925, "likelihood_difference_stderr": 0.9229767573858816, "pct_stereotype": 0.6153846153846154, "pct_stereotype_stderr": 0.14044168141158106},
    "crows_pairs_french_disability": {"likelihood_difference": 4.958333333333333, "likelihood_difference_stderr": 0.42689940749326905, "pct_stereotype": 0.6363636363636364, "pct_stereotype_stderr": 0.05966637484671758},
    "crows_pairs_french_gender": {"likelihood_difference": 3.213785046728972, "likelihood_difference_stderr": 0.17588386956758606, "pct_stereotype": 0.5109034267912772, "pct_stereotype_stderr": 0.027944203070818633},
    "crows_pairs_french_nationality": {"likelihood_difference": 3.883399209486166, "likelihood_difference_stderr": 0.21844567559439967, "pct_stereotype": 0.3438735177865613, "pct_stereotype_stderr": 0.029922155720849428},
    "crows_pairs_french_physical_appearance": {"likelihood_difference": 3.2274305555555554, "likelihood_difference_stderr": 0.4328667471914375, "pct_stereotype": 0.5972222222222222, "pct_stereotype_stderr": 0.05820650942569533},
    "crows_pairs_french_race_color": {"likelihood_difference": 3.161413043478261, "likelihood_difference_stderr": 0.16557903974411925, "pct_stereotype": 0.4369565217391304, "pct_stereotype_stderr": 0.023151745316873387},
    "crows_pairs_french_religion": {"likelihood_difference": 3.5673913043478263, "likelihood_difference_stderr": 0.3368331015818195, "pct_stereotype": 0.6260869565217392, "pct_stereotype_stderr": 0.045315858286449635},
    "crows_pairs_french_sexual_orientation": {"likelihood_difference": 4.791208791208791, "likelihood_difference_stderr": 0.4009539855629619, "pct_stereotype": 0.7802197802197802, "pct_stereotype_stderr": 0.043649726328985346},
    "crows_pairs_french_socioeconomic": {"likelihood_difference": 3.9939413265306123, "likelihood_difference_stderr": 0.263093158126228, "pct_stereotype": 0.6581632653061225, "pct_stereotype_stderr": 0.033967132039868675},
    "ethics_cm": {"acc": 0.5459459459459459, "acc_stderr": 0.007988936899457039},
    "ethics_deontology": {"acc": 0.5025027808676307, "acc_stderr": 0.008339021933755771, "em": 0.004449388209121246},
    "ethics_justice": {"acc": 0.5196005917159763, "acc_stderr": 0.009609770755397009, "em": 0.011834319526627219},
    "ethics_utilitarianism": {"acc": 0.5748752079866889, "acc_stderr": 0.007130302336230959},
    "ethics_utilitarianism_original": {"acc": 0.9956322795341098, "acc_stderr": 0.000951129914345755},
    "ethics_virtue": {"acc": 0.8040201005025126, "acc_stderr": 0.005628417801676332, "em": 0.12562814070351758},
    "toxigen": {"acc": 0.4319148936170213, "acc_stderr": 0.016164899004911828, "acc_norm": 0.4319148936170213, "acc_norm_stderr": 0.016164899004911828}
  },
  "versions": {
    "crows_pairs_english_age": 0,
    "crows_pairs_english_autre": 0,
    "crows_pairs_english_disability": 0,
    "crows_pairs_english_gender": 0,
    "crows_pairs_english_nationality": 0,
    "crows_pairs_english_physical_appearance": 0,
    "crows_pairs_english_race_color": 0,
    "crows_pairs_english_religion": 0,
    "crows_pairs_english_sexual_orientation": 0,
    "crows_pairs_english_socioeconomic": 0,
    "crows_pairs_french_age": 0,
    "crows_pairs_french_autre": 0,
    "crows_pairs_french_disability": 0,
    "crows_pairs_french_gender": 0,
    "crows_pairs_french_nationality": 0,
    "crows_pairs_french_physical_appearance": 0,
    "crows_pairs_french_race_color": 0,
    "crows_pairs_french_religion": 0,
    "crows_pairs_french_sexual_orientation": 0,
    "crows_pairs_french_socioeconomic": 0,
    "ethics_cm": 0,
    "ethics_deontology": 0,
    "ethics_justice": 0,
    "ethics_utilitarianism": 0,
    "ethics_utilitarianism_original": 0,
    "ethics_virtue": 0,
    "toxigen": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_mmlu_5-shot.json
new file mode 100644

{
  "results": {
    "hendrycksTest-abstract_algebra": {"acc": 0.18, "acc_stderr": 0.03861229196653695, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256},
    "hendrycksTest-anatomy": {"acc": 0.3851851851851852, "acc_stderr": 0.042039210401562783, "acc_norm": 0.37777777777777777, "acc_norm_stderr": 0.04188307537595853},
    "hendrycksTest-astronomy": {"acc": 0.39473684210526316, "acc_stderr": 0.039777499346220734, "acc_norm": 0.42105263157894735, "acc_norm_stderr": 0.04017901275981748},
    "hendrycksTest-business_ethics": {"acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795},
    "hendrycksTest-clinical_knowledge": {"acc": 0.3320754716981132, "acc_stderr": 0.028985455652334388, "acc_norm": 0.37735849056603776, "acc_norm_stderr": 0.029832808114796005},
    "hendrycksTest-college_biology": {"acc": 0.3819444444444444, "acc_stderr": 0.040629907841466674, "acc_norm": 0.3541666666666667, "acc_norm_stderr": 0.039994111357535424},
    "hendrycksTest-college_chemistry": {"acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025},
    "hendrycksTest-college_computer_science": {"acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034},
    "hendrycksTest-college_mathematics": {"acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045},
    "hendrycksTest-college_medicine": {"acc": 0.36416184971098264, "acc_stderr": 0.03669072477416907, "acc_norm": 0.3468208092485549, "acc_norm_stderr": 0.036291466701596636},
    "hendrycksTest-college_physics": {"acc": 0.30392156862745096, "acc_stderr": 0.045766654032077615, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.04690650298201943},
    "hendrycksTest-computer_security": {"acc": 0.41, "acc_stderr": 0.04943110704237102, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025},
    "hendrycksTest-conceptual_physics": {"acc": 0.3276595744680851, "acc_stderr": 0.030683020843231015, "acc_norm": 0.2553191489361702, "acc_norm_stderr": 0.02850485647051418},
    "hendrycksTest-econometrics": {"acc": 0.2719298245614035, "acc_stderr": 0.04185774424022056, "acc_norm": 0.23684210526315788, "acc_norm_stderr": 0.039994238792813365},
    "hendrycksTest-electrical_engineering": {"acc": 0.36551724137931035, "acc_stderr": 0.040131241954243856, "acc_norm": 0.33793103448275863, "acc_norm_stderr": 0.03941707632064889},
    "hendrycksTest-elementary_mathematics": {"acc": 0.29894179894179895, "acc_stderr": 0.023577604791655802, "acc_norm": 0.28835978835978837, "acc_norm_stderr": 0.023330654054535892},
    "hendrycksTest-formal_logic": {"acc": 0.30952380952380953, "acc_stderr": 0.04134913018303316, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.040406101782088394},
    "hendrycksTest-global_facts": {"acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045},
    "hendrycksTest-high_school_biology": {"acc": 0.36451612903225805, "acc_stderr": 0.02737987122994325, "acc_norm": 0.3903225806451613, "acc_norm_stderr": 0.027751256636969583},
    "hendrycksTest-high_school_chemistry": {"acc": 0.21182266009852216, "acc_stderr": 0.02874898368994106, "acc_norm": 0.21674876847290642, "acc_norm_stderr": 0.028990331252516235},
    "hendrycksTest-high_school_computer_science": {"acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025},
    "hendrycksTest-high_school_european_history": {"acc": 0.38181818181818183, "acc_stderr": 0.03793713171165635, "acc_norm": 0.37575757575757573, "acc_norm_stderr": 0.03781887353205983},
    "hendrycksTest-high_school_geography": {"acc": 0.3838383838383838, "acc_stderr": 0.03464881675016339, "acc_norm": 0.40404040404040403, "acc_norm_stderr": 0.03496130972056128},
    "hendrycksTest-high_school_government_and_politics": {"acc": 0.41450777202072536, "acc_stderr": 0.03555300319557673, "acc_norm": 0.41450777202072536, "acc_norm_stderr": 0.03555300319557672},
    "hendrycksTest-high_school_macroeconomics": {"acc": 0.3487179487179487, "acc_stderr": 0.024162780284017717, "acc_norm": 0.29743589743589743, "acc_norm_stderr": 0.02317740813146594},
    "hendrycksTest-high_school_mathematics": {"acc": 0.29259259259259257, "acc_stderr": 0.027738969632176088, "acc_norm": 0.3037037037037037, "acc_norm_stderr": 0.02803792996911499},
    "hendrycksTest-high_school_microeconomics": {"acc": 0.33613445378151263, "acc_stderr": 0.030684737115135353, "acc_norm": 0.3697478991596639, "acc_norm_stderr": 0.03135709599613591},
    "hendrycksTest-high_school_physics": {"acc": 0.2781456953642384, "acc_stderr": 0.03658603262763743, "acc_norm": 0.2781456953642384, "acc_norm_stderr": 0.03658603262763743},
    "hendrycksTest-high_school_psychology": {"acc": 0.46972477064220186, "acc_stderr": 0.021397988604936965, "acc_norm": 0.44587155963302755, "acc_norm_stderr": 0.02131133500970858},
    "hendrycksTest-high_school_statistics": {"acc": 0.3287037037037037, "acc_stderr": 0.03203614084670058, "acc_norm": 0.32407407407407407, "acc_norm_stderr": 0.03191923445686185},
    "hendrycksTest-high_school_us_history": {"acc": 0.3431372549019608, "acc_stderr": 0.03332139944668085, "acc_norm": 0.3137254901960784, "acc_norm_stderr": 0.032566854844603886},
    "hendrycksTest-high_school_world_history": {"acc": 0.29535864978902954, "acc_stderr": 0.029696338713422876, "acc_norm": 0.2869198312236287, "acc_norm_stderr": 0.02944377302259469},
    "hendrycksTest-human_aging": {"acc": 0.336322869955157, "acc_stderr": 0.031708824268455, "acc_norm": 0.3273542600896861, "acc_norm_stderr": 0.031493846709941306},
    "hendrycksTest-human_sexuality": {"acc": 0.2748091603053435, "acc_stderr": 0.03915345408847836, "acc_norm": 0.3282442748091603, "acc_norm_stderr": 0.041184385658062976},
    "hendrycksTest-international_law": {"acc": 0.371900826446281, "acc_stderr": 0.04412015806624504, "acc_norm": 0.49586776859504134, "acc_norm_stderr": 0.045641987674327526},
    "hendrycksTest-jurisprudence": {"acc": 0.3425925925925926, "acc_stderr": 0.045879047413018105, "acc_norm": 0.39814814814814814, "acc_norm_stderr": 0.04732332615978814},
    "hendrycksTest-logical_fallacies": {"acc": 0.3803680981595092, "acc_stderr": 0.038142698932618374, "acc_norm": 0.36809815950920244, "acc_norm_stderr": 0.03789213935838395},
    "hendrycksTest-machine_learning": {"acc": 0.26785714285714285, "acc_stderr": 0.04203277291467763, "acc_norm": 0.24107142857142858, "acc_norm_stderr": 0.04059867246952686},
    "hendrycksTest-management": {"acc": 0.42718446601941745, "acc_stderr": 0.04897957737781169, "acc_norm": 0.39805825242718446, "acc_norm_stderr": 0.0484674825397724},
    "hendrycksTest-marketing": {"acc": 0.5512820512820513, "acc_stderr": 0.032583346493868806, "acc_norm": 0.5512820512820513, "acc_norm_stderr": 0.032583346493868806},
    "hendrycksTest-medical_genetics": {"acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145633},
    "hendrycksTest-miscellaneous": {"acc": 0.5555555555555556, "acc_stderr": 0.017769250583533246, "acc_norm": 0.5568326947637292, "acc_norm_stderr": 0.01776408503534841},
    "hendrycksTest-moral_disputes": {"acc": 0.3208092485549133, "acc_stderr": 0.025131000233647904, "acc_norm": 0.30057803468208094, "acc_norm_stderr": 0.024685316867257796},
    "hendrycksTest-moral_scenarios": {"acc": 0.26033519553072626, "acc_stderr": 0.014676252009319483, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249614},
    "hendrycksTest-nutrition": {"acc": 0.3431372549019608, "acc_stderr": 0.027184498909941616, "acc_norm": 0.4019607843137255, "acc_norm_stderr": 0.02807415894760066},
    "hendrycksTest-philosophy": {"acc": 0.3762057877813505, "acc_stderr": 0.027513925683549427, "acc_norm": 0.36977491961414793, "acc_norm_stderr": 0.027417996705630998},
    "hendrycksTest-prehistory": {"acc": 0.33641975308641975, "acc_stderr": 0.02628973494595293, "acc_norm": 0.3055555555555556, "acc_norm_stderr": 0.025630824975621344},
    "hendrycksTest-professional_accounting": {"acc": 0.3049645390070922, "acc_stderr": 0.027464708442022135, "acc_norm": 0.2907801418439716, "acc_norm_stderr": 0.027090664368353178},
    "hendrycksTest-professional_law": {"acc": 0.25945241199478486, "acc_stderr": 0.011195262076350299, "acc_norm": 0.2842242503259452, "acc_norm_stderr": 0.011519880596516074},
    "hendrycksTest-professional_medicine": {"acc": 0.29411764705882354, "acc_stderr": 0.027678468642144703, "acc_norm": 0.3161764705882353, "acc_norm_stderr": 0.02824568739146291},
    "hendrycksTest-professional_psychology": {"acc": 0.315359477124183, "acc_stderr": 0.018798086284886883, "acc_norm": 0.3022875816993464, "acc_norm_stderr": 0.01857923271111388},
    "hendrycksTest-public_relations": {"acc": 0.41818181818181815, "acc_stderr": 0.04724577405731571, "acc_norm": 0.42727272727272725, "acc_norm_stderr": 0.04738198703545483},
    "hendrycksTest-security_studies": {"acc": 0.2816326530612245, "acc_stderr": 0.028795185574291282, "acc_norm": 0.24081632653061225, "acc_norm_stderr": 0.027372942201788163},
    "hendrycksTest-sociology": {"acc": 0.34328358208955223, "acc_stderr": 0.03357379665433431, "acc_norm": 0.3681592039800995, "acc_norm_stderr": 0.034104105654953025},
    "hendrycksTest-us_foreign_policy": {"acc": 0.38, "acc_stderr": 0.04878317312145632, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975},
    "hendrycksTest-virology": {"acc": 0.3253012048192771, "acc_stderr": 0.03647168523683229, "acc_norm": 0.3253012048192771, "acc_norm_stderr": 0.03647168523683227},
    "hendrycksTest-world_religions": {"acc": 0.543859649122807, "acc_stderr": 0.03820042586602966, "acc_norm": 0.5789473684210527, "acc_norm_stderr": 0.03786720706234214}
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-astronomy": 0,
    "hendrycksTest-business_ethics": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-college_biology": 0,
    "hendrycksTest-college_chemistry": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-college_mathematics": 0,
    "hendrycksTest-college_medicine": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-computer_security": 0,
    "hendrycksTest-conceptual_physics": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-electrical_engineering": 0,
    "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-global_facts": 0,
    "hendrycksTest-high_school_biology": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-high_school_computer_science": 0,
    "hendrycksTest-high_school_european_history": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-high_school_government_and_politics": 0,
    "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-high_school_microeconomics": 0,
    "hendrycksTest-high_school_physics": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-high_school_statistics": 0,
    "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-human_aging": 0,
    "hendrycksTest-human_sexuality": 0, "hendrycksTest-international_law": 0, "hendrycksTest-jurisprudence": 0,
    "hendrycksTest-logical_fallacies": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-management": 0,
    "hendrycksTest-marketing": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-miscellaneous": 0,
    "hendrycksTest-moral_disputes": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-nutrition": 0,
    "hendrycksTest-philosophy": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-professional_accounting": 0,
    "hendrycksTest-professional_law": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-professional_psychology": 0,
    "hendrycksTest-public_relations": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-sociology": 0,
    "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-virology": 0, "hendrycksTest-world_religions": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True,dtype=bfloat16",
    "num_fewshot": 5,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_pawsx_0-shot.json
new file mode 100644

{
  "results": {
    "pawsx_de": {"acc": 0.614, "acc_stderr": 0.010888584877766427},
    "pawsx_en": {"acc": 0.7035, "acc_stderr": 0.010214991337441791},
    "pawsx_es": {"acc": 0.6495, "acc_stderr": 0.01067154233969731},
    "pawsx_fr": {"acc": 0.6285, "acc_stderr": 0.01080751017293364},
    "pawsx_ja": {"acc": 0.493, "acc_stderr": 0.011182040020027768},
    "pawsx_ko": {"acc": 0.5365, "acc_stderr": 0.011153298751334327},
    "pawsx_zh": {"acc": 0.5625, "acc_stderr": 0.011095423796079503}
  },
  "versions": {
    "pawsx_de": 0,
    "pawsx_en": 0,
    "pawsx_es": 0,
    "pawsx_fr": 0,
    "pawsx_ja": 0,
    "pawsx_ko": 0,
    "pawsx_zh": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True,dtype=bfloat16",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_reading_comprehension_0-shot.json
new file mode 100644

{
  "results": {
    "coqa": {"f1": 0.7650867255895625, "f1_stderr": 0.01481717694356494, "em": 0.6301666666666667, "em_stderr": 0.018680205213012713},
    "drop": {"em": 0.03429110738255033, "em_stderr": 0.0018636035184959787, "f1": 0.1338569630872482, "f1_stderr": 0.0025165760673094154},
    "race": {"acc": 0.3866028708133971, "acc_stderr": 0.01507138477304713}
  },
  "versions": {
    "coqa": 1,
    "drop": 1,
    "race": 1
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_unscramble_0-shot.json
new file mode 100644

{
  "results": {
    "anagrams1": {"acc": 0.0, "acc_stderr": 0.0},
    "anagrams2": {"acc": 0.0001, "acc_stderr": 0.0001000000000000119},
    "cycle_letters": {"acc": 0.0, "acc_stderr": 0.0},
    "random_insertion": {"acc": 0.0004, "acc_stderr": 0.00019996999474889126},
    "reversed_words": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {
    "anagrams1": 0,
    "anagrams2": 0,
    "cycle_letters": 0,
    "random_insertion": 0,
    "reversed_words": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_xcopa_0-shot.json
new file mode 100644

{
  "results": {
    "xcopa_et": {"acc": 0.474, "acc_stderr": 0.02235279165091416},
    "xcopa_ht": {"acc": 0.498, "acc_stderr": 0.02238289498648353},
    "xcopa_id": {"acc": 0.568, "acc_stderr": 0.02217510926561316},
    "xcopa_it": {"acc": 0.594, "acc_stderr": 0.02198396209008634},
    "xcopa_qu": {"acc": 0.484, "acc_stderr": 0.0223716109825804},
    "xcopa_sw": {"acc": 0.516, "acc_stderr": 0.0223716109825804},
    "xcopa_ta": {"acc": 0.54, "acc_stderr": 0.022311333245289663},
    "xcopa_th": {"acc": 0.542, "acc_stderr": 0.02230396677426995},
    "xcopa_tr": {"acc": 0.516, "acc_stderr": 0.022371610982580396},
    "xcopa_vi": {"acc": 0.536, "acc_stderr": 0.022324981738385246},
    "xcopa_zh": {"acc": 0.632, "acc_stderr": 0.02158898256835354}
  },
  "versions": {
    "xcopa_et": 0,
    "xcopa_ht": 0,
    "xcopa_id": 0,
    "xcopa_it": 0,
    "xcopa_qu": 0,
    "xcopa_sw": 0,
    "xcopa_ta": 0,
    "xcopa_th": 0,
    "xcopa_tr": 0,
    "xcopa_vi": 0,
    "xcopa_zh": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True,dtype=bfloat16",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_xnli_0-shot.json
new file mode 100644

{
  "results": {
    "xnli_ar": {"acc": 0.3331337325349301, "acc_stderr": 0.006659676975732312},
    "xnli_bg": {"acc": 0.36826347305389223, "acc_stderr": 0.00681509322031267},
    "xnli_de": {"acc": 0.46447105788423154, "acc_stderr": 0.007046854204317796},
    "xnli_el": {"acc": 0.36187624750499003, "acc_stderr": 0.006789799946197319},
    "xnli_en": {"acc": 0.543313373253493, "acc_stderr": 0.007038155029004149},
    "xnli_es": {"acc": 0.4564870259481038, "acc_stderr": 0.007037909229199955},
    "xnli_fr": {"acc": 0.4880239520958084, "acc_stderr": 0.007062685615595022},
    "xnli_hi": {"acc": 0.3473053892215569, "acc_stderr": 0.006727214239733754},
    "xnli_ru": {"acc": 0.444311377245509, "acc_stderr": 0.007020757195791274},
    "xnli_sw": {"acc": 0.3341317365269461, "acc_stderr": 0.006664652441694265},
    "xnli_th": {"acc": 0.36127744510978044, "acc_stderr": 0.006787362347422093},
    "xnli_tr": {"acc": 0.37684630738522956, "acc_stderr": 0.006847061089041557},
    "xnli_ur": {"acc": 0.33632734530938124, "acc_stderr": 0.006675480563072364},
    "xnli_vi": {"acc": 0.37325349301397204, "acc_stderr": 0.0068339592620100505},
    "xnli_zh": {"acc": 0.3534930139720559, "acc_stderr": 0.006754629196407293}
  },
  "versions": {
    "xnli_ar": 0,
    "xnli_bg": 0,
    "xnli_de": 0,
    "xnli_el": 0,
    "xnli_en": 0,
    "xnli_es": 0,
    "xnli_fr": 0,
    "xnli_hi": 0,
    "xnli_ru": 0,
    "xnli_sw": 0,
    "xnli_th": 0,
    "xnli_tr": 0,
    "xnli_ur": 0,
    "xnli_vi": 0,
    "xnli_zh": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True,dtype=bfloat16",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_xstory_cloze_0-shot.json
new file mode 100644

{
  "results": {
    "xstory_cloze_ar": {"acc": 0.48510919920582396, "acc_stderr": 0.012861417842074004},
    "xstory_cloze_en": {"acc": 0.7789543348775645, "acc_stderr": 0.010678457581809239},
    "xstory_cloze_es": {"acc": 0.6604897418927862, "acc_stderr": 0.01218627614665943},
    "xstory_cloze_eu": {"acc": 0.5109199205823958, "acc_stderr": 0.012864056278255038},
    "xstory_cloze_hi": {"acc": 0.5168762409000662, "acc_stderr": 0.012859793919977606},
    "xstory_cloze_id": {"acc": 0.5519523494374586, "acc_stderr": 0.012797478885304742},
    "xstory_cloze_my": {"acc": 0.48378557246856385, "acc_stderr": 0.01286035780505586},
    "xstory_cloze_ru": {"acc": 0.5724685638649901, "acc_stderr": 0.012731259626982528},
    "xstory_cloze_sw": {"acc": 0.4990072799470549, "acc_stderr": 0.012867099955422935},
    "xstory_cloze_te": {"acc": 0.5294506949040371, "acc_stderr": 0.012844785490017004},
    "xstory_cloze_zh": {"acc": 0.5956320317670417, "acc_stderr": 0.012629580396570932}
  },
  "versions": {
    "xstory_cloze_ar": 0,
    "xstory_cloze_en": 0,
    "xstory_cloze_es": 0,
    "xstory_cloze_eu": 0,
    "xstory_cloze_hi": 0,
    "xstory_cloze_id": 0,
    "xstory_cloze_my": 0,
    "xstory_cloze_ru": 0,
    "xstory_cloze_sw": 0,
    "xstory_cloze_te": 0,
    "xstory_cloze_zh": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True,dtype=bfloat16",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/mpt/mpt-7b/mpt-7b_xwinograd_0-shot.json
new file mode 100644

{
  "results": {
    "xwinograd_en": {"acc": 0.8666666666666667, "acc_stderr": 0.0070514325016347275},
    "xwinograd_fr": {"acc": 0.6626506024096386, "acc_stderr": 0.05221260262032129},
    "xwinograd_jp": {"acc": 0.602711157455683, "acc_stderr": 0.015809751560314552},
    "xwinograd_pt": {"acc": 0.6692015209125475, "acc_stderr": 0.02906762615931534},
    "xwinograd_ru": {"acc": 0.6952380952380952, "acc_stderr": 0.025976599352305375},
    "xwinograd_zh": {"acc": 0.7162698412698413, "acc_stderr": 0.02010051064884106}
  },
  "versions": {
    "xwinograd_en": 0,
    "xwinograd_fr": 0,
    "xwinograd_jp": 0,
    "xwinograd_pt": 0,
    "xwinograd_ru": 0,
    "xwinograd_zh": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=mosaicml/mpt-7b,trust_remote_code=True,dtype=bfloat16",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}