gaoqiong / lm-evaluation-harness

Commit 4a0b0d6e, authored Jun 16, 2023 by lintangsutawika
Merge branch 'gakada-big-refactor-merge' into big-refactor
Parents: 6ae376e3, c490f165
Changes: 230. Showing 20 changed files with 2,464 additions and 0 deletions (+2464, -0).
results/llama/llama-13B/llama-13B_lambada_0-shot.json (+80, -0)
results/llama/llama-13B/llama-13B_mathematical_reasoning_0-shot.json (+76, -0)
results/llama/llama-13B/llama-13B_mathematical_reasoning_few_shot_5-shot.json (+71, -0)
results/llama/llama-13B/llama-13B_mmlu_5-shot.json (+416, -0)
results/llama/llama-13B/llama-13B_pawsx_0-shot.json (+52, -0)
results/llama/llama-13B/llama-13B_question_answering_0-shot.json (+66, -0)
results/llama/llama-13B/llama-13B_reading_comprehension_0-shot.json (+36, -0)
results/llama/llama-13B/llama-13B_superglue_0-shot.json (+55, -0)
results/llama/llama-13B/llama-13B_xcopa_0-shot.json (+72, -0)
results/llama/llama-13B/llama-13B_xnli_0-shot.json (+92, -0)
results/llama/llama-13B/llama-13B_xstory_cloze_0-shot.json (+72, -0)
results/llama/llama-13B/llama-13B_xwinograd_0-shot.json (+47, -0)
results/llama/llama-30B/README.md (+332, -0)
results/llama/llama-30B/llama-30B_bbh_3-shot.json (+124, -0)
results/llama/llama-30B/llama-30B_common_sense_reasoning_0-shot.json (+91, -0)
results/llama/llama-30B/llama-30B_gsm8k_8-shot.json (+22, -0)
results/llama/llama-30B/llama-30B_human_alignment_0-shot.json (+197, -0)
results/llama/llama-30B/llama-30B_mathematical_reasoning_0-shot.json (+76, -0)
results/llama/llama-30B/llama-30B_mathematical_reasoning_few_shot_5-shot.json (+71, -0)
results/llama/llama-30B/llama-30B_mmlu_5-shot.json (+416, -0)
results/llama/llama-13B/llama-13B_lambada_0-shot.json
0 → 100644
{
  "results": {
    "lambada_openai": {"ppl": 1279051.053451683, "ppl_stderr": 60995.62964377304, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_mt_de": {"ppl": 1310285.4433720284, "ppl_stderr": 71395.90633942866, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_mt_it": {"ppl": 4091504.352954044, "ppl_stderr": 218020.965277226, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_standard": {"ppl": 1409047.9981006894, "ppl_stderr": 47832.883755899915, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_standard_cloze": {"ppl": 4235345.031433833, "ppl_stderr": 132892.5654001927, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_mt_fr": {"ppl": 2461448.491005768, "ppl_stderr": 128013.98724687536, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_cloze": {"ppl": 204515.38946166556, "ppl_stderr": 9705.341358126625, "acc": 0.00019406171162429653, "acc_stderr": 0.00019406171162430135},
    "lambada_openai_mt_en": {"ppl": 1279051.053451683, "ppl_stderr": 60995.62964377304, "acc": 0.0, "acc_stderr": 0.0},
    "lambada_openai_mt_es": {"ppl": 1980241.7718905837, "ppl_stderr": 101614.2034914904, "acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {"lambada_openai": 0, "lambada_openai_mt_de": 0, "lambada_openai_mt_it": 0, "lambada_standard": 0, "lambada_standard_cloze": 0, "lambada_openai_mt_fr": 0, "lambada_openai_cloze": 0, "lambada_openai_mt_en": 0, "lambada_openai_mt_es": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
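The `config` block above mirrors the arguments of the pre-refactor evaluation entry point, so a results file of this shape could plausibly be regenerated with a call like the sketch below. This is an illustrative sketch only, assuming the pre-refactor `lm_eval.evaluator.simple_evaluate` API and that the recorded checkpoint path exists locally; the task subset is abbreviated.

```python
# Sketch (not part of this commit): regenerate a results file like
# llama-13B_lambada_0-shot.json with the pre-refactor evaluator API.
import json

from lm_eval import evaluator  # assumes the pre-refactor lm_eval package

results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    # checkpoint path copied from the config block; assumed to exist locally
    model_args="pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
    tasks=["lambada_openai", "lambada_standard"],  # abbreviated task list
    num_fewshot=0,
    batch_size="auto",
    device="cuda:0",
    no_cache=True,
)

# The dumped dict carries the same "results"/"versions"/"config" layout as above.
with open("llama-13B_lambada_0-shot.json", "w") as f:
    json.dump(results, f, indent=2)
```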
results/llama/llama-13B/llama-13B_mathematical_reasoning_0-shot.json
0 → 100644
{
  "results": {
    "math_prealgebra": {"acc": 0.02870264064293915, "acc_stderr": 0.0056607946989983855},
    "math_num_theory": {"acc": 0.014814814814814815, "acc_stderr": 0.005203704987512651},
    "drop": {"em": 0.0388003355704698, "em_stderr": 0.0019777172311177993, "f1": 0.13990771812080444, "f1_stderr": 0.002512880034517493},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_intermediate_algebra": {"acc": 0.012181616832779624, "acc_stderr": 0.0036524791938863576},
    "math_algebra": {"acc": 0.018534119629317607, "acc_stderr": 0.003916347676363957},
    "math_counting_and_prob": {"acc": 0.014767932489451477, "acc_stderr": 0.0055462385896684775},
    "math_geometry": {"acc": 0.012526096033402923, "acc_stderr": 0.005086941389677977},
    "math_precalc": {"acc": 0.01098901098901099, "acc_stderr": 0.004465618427331416},
    "mathqa": {"acc": 0.28442211055276384, "acc_stderr": 0.008258681628795297, "acc_norm": 0.28676716917922945, "acc_norm_stderr": 0.00827905882129993},
    "math_asdiv": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {"math_prealgebra": 1, "math_num_theory": 1, "drop": 1, "mathqa": 0, "gsm8k": 0, "math_intermediate_algebra": 1, "math_algebra": 1, "math_counting_and_prob": 1, "math_geometry": 1, "math_precalc": 1, "math_asdiv": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_mathematical_reasoning_few_shot_5-shot.json
0 → 100644
{
  "results": {
    "math_prealgebra": {"acc": 0.001148105625717566, "acc_stderr": 0.0011481056257175704},
    "drop": {"em": 0.01709312080536913, "em_stderr": 0.001327414384722433, "f1": 0.024450503355704672, "f1_stderr": 0.001413124400630544},
    "math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_counting_and_prob": {"acc": 0.002109704641350211, "acc_stderr": 0.0021097046413502104},
    "math_num_theory": {"acc": 0.001851851851851852, "acc_stderr": 0.0018518518518518502},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_geometry": {"acc": 0.0, "acc_stderr": 0.0},
    "math_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_precalc": {"acc": 0.0, "acc_stderr": 0.0},
    "mathqa": {"acc": 0.2998324958123953, "acc_stderr": 0.008387661895516162, "acc_norm": 0.3035175879396985, "acc_norm_stderr": 0.008416811454701563}
  },
  "versions": {"math_prealgebra": 1, "drop": 1, "mathqa": 0, "math_intermediate_algebra": 1, "math_counting_and_prob": 1, "math_num_theory": 1, "gsm8k": 0, "math_geometry": 1, "math_algebra": 1, "math_precalc": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 5, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_mmlu_5-shot.json
0 → 100644
{
  "results": {
    "hendrycksTest-college_biology": {"acc": 0.4583333333333333, "acc_stderr": 0.04166666666666665, "acc_norm": 0.3263888888888889, "acc_norm_stderr": 0.03921067198982266},
    "hendrycksTest-clinical_knowledge": {"acc": 0.46037735849056605, "acc_stderr": 0.030676096599389188, "acc_norm": 0.3849056603773585, "acc_norm_stderr": 0.029946498567699948},
    "hendrycksTest-high_school_european_history": {"acc": 0.5272727272727272, "acc_stderr": 0.03898531605579418, "acc_norm": 0.49696969696969695, "acc_norm_stderr": 0.03904272341431855},
    "hendrycksTest-high_school_psychology": {"acc": 0.6073394495412844, "acc_stderr": 0.02093750516120109, "acc_norm": 0.3688073394495413, "acc_norm_stderr": 0.020686227560729537},
    "hendrycksTest-business_ethics": {"acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589},
    "hendrycksTest-high_school_government_and_politics": {"acc": 0.5854922279792746, "acc_stderr": 0.035553003195576686, "acc_norm": 0.38860103626943004, "acc_norm_stderr": 0.03517739796373132},
    "hendrycksTest-security_studies": {"acc": 0.45714285714285713, "acc_stderr": 0.03189141832421396, "acc_norm": 0.37551020408163266, "acc_norm_stderr": 0.03100120903989484},
    "hendrycksTest-high_school_macroeconomics": {"acc": 0.3769230769230769, "acc_stderr": 0.024570975364225995, "acc_norm": 0.31794871794871793, "acc_norm_stderr": 0.02361088430892786},
    "hendrycksTest-sociology": {"acc": 0.582089552238806, "acc_stderr": 0.034875586404620636, "acc_norm": 0.4577114427860697, "acc_norm_stderr": 0.035228658640995975},
    "hendrycksTest-college_mathematics": {"acc": 0.29, "acc_stderr": 0.04560480215720683, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235},
    "hendrycksTest-professional_accounting": {"acc": 0.2978723404255319, "acc_stderr": 0.02728160834446941, "acc_norm": 0.2801418439716312, "acc_norm_stderr": 0.02678917235114023},
    "hendrycksTest-anatomy": {"acc": 0.42962962962962964, "acc_stderr": 0.04276349494376599, "acc_norm": 0.2962962962962963, "acc_norm_stderr": 0.03944624162501116},
    "hendrycksTest-professional_psychology": {"acc": 0.42320261437908496, "acc_stderr": 0.019987809769482067, "acc_norm": 0.3300653594771242, "acc_norm_stderr": 0.01902372616072456},
    "hendrycksTest-moral_scenarios": {"acc": 0.28268156424581004, "acc_stderr": 0.015060381730018082, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588},
    "hendrycksTest-conceptual_physics": {"acc": 0.42127659574468085, "acc_stderr": 0.03227834510146268, "acc_norm": 0.2425531914893617, "acc_norm_stderr": 0.028020226271200217},
    "hendrycksTest-virology": {"acc": 0.40963855421686746, "acc_stderr": 0.03828401115079021, "acc_norm": 0.30120481927710846, "acc_norm_stderr": 0.035716092300534796},
    "hendrycksTest-world_religions": {"acc": 0.7426900584795322, "acc_stderr": 0.03352799844161865, "acc_norm": 0.6491228070175439, "acc_norm_stderr": 0.03660298834049162},
    "hendrycksTest-high_school_computer_science": {"acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025},
    "hendrycksTest-abstract_algebra": {"acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814},
    "hendrycksTest-medical_genetics": {"acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795},
    "hendrycksTest-nutrition": {"acc": 0.45098039215686275, "acc_stderr": 0.02849199358617156, "acc_norm": 0.4673202614379085, "acc_norm_stderr": 0.02856869975222588},
    "hendrycksTest-elementary_mathematics": {"acc": 0.36772486772486773, "acc_stderr": 0.024833839825562424, "acc_norm": 0.328042328042328, "acc_norm_stderr": 0.024180497164376907},
    "hendrycksTest-philosophy": {"acc": 0.45980707395498394, "acc_stderr": 0.028306190403305696, "acc_norm": 0.3858520900321543, "acc_norm_stderr": 0.02764814959975146},
    "hendrycksTest-high_school_microeconomics": {"acc": 0.42016806722689076, "acc_stderr": 0.03206183783236152, "acc_norm": 0.40756302521008403, "acc_norm_stderr": 0.031918633744784645},
    "hendrycksTest-management": {"acc": 0.6407766990291263, "acc_stderr": 0.04750458399041696, "acc_norm": 0.4174757281553398, "acc_norm_stderr": 0.048828405482122375},
    "hendrycksTest-us_foreign_policy": {"acc": 0.68, "acc_stderr": 0.046882617226215034, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795},
    "hendrycksTest-international_law": {"acc": 0.5619834710743802, "acc_stderr": 0.04529146804435792, "acc_norm": 0.6033057851239669, "acc_norm_stderr": 0.044658697805310094},
    "hendrycksTest-college_chemistry": {"acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814},
    "hendrycksTest-high_school_mathematics": {"acc": 0.26666666666666666, "acc_stderr": 0.026962424325073817, "acc_norm": 0.31851851851851853, "acc_norm_stderr": 0.028406533090608463},
    "hendrycksTest-high_school_world_history": {"acc": 0.4978902953586498, "acc_stderr": 0.032546938018020076, "acc_norm": 0.42616033755274263, "acc_norm_stderr": 0.03219035703131774},
    "hendrycksTest-human_sexuality": {"acc": 0.549618320610687, "acc_stderr": 0.04363643698524779, "acc_norm": 0.3969465648854962, "acc_norm_stderr": 0.04291135671009224},
    "hendrycksTest-college_computer_science": {"acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276},
    "hendrycksTest-college_medicine": {"acc": 0.4277456647398844, "acc_stderr": 0.037724468575180255, "acc_norm": 0.30057803468208094, "acc_norm_stderr": 0.0349610148119118},
    "hendrycksTest-formal_logic": {"acc": 0.3253968253968254, "acc_stderr": 0.041905964388711366, "acc_norm": 0.3412698412698413, "acc_norm_stderr": 0.04240799327574925},
    "hendrycksTest-high_school_physics": {"acc": 0.271523178807947, "acc_stderr": 0.03631329803969653, "acc_norm": 0.25165562913907286, "acc_norm_stderr": 0.035433042343899844},
    "hendrycksTest-marketing": {"acc": 0.7264957264957265, "acc_stderr": 0.029202540153431173, "acc_norm": 0.6153846153846154, "acc_norm_stderr": 0.03187195347942466},
    "hendrycksTest-jurisprudence": {"acc": 0.48148148148148145, "acc_stderr": 0.04830366024635331, "acc_norm": 0.5, "acc_norm_stderr": 0.04833682445228318},
    "hendrycksTest-computer_security": {"acc": 0.57, "acc_stderr": 0.049756985195624284, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589},
    "hendrycksTest-high_school_chemistry": {"acc": 0.3103448275862069, "acc_stderr": 0.03255086769970103, "acc_norm": 0.32019704433497537, "acc_norm_stderr": 0.032826493853041504},
    "hendrycksTest-prehistory": {"acc": 0.49691358024691357, "acc_stderr": 0.02782021415859437, "acc_norm": 0.345679012345679, "acc_norm_stderr": 0.026462487777001876},
    "hendrycksTest-machine_learning": {"acc": 0.2857142857142857, "acc_stderr": 0.04287858751340455, "acc_norm": 0.29464285714285715, "acc_norm_stderr": 0.043270409325787296},
    "hendrycksTest-professional_medicine": {"acc": 0.39338235294117646, "acc_stderr": 0.02967428828131118, "acc_norm": 0.33088235294117646, "acc_norm_stderr": 0.028582709753898452},
    "hendrycksTest-global_facts": {"acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684},
    "hendrycksTest-high_school_us_history": {"acc": 0.5245098039215687, "acc_stderr": 0.03505093194348798, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.033933885849584046},
    "hendrycksTest-high_school_geography": {"acc": 0.5757575757575758, "acc_stderr": 0.03521224908841586, "acc_norm": 0.42424242424242425, "acc_norm_stderr": 0.03521224908841583},
    "hendrycksTest-human_aging": {"acc": 0.5739910313901345, "acc_stderr": 0.033188332862172806, "acc_norm": 0.336322869955157, "acc_norm_stderr": 0.03170882426845501},
    "hendrycksTest-high_school_biology": {"acc": 0.4967741935483871, "acc_stderr": 0.028443414226438316, "acc_norm": 0.36129032258064514, "acc_norm_stderr": 0.027327548447957553},
    "hendrycksTest-public_relations": {"acc": 0.5454545454545454, "acc_stderr": 0.04769300568972744, "acc_norm": 0.2909090909090909, "acc_norm_stderr": 0.04350271442923243},
    "hendrycksTest-professional_law": {"acc": 0.30378096479791394, "acc_stderr": 0.011745787720472483, "acc_norm": 0.3089960886571056, "acc_norm_stderr": 0.011801729777239246},
    "hendrycksTest-electrical_engineering": {"acc": 0.41379310344827586, "acc_stderr": 0.041042692118062316, "acc_norm": 0.3448275862068966, "acc_norm_stderr": 0.039609335494512087},
    "hendrycksTest-logical_fallacies": {"acc": 0.4539877300613497, "acc_stderr": 0.0391170190467718, "acc_norm": 0.36809815950920244, "acc_norm_stderr": 0.03789213935838396},
    "hendrycksTest-moral_disputes": {"acc": 0.4479768786127168, "acc_stderr": 0.026772990653361816, "acc_norm": 0.3815028901734104, "acc_norm_stderr": 0.0261521986197268},
    "hendrycksTest-high_school_statistics": {"acc": 0.38425925925925924, "acc_stderr": 0.03317354514310742, "acc_norm": 0.375, "acc_norm_stderr": 0.033016908987210894},
    "hendrycksTest-college_physics": {"acc": 0.28431372549019607, "acc_stderr": 0.04488482852329017, "acc_norm": 0.35294117647058826, "acc_norm_stderr": 0.04755129616062947},
    "hendrycksTest-econometrics": {"acc": 0.2719298245614035, "acc_stderr": 0.04185774424022056, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.041424397194893624},
    "hendrycksTest-miscellaneous": {"acc": 0.6960408684546615, "acc_stderr": 0.016448321686769043, "acc_norm": 0.48531289910600256, "acc_norm_stderr": 0.01787224802442912},
    "hendrycksTest-astronomy": {"acc": 0.48026315789473684, "acc_stderr": 0.04065771002562603, "acc_norm": 0.48026315789473684, "acc_norm_stderr": 0.040657710025626036}
  },
  "versions": {"hendrycksTest-college_biology": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-business_ethics": 0,
    "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-sociology": 0, "hendrycksTest-college_mathematics": 0,
    "hendrycksTest-professional_accounting": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-conceptual_physics": 0,
    "hendrycksTest-virology": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-medical_genetics": 0,
    "hendrycksTest-nutrition": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-management": 0,
    "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-international_law": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-high_school_world_history": 0,
    "hendrycksTest-human_sexuality": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-high_school_physics": 0,
    "hendrycksTest-marketing": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-prehistory": 0,
    "hendrycksTest-machine_learning": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_geography": 0,
    "hendrycksTest-human_aging": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-electrical_engineering": 0,
    "hendrycksTest-logical_fallacies": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-econometrics": 0,
    "hendrycksTest-miscellaneous": 0, "hendrycksTest-astronomy": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 5, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_pawsx_0-shot.json
0 → 100644
{
  "results": {
    "pawsx_fr": {"acc": 0.545, "acc_stderr": 0.011137752231145222},
    "pawsx_en": {"acc": 0.537, "acc_stderr": 0.011152474561478174},
    "pawsx_ko": {"acc": 0.4705, "acc_stderr": 0.011163654804511664},
    "pawsx_ja": {"acc": 0.45, "acc_stderr": 0.011127079848413735},
    "pawsx_es": {"acc": 0.521, "acc_stderr": 0.011173268141438304},
    "pawsx_de": {"acc": 0.5295, "acc_stderr": 0.011163654804511655},
    "pawsx_zh": {"acc": 0.452, "acc_stderr": 0.01113148485052578}
  },
  "versions": {"pawsx_fr": 0, "pawsx_en": 0, "pawsx_ko": 0, "pawsx_ja": 0, "pawsx_es": 0, "pawsx_de": 0, "pawsx_zh": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_question_answering_0-shot.json
0 → 100644
{
  "results": {
    "triviaqa": {"acc": 0.0, "acc_stderr": 0.0},
    "headqa_es": {"acc": 0.3056163384390955, "acc_stderr": 0.008799003959214539, "acc_norm": 0.3515681983953319, "acc_norm_stderr": 0.009119739372039878},
    "logiqa": {"acc": 0.2642089093701997, "acc_stderr": 0.017293954549744514, "acc_norm": 0.3210445468509985, "acc_norm_stderr": 0.018312456701476108},
    "headqa_en": {"acc": 0.34427425237053244, "acc_stderr": 0.009075255747504299, "acc_norm": 0.38584974471188915, "acc_norm_stderr": 0.009298050684004381},
    "truthfulqa_mc": {"mc1": 0.2582619339045288, "mc1_stderr": 0.0153218216884762, "mc2": 0.39884734031519786, "mc2_stderr": 0.013703865869126058},
    "squad2": {"exact": 16.440663690726858, "f1": 24.060945088960178, "HasAns_exact": 21.086369770580298, "HasAns_f1": 36.34878560074651, "NoAns_exact": 11.808242220353238, "NoAns_f1": 11.808242220353238, "best_exact": 50.07159100480081, "best_f1": 50.073888042388},
    "webqs": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {"triviaqa": 1, "headqa_es": 0, "logiqa": 0, "headqa_en": 0, "truthfulqa_mc": 1, "squad2": 1, "webqs": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_reading_comprehension_0-shot.json
0 → 100644
{
  "results": {
    "coqa": {"f1": 0.7704068983762044, "f1_stderr": 0.014191975492335083, "em": 0.637, "em_stderr": 0.01847461201879917},
    "drop": {"em": 0.035864093959731544, "em_stderr": 0.0019043146639119552, "f1": 0.13376153523489834, "f1_stderr": 0.002439665460318613},
    "race": {"acc": 0.39330143540669854, "acc_stderr": 0.01511816218614914}
  },
  "versions": {"coqa": 1, "drop": 1, "race": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_superglue_0-shot.json
0 → 100644
{
  "results": {
    "boolq": {"acc": 0.6844036697247706, "acc_stderr": 0.008128579858785895},
    "wic": {"acc": 0.49843260188087773, "acc_stderr": 0.019810623954060382},
    "copa": {"acc": 0.9, "acc_stderr": 0.030151134457776348},
    "wsc": {"acc": 0.3557692307692308, "acc_stderr": 0.04717221961050337},
    "cb": {"acc": 0.48214285714285715, "acc_stderr": 0.0673769750864465, "f1": 0.3881876266167991},
    "record": {"f1": 0.9231828571428571, "f1_stderr": 0.0026119602574627677, "em": 0.9154, "em_stderr": 0.002782994521347745},
    "multirc": {"acc": 0.015739769150052464, "acc_stderr": 0.00403399795659578}
  },
  "versions": {"boolq": 1, "wic": 0, "copa": 0, "wsc": 0, "cb": 1, "record": 0, "multirc": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_xcopa_0-shot.json
0 → 100644
{
  "results": {
    "xcopa_it": {"acc": 0.672, "acc_stderr": 0.021017027165175485},
    "xcopa_vi": {"acc": 0.538, "acc_stderr": 0.02231833811987053},
    "xcopa_zh": {"acc": 0.584, "acc_stderr": 0.02206494331392886},
    "xcopa_ta": {"acc": 0.544, "acc_stderr": 0.022296238348407053},
    "xcopa_sw": {"acc": 0.512, "acc_stderr": 0.02237662679792717},
    "xcopa_id": {"acc": 0.578, "acc_stderr": 0.022109039310618552},
    "xcopa_tr": {"acc": 0.53, "acc_stderr": 0.02234274819250285},
    "xcopa_ht": {"acc": 0.528, "acc_stderr": 0.02234794983266809},
    "xcopa_qu": {"acc": 0.502, "acc_stderr": 0.02238289498648353},
    "xcopa_th": {"acc": 0.546, "acc_stderr": 0.022288147591176945},
    "xcopa_et": {"acc": 0.482, "acc_stderr": 0.02236856511738799}
  },
  "versions": {"xcopa_it": 0, "xcopa_vi": 0, "xcopa_zh": 0, "xcopa_ta": 0, "xcopa_sw": 0, "xcopa_id": 0, "xcopa_tr": 0, "xcopa_ht": 0, "xcopa_qu": 0, "xcopa_th": 0, "xcopa_et": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_xnli_0-shot.json
0 → 100644
{
  "results": {
    "xnli_ru": {"acc": 0.3379241516966068, "acc_stderr": 0.006683254094065008},
    "xnli_vi": {"acc": 0.34211576846307384, "acc_stderr": 0.006703255428996599},
    "xnli_zh": {"acc": 0.3447105788423154, "acc_stderr": 0.006715345603576115},
    "xnli_bg": {"acc": 0.34211576846307384, "acc_stderr": 0.0067032554289965995},
    "xnli_el": {"acc": 0.3469061876247505, "acc_stderr": 0.0067254026681375706},
    "xnli_fr": {"acc": 0.3349301397205589, "acc_stderr": 0.006668608672768922},
    "xnli_ur": {"acc": 0.34211576846307384, "acc_stderr": 0.006703255428996604},
    "xnli_hi": {"acc": 0.35588822355289423, "acc_stderr": 0.00676490827777005},
    "xnli_es": {"acc": 0.3349301397205589, "acc_stderr": 0.006668608672768919},
    "xnli_sw": {"acc": 0.3315369261477046, "acc_stderr": 0.006651646309907708},
    "xnli_th": {"acc": 0.34830339321357284, "acc_stderr": 0.006731720358995404},
    "xnli_ar": {"acc": 0.3407185628742515, "acc_stderr": 0.006696653153866837},
    "xnli_en": {"acc": 0.3562874251497006, "acc_stderr": 0.006766603483662201},
    "xnli_de": {"acc": 0.3524950099800399, "acc_stderr": 0.006750291549188483},
    "xnli_tr": {"acc": 0.3399201596806387, "acc_stderr": 0.006692851356332768}
  },
  "versions": {"xnli_ru": 0, "xnli_vi": 0, "xnli_zh": 0, "xnli_bg": 0, "xnli_el": 0, "xnli_fr": 0, "xnli_ur": 0, "xnli_hi": 0, "xnli_es": 0, "xnli_sw": 0, "xnli_th": 0, "xnli_ar": 0, "xnli_en": 0, "xnli_de": 0, "xnli_tr": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_xstory_cloze_0-shot.json
0 → 100644
{
  "results": {
    "xstory_cloze_zh": {"acc": 0.5645268034414295, "acc_stderr": 0.012759525506489228},
    "xstory_cloze_my": {"acc": 0.47782925215089345, "acc_stderr": 0.012854469625936085},
    "xstory_cloze_id": {"acc": 0.5526141628060887, "acc_stderr": 0.012795688167385315},
    "xstory_cloze_te": {"acc": 0.5334215751158173, "acc_stderr": 0.012838347934731667},
    "xstory_cloze_ar": {"acc": 0.49702183984116477, "acc_stderr": 0.012866897066011233},
    "xstory_cloze_sw": {"acc": 0.4990072799470549, "acc_stderr": 0.01286709995542293},
    "xstory_cloze_hi": {"acc": 0.5234943745863666, "acc_stderr": 0.012852912530051748},
    "xstory_cloze_eu": {"acc": 0.5069490403706155, "acc_stderr": 0.012865882570960722},
    "xstory_cloze_en": {"acc": 0.7729980145598941, "acc_stderr": 0.010779920137756025},
    "xstory_cloze_es": {"acc": 0.6942422236929185, "acc_stderr": 0.011856480568871262},
    "xstory_cloze_ru": {"acc": 0.6340172071475844, "acc_stderr": 0.012396308684399372}
  },
  "versions": {"xstory_cloze_zh": 0, "xstory_cloze_my": 0, "xstory_cloze_id": 0, "xstory_cloze_te": 0, "xstory_cloze_ar": 0, "xstory_cloze_sw": 0, "xstory_cloze_hi": 0, "xstory_cloze_eu": 0, "xstory_cloze_en": 0, "xstory_cloze_es": 0, "xstory_cloze_ru": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-13B/llama-13B_xwinograd_0-shot.json
0 → 100644
{
  "results": {
    "xwinograd_pt": {"acc": 0.714828897338403, "acc_stderr": 0.02789350966043832},
    "xwinograd_jp": {"acc": 0.5985401459854015, "acc_stderr": 0.01583743878453324},
    "xwinograd_en": {"acc": 0.8675268817204301, "acc_stderr": 0.007032136436579812},
    "xwinograd_ru": {"acc": 0.707936507936508, "acc_stderr": 0.02566084582577463},
    "xwinograd_zh": {"acc": 0.7003968253968254, "acc_stderr": 0.020424963888406065},
    "xwinograd_fr": {"acc": 0.6867469879518072, "acc_stderr": 0.051219942106581456}
  },
  "versions": {"xwinograd_pt": 0, "xwinograd_jp": 0, "xwinograd_en": 0, "xwinograd_ru": 0, "xwinograd_zh": 0, "xwinograd_fr": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-30B/README.md
0 → 100644
# llama-30B
## llama-30B_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|57.37|± | 3.60|
|bigbench_date_understanding | 0|multiple_choice_grade|69.92|± | 2.39|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|54.26|± | 3.11|
|bigbench_dyck_languages | 0|multiple_choice_grade|21.20|± | 1.29|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.58|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|27.86|± | 2.37|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|51.52|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|36.80|± | 2.16|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|25.29|± | 1.64|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|53.00|± | 2.89|
|bigbench_movie_recommendation | 0|multiple_choice_grade|63.20|± | 2.16|
|bigbench_navigate | 0|multiple_choice_grade|49.00|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|55.65|± | 1.11|
|bigbench_ruin_names | 0|multiple_choice_grade|39.73|± | 2.31|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.84|± | 1.26|
|bigbench_snarks | 0|multiple_choice_grade|46.96|± | 3.72|
|bigbench_sports_understanding | 0|multiple_choice_grade|62.37|± | 1.54|
|bigbench_temporal_sequences | 0|multiple_choice_grade|14.60|± | 1.12|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|21.28|± | 1.16|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|15.49|± | 0.87|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|53.00|± | 2.89|
## llama-30B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |46.76|± | 1.46|
| | |acc_norm|45.48|± | 1.46|
|arc_easy | 0|acc |75.34|± | 0.88|
| | |acc_norm|58.96|± | 1.01|
|boolq | 1|acc |68.41|± | 0.81|
|copa | 0|acc |90.00|± | 3.02|
|hellaswag | 0|acc |62.65|± | 0.48|
| | |acc_norm|79.24|± | 0.40|
|mc_taco | 0|em |11.41| | |
| | |f1 |48.36| | |
|openbookqa | 0|acc |29.40|± | 2.04|
| | |acc_norm|42.00|± | 2.21|
|piqa | 0|acc |80.96|± | 0.92|
| | |acc_norm|80.09|± | 0.93|
|prost | 0|acc |25.99|± | 0.32|
| | |acc_norm|29.11|± | 0.33|
|swag | 0|acc |58.61|± | 0.35|
| | |acc_norm|70.36|± | 0.32|
|winogrande | 0|acc |72.77|± | 1.25|
|wsc273 | 0|acc |86.81|± | 2.05|
## llama-30B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc |30.48|± | 1.27|
## llama-30B_human_alignment_0-shot.json
| Task |Version| Metric | Value | |Stderr|
|---------------------------------------|------:|---------------------|------:|---|-----:|
|crows_pairs_english_age | 0|likelihood_difference| 512.91|± | 58.13|
| | |pct_stereotype | 58.24|± | 5.20|
|crows_pairs_english_autre | 0|likelihood_difference|1138.07|± |348.77|
| | |pct_stereotype | 63.64|± | 15.21|
|crows_pairs_english_disability | 0|likelihood_difference| 888.65|± |103.42|
| | |pct_stereotype | 53.85|± | 6.23|
|crows_pairs_english_gender | 0|likelihood_difference| 666.15|± | 42.85|
| | |pct_stereotype | 54.06|± | 2.79|
|crows_pairs_english_nationality | 0|likelihood_difference| 587.28|± | 39.94|
| | |pct_stereotype | 53.24|± | 3.40|
|crows_pairs_english_physical_appearance| 0|likelihood_difference| 540.10|± | 59.14|
| | |pct_stereotype | 52.78|± | 5.92|
|crows_pairs_english_race_color | 0|likelihood_difference| 768.21|± | 39.14|
| | |pct_stereotype | 56.10|± | 2.20|
|crows_pairs_english_religion | 0|likelihood_difference| 807.57|± | 94.38|
| | |pct_stereotype | 62.16|± | 4.62|
|crows_pairs_english_sexual_orientation | 0|likelihood_difference| 754.77|± | 76.83|
| | |pct_stereotype | 63.44|± | 5.02|
|crows_pairs_english_socioeconomic | 0|likelihood_difference| 730.39|± | 54.63|
| | |pct_stereotype | 53.68|± | 3.63|
|crows_pairs_french_age | 0|likelihood_difference| 892.50|± |101.09|
| | |pct_stereotype | 40.00|± | 5.19|
|crows_pairs_french_autre | 0|likelihood_difference| 637.98|± |165.68|
| | |pct_stereotype | 61.54|± | 14.04|
|crows_pairs_french_disability | 0|likelihood_difference|1020.27|± |126.17|
| | |pct_stereotype | 56.06|± | 6.16|
|crows_pairs_french_gender | 0|likelihood_difference|1373.28|± |110.30|
| | |pct_stereotype | 50.16|± | 2.80|
|crows_pairs_french_nationality | 0|likelihood_difference| 985.10|± | 89.08|
| | |pct_stereotype | 38.74|± | 3.07|
|crows_pairs_french_physical_appearance | 0|likelihood_difference| 821.79|± |132.68|
| | |pct_stereotype | 56.94|± | 5.88|
|crows_pairs_french_race_color | 0|likelihood_difference|1061.17|± | 76.68|
| | |pct_stereotype | 41.74|± | 2.30|
|crows_pairs_french_religion | 0|likelihood_difference| 794.02|± | 93.89|
| | |pct_stereotype | 56.52|± | 4.64|
|crows_pairs_french_sexual_orientation | 0|likelihood_difference| 989.08|± |161.13|
| | |pct_stereotype | 71.43|± | 4.76|
|crows_pairs_french_socioeconomic | 0|likelihood_difference| 831.29|± | 87.37|
| | |pct_stereotype | 52.55|± | 3.58|
|ethics_cm | 0|acc | 57.50|± | 0.79|
|ethics_deontology | 0|acc | 54.17|± | 0.83|
| | |em | 6.12| | |
|ethics_justice | 0|acc | 51.70|± | 0.96|
| | |em | 1.33| | |
|ethics_utilitarianism | 0|acc | 50.12|± | 0.72|
|ethics_utilitarianism_original | 0|acc | 93.97|± | 0.34|
|ethics_virtue | 0|acc | 51.82|± | 0.71|
| | |em | 8.14| | |
|toxigen | 0|acc | 42.66|± | 1.61|
| | |acc_norm | 43.19|± | 1.62|
## llama-30B_mathematical_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 3.83|± | 0.20|
| | |f1 |13.91|± | 0.25|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 2.95|± | 0.49|
|math_asdiv | 0|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 4.01|± | 0.90|
|math_geometry | 1|acc | 1.46|± | 0.55|
|math_intermediate_algebra| 1|acc | 0.89|± | 0.31|
|math_num_theory | 1|acc | 2.96|± | 0.73|
|math_prealgebra | 1|acc | 4.13|± | 0.67|
|math_precalc | 1|acc | 1.83|± | 0.57|
|mathqa | 0|acc |30.59|± | 0.84|
| | |acc_norm|30.89|± | 0.85|
## llama-30B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 0.84|± | 0.09|
| | |f1 | 1.65|± | 0.10|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |34.74|± | 0.87|
| | |acc_norm|34.54|± | 0.87|
## llama-30B_mmlu_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|hendrycksTest-abstract_algebra | 0|acc |26.00|± | 4.41|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-anatomy | 0|acc |51.85|± | 4.32|
| | |acc_norm|40.74|± | 4.24|
|hendrycksTest-astronomy | 0|acc |57.24|± | 4.03|
| | |acc_norm|56.58|± | 4.03|
|hendrycksTest-business_ethics | 0|acc |67.00|± | 4.73|
| | |acc_norm|48.00|± | 5.02|
|hendrycksTest-clinical_knowledge | 0|acc |53.21|± | 3.07|
| | |acc_norm|46.42|± | 3.07|
|hendrycksTest-college_biology | 0|acc |61.11|± | 4.08|
| | |acc_norm|42.36|± | 4.13|
|hendrycksTest-college_chemistry | 0|acc |31.00|± | 4.65|
| | |acc_norm|32.00|± | 4.69|
|hendrycksTest-college_computer_science | 0|acc |43.00|± | 4.98|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-college_mathematics | 0|acc |37.00|± | 4.85|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-college_medicine | 0|acc |51.45|± | 3.81|
| | |acc_norm|43.35|± | 3.78|
|hendrycksTest-college_physics | 0|acc |23.53|± | 4.22|
| | |acc_norm|29.41|± | 4.53|
|hendrycksTest-computer_security | 0|acc |66.00|± | 4.76|
| | |acc_norm|58.00|± | 4.96|
|hendrycksTest-conceptual_physics | 0|acc |51.06|± | 3.27|
| | |acc_norm|32.77|± | 3.07|
|hendrycksTest-econometrics | 0|acc |35.09|± | 4.49|
| | |acc_norm|31.58|± | 4.37|
|hendrycksTest-electrical_engineering | 0|acc |51.72|± | 4.16|
| | |acc_norm|38.62|± | 4.06|
|hendrycksTest-elementary_mathematics | 0|acc |44.18|± | 2.56|
| | |acc_norm|37.04|± | 2.49|
|hendrycksTest-formal_logic | 0|acc |42.06|± | 4.42|
| | |acc_norm|39.68|± | 4.38|
|hendrycksTest-global_facts | 0|acc |47.00|± | 5.02|
| | |acc_norm|37.00|± | 4.85|
|hendrycksTest-high_school_biology | 0|acc |67.10|± | 2.67|
| | |acc_norm|54.52|± | 2.83|
|hendrycksTest-high_school_chemistry | 0|acc |39.90|± | 3.45|
| | |acc_norm|36.95|± | 3.40|
|hendrycksTest-high_school_computer_science | 0|acc |61.00|± | 4.90|
| | |acc_norm|47.00|± | 5.02|
|hendrycksTest-high_school_european_history | 0|acc |69.70|± | 3.59|
| | |acc_norm|56.36|± | 3.87|
|hendrycksTest-high_school_geography | 0|acc |75.76|± | 3.05|
| | |acc_norm|55.05|± | 3.54|
|hendrycksTest-high_school_government_and_politics| 0|acc |80.83|± | 2.84|
| | |acc_norm|61.14|± | 3.52|
|hendrycksTest-high_school_macroeconomics | 0|acc |51.54|± | 2.53|
| | |acc_norm|41.54|± | 2.50|
|hendrycksTest-high_school_mathematics | 0|acc |25.93|± | 2.67|
| | |acc_norm|31.48|± | 2.83|
|hendrycksTest-high_school_microeconomics | 0|acc |58.40|± | 3.20|
| | |acc_norm|48.32|± | 3.25|
|hendrycksTest-high_school_physics | 0|acc |31.79|± | 3.80|
| | |acc_norm|31.13|± | 3.78|
|hendrycksTest-high_school_psychology | 0|acc |77.06|± | 1.80|
| | |acc_norm|55.41|± | 2.13|
|hendrycksTest-high_school_statistics | 0|acc |43.52|± | 3.38|
| | |acc_norm|35.65|± | 3.27|
|hendrycksTest-high_school_us_history | 0|acc |72.06|± | 3.15|
| | |acc_norm|55.39|± | 3.49|
|hendrycksTest-high_school_world_history | 0|acc |69.62|± | 2.99|
| | |acc_norm|56.96|± | 3.22|
|hendrycksTest-human_aging | 0|acc |67.26|± | 3.15|
| | |acc_norm|36.32|± | 3.23|
|hendrycksTest-human_sexuality | 0|acc |70.23|± | 4.01|
| | |acc_norm|46.56|± | 4.37|
|hendrycksTest-international_law | 0|acc |70.25|± | 4.17|
| | |acc_norm|76.86|± | 3.85|
|hendrycksTest-jurisprudence | 0|acc |66.67|± | 4.56|
| | |acc_norm|55.56|± | 4.80|
|hendrycksTest-logical_fallacies | 0|acc |69.94|± | 3.60|
| | |acc_norm|53.99|± | 3.92|
|hendrycksTest-machine_learning | 0|acc |40.18|± | 4.65|
| | |acc_norm|30.36|± | 4.36|
|hendrycksTest-management | 0|acc |71.84|± | 4.45|
| | |acc_norm|55.34|± | 4.92|
|hendrycksTest-marketing | 0|acc |84.62|± | 2.36|
| | |acc_norm|76.50|± | 2.78|
|hendrycksTest-medical_genetics | 0|acc |60.00|± | 4.92|
| | |acc_norm|54.00|± | 5.01|
|hendrycksTest-miscellaneous | 0|acc |81.86|± | 1.38|
| | |acc_norm|61.43|± | 1.74|
|hendrycksTest-moral_disputes | 0|acc |61.85|± | 2.62|
| | |acc_norm|45.95|± | 2.68|
|hendrycksTest-moral_scenarios | 0|acc |34.30|± | 1.59|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |61.11|± | 2.79|
| | |acc_norm|50.33|± | 2.86|
|hendrycksTest-philosophy | 0|acc |67.52|± | 2.66|
| | |acc_norm|50.16|± | 2.84|
|hendrycksTest-prehistory | 0|acc |66.36|± | 2.63|
| | |acc_norm|42.90|± | 2.75|
|hendrycksTest-professional_accounting | 0|acc |39.72|± | 2.92|
| | |acc_norm|33.69|± | 2.82|
|hendrycksTest-professional_law | 0|acc |40.03|± | 1.25|
| | |acc_norm|34.35|± | 1.21|
|hendrycksTest-professional_medicine | 0|acc |55.51|± | 3.02|
| | |acc_norm|35.66|± | 2.91|
|hendrycksTest-professional_psychology | 0|acc |58.82|± | 1.99|
| | |acc_norm|43.30|± | 2.00|
|hendrycksTest-public_relations | 0|acc |64.55|± | 4.58|
| | |acc_norm|40.91|± | 4.71|
|hendrycksTest-security_studies | 0|acc |57.14|± | 3.17|
| | |acc_norm|40.41|± | 3.14|
|hendrycksTest-sociology | 0|acc |76.12|± | 3.01|
| | |acc_norm|66.17|± | 3.35|
|hendrycksTest-us_foreign_policy | 0|acc |79.00|± | 4.09|
| | |acc_norm|59.00|± | 4.94|
|hendrycksTest-virology | 0|acc |49.40|± | 3.89|
| | |acc_norm|34.34|± | 3.70|
|hendrycksTest-world_religions | 0|acc |81.29|± | 2.99|
| | |acc_norm|76.61|± | 3.25|
## llama-30B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |58.20|± | 1.10|
|pawsx_en| 0|acc |58.75|± | 1.10|
|pawsx_es| 0|acc |55.80|± | 1.11|
|pawsx_fr| 0|acc |52.85|± | 1.12|
|pawsx_ja| 0|acc |46.75|± | 1.12|
|pawsx_ko| 0|acc |45.70|± | 1.11|
|pawsx_zh| 0|acc |45.90|± | 1.11|
## llama-30B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 47.2|± | 2.23|
|xcopa_ht| 0|acc | 51.8|± | 2.24|
|xcopa_id| 0|acc | 60.6|± | 2.19|
|xcopa_it| 0|acc | 71.4|± | 2.02|
|xcopa_qu| 0|acc | 49.4|± | 2.24|
|xcopa_sw| 0|acc | 52.4|± | 2.24|
|xcopa_ta| 0|acc | 53.2|± | 2.23|
|xcopa_th| 0|acc | 54.6|± | 2.23|
|xcopa_tr| 0|acc | 52.2|± | 2.24|
|xcopa_vi| 0|acc | 52.4|± | 2.24|
|xcopa_zh| 0|acc | 62.2|± | 2.17|
## llama-30B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |34.49|± | 0.67|
|xnli_bg| 0|acc |38.52|± | 0.69|
|xnli_de| 0|acc |43.87|± | 0.70|
|xnli_el| 0|acc |34.91|± | 0.67|
|xnli_en| 0|acc |48.18|± | 0.71|
|xnli_es| 0|acc |40.24|± | 0.69|
|xnli_fr| 0|acc |42.95|± | 0.70|
|xnli_hi| 0|acc |36.47|± | 0.68|
|xnli_ru| 0|acc |38.12|± | 0.69|
|xnli_sw| 0|acc |34.09|± | 0.67|
|xnli_th| 0|acc |33.97|± | 0.67|
|xnli_tr| 0|acc |36.53|± | 0.68|
|xnli_ur| 0|acc |34.31|± | 0.67|
|xnli_vi| 0|acc |35.67|± | 0.68|
|xnli_zh| 0|acc |33.51|± | 0.67|
## llama-30B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |50.89|± | 1.29|
|xstory_cloze_en| 0|acc |78.16|± | 1.06|
|xstory_cloze_es| 0|acc |70.81|± | 1.17|
|xstory_cloze_eu| 0|acc |51.36|± | 1.29|
|xstory_cloze_hi| 0|acc |56.65|± | 1.28|
|xstory_cloze_id| 0|acc |59.23|± | 1.26|
|xstory_cloze_my| 0|acc |48.78|± | 1.29|
|xstory_cloze_ru| 0|acc |66.71|± | 1.21|
|xstory_cloze_sw| 0|acc |50.63|± | 1.29|
|xstory_cloze_te| 0|acc |53.21|± | 1.28|
|xstory_cloze_zh| 0|acc |58.57|± | 1.27|
## llama-30B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |87.40|± | 0.69|
|xwinograd_fr| 0|acc |73.49|± | 4.87|
|xwinograd_jp| 0|acc |67.36|± | 1.51|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |66.98|± | 2.65|
|xwinograd_zh| 0|acc |71.23|± | 2.02|
results/llama/llama-30B/llama-30B_bbh_3-shot.json
0 → 100644
{
  "results": {
    "bigbench_hyperbaton": {"multiple_choice_grade": 0.51524, "multiple_choice_grade_stderr": 0.0022350513992069},
    "bigbench_salient_translation_error_detection": {"multiple_choice_grade": 0.19839679358717435, "multiple_choice_grade_stderr": 0.012629887094728112},
    "bigbench_geometric_shapes": {"multiple_choice_grade": 0.2785515320334262, "multiple_choice_grade_stderr": 0.023692665345206258, "exact_str_match": 0.0, "exact_str_match_stderr": 0.0},
    "bigbench_navigate": {"multiple_choice_grade": 0.49, "multiple_choice_grade_stderr": 0.015816135752773193},
    "bigbench_date_understanding": {"multiple_choice_grade": 0.6991869918699187, "multiple_choice_grade_stderr": 0.023906779002093273},
    "bigbench_disambiguation_qa": {"multiple_choice_grade": 0.5426356589147286, "multiple_choice_grade_stderr": 0.031075544990472662},
    "bigbench_tracking_shuffled_objects_three_objects": {"multiple_choice_grade": 0.53, "multiple_choice_grade_stderr": 0.02886365132641709},
    "bigbench_dyck_languages": {"multiple_choice_grade": 0.212, "multiple_choice_grade_stderr": 0.01293148186493804},
    "bigbench_formal_fallacies_syllogisms_negation": {"multiple_choice_grade": 0.5058450704225352, "multiple_choice_grade_stderr": 0.004195767817554208},
    "bigbench_tracking_shuffled_objects_seven_objects": {"multiple_choice_grade": 0.15485714285714286, "multiple_choice_grade_stderr": 0.00865039181414196},
    "bigbench_causal_judgement": {"multiple_choice_grade": 0.5736842105263158, "multiple_choice_grade_stderr": 0.03597255252302466},
    "bigbench_movie_recommendation": {"multiple_choice_grade": 0.632, "multiple_choice_grade_stderr": 0.02158898256835354},
    "bigbench_tracking_shuffled_objects_five_objects": {"multiple_choice_grade": 0.2128, "multiple_choice_grade_stderr": 0.01158102863217863},
    "bigbench_snarks": {"multiple_choice_grade": 0.4696132596685083, "multiple_choice_grade_stderr": 0.03719891321680327},
    "bigbench_sports_understanding": {"multiple_choice_grade": 0.6237322515212982, "multiple_choice_grade_stderr": 0.01543581207286162},
    "bigbench_logical_deduction_seven_objects": {"multiple_choice_grade": 0.25285714285714284, "multiple_choice_grade_stderr": 0.01643996352811702},
    "bigbench_temporal_sequences": {"multiple_choice_grade": 0.146, "multiple_choice_grade_stderr": 0.011171786285496496},
    "bigbench_logical_deduction_five_objects": {"multiple_choice_grade": 0.368, "multiple_choice_grade_stderr": 0.021588982568353548},
    "bigbench_ruin_names": {"multiple_choice_grade": 0.39732142857142855, "multiple_choice_grade_stderr": 0.023145155753004788},
    "bigbench_logical_deduction_three_objects": {"multiple_choice_grade": 0.53, "multiple_choice_grade_stderr": 0.02886365132641709},
    "bigbench_reasoning_about_colored_objects": {"multiple_choice_grade": 0.5565, "multiple_choice_grade_stderr": 0.011111507899646487}
  },
  "versions": {"bigbench_hyperbaton": 0, "bigbench_salient_translation_error_detection": 0, "bigbench_geometric_shapes": 0, "bigbench_navigate": 0, "bigbench_date_understanding": 0, "bigbench_disambiguation_qa": 0,
    "bigbench_tracking_shuffled_objects_three_objects": 0, "bigbench_dyck_languages": 0, "bigbench_formal_fallacies_syllogisms_negation": 0, "bigbench_tracking_shuffled_objects_seven_objects": 0,
    "bigbench_causal_judgement": 0, "bigbench_movie_recommendation": 0, "bigbench_tracking_shuffled_objects_five_objects": 0, "bigbench_snarks": 0, "bigbench_sports_understanding": 0,
    "bigbench_logical_deduction_seven_objects": 0, "bigbench_temporal_sequences": 0, "bigbench_logical_deduction_five_objects": 0, "bigbench_ruin_names": 0, "bigbench_logical_deduction_three_objects": 0,
    "bigbench_reasoning_about_colored_objects": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True", "num_fewshot": 3, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-30B/llama-30B_common_sense_reasoning_0-shot.json
0 → 100644
{
  "results": {
    "hellaswag": {"acc": 0.6264688309101772, "acc_stderr": 0.00482752658488968, "acc_norm": 0.7923720374427405, "acc_norm_stderr": 0.00404779964623464},
    "copa": {"acc": 0.9, "acc_stderr": 0.030151134457776348},
    "prost": {"acc": 0.2598740392826644, "acc_stderr": 0.003204110008963041, "acc_norm": 0.2910973526900085, "acc_norm_stderr": 0.003318834364612203},
    "boolq": {"acc": 0.6840978593272171, "acc_stderr": 0.008130700051380873},
    "mc_taco": {"em": 0.11411411411411411, "f1": 0.48361974757894227},
    "winogrande": {"acc": 0.7277032359905288, "acc_stderr": 0.012510697991453936},
    "arc_challenge": {"acc": 0.46757679180887374, "acc_stderr": 0.014580637569995423, "acc_norm": 0.454778156996587, "acc_norm_stderr": 0.014551507060836352},
    "wsc273": {"acc": 0.8681318681318682, "acc_stderr": 0.020515321360773595},
    "openbookqa": {"acc": 0.294, "acc_stderr": 0.020395095484936603, "acc_norm": 0.42, "acc_norm_stderr": 0.02209471322976178},
    "swag": {"acc": 0.5861241627511746, "acc_stderr": 0.0034822550028030703, "acc_norm": 0.7036389083275018, "acc_norm_stderr": 0.0032286148364766096},
    "arc_easy": {"acc": 0.7533670033670034, "acc_stderr": 0.008844984581934908, "acc_norm": 0.5896464646464646, "acc_norm_stderr": 0.01009353125576545},
    "piqa": {"acc": 0.809575625680087, "acc_stderr": 0.009160842206469637, "acc_norm": 0.8008705114254625, "acc_norm_stderr": 0.009317391893706834}
  },
  "versions": {"hellaswag": 0, "copa": 0, "prost": 0, "boolq": 1, "mc_taco": 0, "winogrande": 0, "arc_challenge": 0, "wsc273": 0, "openbookqa": 0, "swag": 0, "arc_easy": 0, "piqa": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-30B/llama-30B_gsm8k_8-shot.json
0 → 100644
{
  "results": {
    "gsm8k": {"acc": 0.30477634571645185, "acc_stderr": 0.012679297549515422}
  },
  "versions": {"gsm8k": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True", "num_fewshot": 8, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/llama/llama-30B/llama-30B_human_alignment_0-shot.json
0 → 100644
View file @
4a0b0d6e
{
"results"
:
{
"crows_pairs_french_disability"
:
{
"likelihood_difference"
:
10.202651515151516
,
"likelihood_difference_stderr"
:
1.261700816634343
,
"pct_stereotype"
:
0.5606060606060606
,
"pct_stereotype_stderr"
:
0.06156009014560979
},
"crows_pairs_french_religion"
:
{
"likelihood_difference"
:
7.940217391304348
,
"likelihood_difference_stderr"
: 0.938898141048901, "pct_stereotype": 0.5652173913043478, "pct_stereotype_stderr": 0.04642922286356427},
    "crows_pairs_french_sexual_orientation": {"likelihood_difference": 9.890796703296703, "likelihood_difference_stderr": 1.6112974194891465, "pct_stereotype": 0.7142857142857143, "pct_stereotype_stderr": 0.04761904761904758},
    "toxigen": {"acc": 0.42659574468085104, "acc_stderr": 0.01614008877637632, "acc_norm": 0.4319148936170213, "acc_norm_stderr": 0.016164899004911828},
    "crows_pairs_english_gender": {"likelihood_difference": 6.6615234375, "likelihood_difference_stderr": 0.4284975339207996, "pct_stereotype": 0.540625, "pct_stereotype_stderr": 0.02790206840430007},
    "crows_pairs_english_age": {"likelihood_difference": 5.1291208791208796, "likelihood_difference_stderr": 0.5813404620923356, "pct_stereotype": 0.5824175824175825, "pct_stereotype_stderr": 0.05198368783767557},
    "crows_pairs_english_disability": {"likelihood_difference": 8.886538461538462, "likelihood_difference_stderr": 1.0342476212707912, "pct_stereotype": 0.5384615384615384, "pct_stereotype_stderr": 0.06231481440776789},
    "crows_pairs_french_age": {"likelihood_difference": 8.925, "likelihood_difference_stderr": 1.01086298976785, "pct_stereotype": 0.4, "pct_stereotype_stderr": 0.05192907868894985},
    "ethics_utilitarianism": {"acc": 0.5012479201331115, "acc_stderr": 0.00721159934497283},
    "crows_pairs_english_physical_appearance": {"likelihood_difference": 5.401041666666667, "likelihood_difference_stderr": 0.5913652974915496, "pct_stereotype": 0.5277777777777778, "pct_stereotype_stderr": 0.05924743948371486},
    "crows_pairs_french_socioeconomic": {"likelihood_difference": 8.312898596938776, "likelihood_difference_stderr": 0.8737467813045966, "pct_stereotype": 0.5255102040816326, "pct_stereotype_stderr": 0.03575911069046443},
    "crows_pairs_english_nationality": {"likelihood_difference": 5.872829861111111, "likelihood_difference_stderr": 0.3994396285401545, "pct_stereotype": 0.5324074074074074, "pct_stereotype_stderr": 0.03402801581358966},
    "ethics_cm": {"acc": 0.5750321750321751, "acc_stderr": 0.007932032541825585},
    "crows_pairs_french_gender": {"likelihood_difference": 13.732768691588785, "likelihood_difference_stderr": 1.1030097530113459, "pct_stereotype": 0.5015576323987538, "pct_stereotype_stderr": 0.027950714088670354},
    "crows_pairs_french_nationality": {"likelihood_difference": 9.851037549407115, "likelihood_difference_stderr": 0.8908345552184256, "pct_stereotype": 0.38735177865612647, "pct_stereotype_stderr": 0.03068725875850367},
    "ethics_deontology": {"acc": 0.5417130144605117, "acc_stderr": 0.008310055982844088, "em": 0.06117908787541713},
    "ethics_utilitarianism_original": {"acc": 0.9396838602329451, "acc_stderr": 0.0034337651785718414},
    "crows_pairs_english_sexual_orientation": {"likelihood_difference": 7.547715053763441, "likelihood_difference_stderr": 0.7682550004765589, "pct_stereotype": 0.6344086021505376, "pct_stereotype_stderr": 0.05020981279330232},
    "crows_pairs_english_religion": {"likelihood_difference": 8.075731981981981, "likelihood_difference_stderr": 0.9438303669276185, "pct_stereotype": 0.6216216216216216, "pct_stereotype_stderr": 0.04624128233851482},
    "ethics_justice": {"acc": 0.5170118343195266, "acc_stderr": 0.009611595027307154, "em": 0.013313609467455622},
    "ethics_virtue": {"acc": 0.5181909547738693, "acc_stderr": 0.007084831046245509, "em": 0.0814070351758794},
    "crows_pairs_english_race_color": {"likelihood_difference": 7.68214812992126, "likelihood_difference_stderr": 0.3913516470344277, "pct_stereotype": 0.5610236220472441, "pct_stereotype_stderr": 0.022039775660119297},
    "crows_pairs_english_autre": {"likelihood_difference": 11.380681818181818, "likelihood_difference_stderr": 3.487665507491904, "pct_stereotype": 0.6363636363636364, "pct_stereotype_stderr": 0.15212000482437738},
    "crows_pairs_french_race_color": {"likelihood_difference": 10.611684782608696, "likelihood_difference_stderr": 0.7668117638923473, "pct_stereotype": 0.41739130434782606, "pct_stereotype_stderr": 0.023017271312104015},
    "crows_pairs_french_physical_appearance": {"likelihood_difference": 8.217881944444445, "likelihood_difference_stderr": 1.3267643213128657, "pct_stereotype": 0.5694444444444444, "pct_stereotype_stderr": 0.05876396677084613},
    "crows_pairs_french_autre": {"likelihood_difference": 6.3798076923076925, "likelihood_difference_stderr": 1.6568389364513447, "pct_stereotype": 0.6153846153846154, "pct_stereotype_stderr": 0.1404416814115811},
    "crows_pairs_english_socioeconomic": {"likelihood_difference": 7.303947368421053, "likelihood_difference_stderr": 0.5463280290787818, "pct_stereotype": 0.5368421052631579, "pct_stereotype_stderr": 0.036270781985214155}
  },
  "versions": {
    "crows_pairs_french_disability": 0, "crows_pairs_french_religion": 0, "crows_pairs_french_sexual_orientation": 0, "toxigen": 0,
    "crows_pairs_english_gender": 0, "crows_pairs_english_age": 0, "crows_pairs_english_disability": 0, "crows_pairs_french_age": 0,
    "ethics_utilitarianism": 0, "crows_pairs_english_physical_appearance": 0, "crows_pairs_french_socioeconomic": 0, "crows_pairs_english_nationality": 0,
    "ethics_cm": 0, "crows_pairs_french_gender": 0, "crows_pairs_french_nationality": 0, "ethics_deontology": 0,
    "ethics_utilitarianism_original": 0, "crows_pairs_english_sexual_orientation": 0, "crows_pairs_english_religion": 0, "ethics_justice": 0,
    "ethics_virtue": 0, "crows_pairs_english_race_color": 0, "crows_pairs_english_autre": 0, "crows_pairs_french_race_color": 0,
    "crows_pairs_french_physical_appearance": 0, "crows_pairs_french_autre": 0, "crows_pairs_english_socioeconomic": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
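Each of these dumps shares the same layout: a "results" object keyed by task, a "versions" object, and the "config" the run used. As a convenience, here is a minimal Python sketch for loading one of them and printing its metrics; the path below is one of the files added in this commit, and any of the other result files works the same way.

```python
import json

# One of the result files added in this commit; the other dumps share the same layout.
path = "results/llama/llama-30B/llama-30B_mathematical_reasoning_0-shot.json"

with open(path) as f:
    report = json.load(f)

print("model_args :", report["config"]["model_args"])
print("num_fewshot:", report["config"]["num_fewshot"])

# "results" maps each task name to a flat dict of metric -> float,
# with uncertainty fields ending in "_stderr".
for task, metrics in sorted(report["results"].items()):
    formatted = ", ".join(f"{name}={value:.4f}" for name, value in metrics.items())
    print(f"{task}: {formatted}")
```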
results/llama/llama-30B/llama-30B_mathematical_reasoning_0-shot.json
{
  "results": {
    "math_prealgebra": {"acc": 0.04133180252583238, "acc_stderr": 0.006748646916387575},
    "drop": {"em": 0.0382760067114094, "em_stderr": 0.0019648445106113135, "f1": 0.13911493288590454, "f1_stderr": 0.0024846240125468515},
    "math_intermediate_algebra": {"acc": 0.008859357696566999, "acc_stderr": 0.0031200782932944743},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_num_theory": {"acc": 0.02962962962962963, "acc_stderr": 0.007303608618028771},
    "math_algebra": {"acc": 0.02948609941027801, "acc_stderr": 0.004912099985374022},
    "math_precalc": {"acc": 0.018315018315018316, "acc_stderr": 0.005743696731653661},
    "math_geometry": {"acc": 0.014613778705636743, "acc_stderr": 0.005488713443686309},
    "math_counting_and_prob": {"acc": 0.04008438818565401, "acc_stderr": 0.009019315660749231},
    "math_asdiv": {"acc": 0.0, "acc_stderr": 0.0},
    "mathqa": {"acc": 0.30586264656616413, "acc_stderr": 0.00843502782274867, "acc_norm": 0.3088777219430486, "acc_norm_stderr": 0.008458071062361336}
  },
  "versions": {
    "math_prealgebra": 1, "drop": 1, "math_intermediate_algebra": 1, "gsm8k": 0, "math_asdiv": 0, "math_num_theory": 1,
    "math_algebra": 1, "math_precalc": 1, "math_geometry": 1, "math_counting_and_prob": 1, "mathqa": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/llama/llama-30B/llama-30B_mathematical_reasoning_few_shot_5-shot.json
{
  "results": {
    "math_prealgebra": {"acc": 0.001148105625717566, "acc_stderr": 0.001148105625717572},
    "math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0},
    "math_geometry": {"acc": 0.0, "acc_stderr": 0.0},
    "math_precalc": {"acc": 0.0, "acc_stderr": 0.0},
    "drop": {"em": 0.008389261744966443, "em_stderr": 0.0009340543216866975, "f1": 0.016472315436241603, "f1_stderr": 0.001049526866424092},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_num_theory": {"acc": 0.0, "acc_stderr": 0.0},
    "math_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "mathqa": {"acc": 0.3474036850921273, "acc_stderr": 0.008716459359487392, "acc_norm": 0.34539363484087104, "acc_norm_stderr": 0.008704580930350191}
  },
  "versions": {
    "math_prealgebra": 1, "math_intermediate_algebra": 1, "math_counting_and_prob": 1, "math_geometry": 1, "math_precalc": 1,
    "drop": 1, "mathqa": 0, "gsm8k": 0, "math_num_theory": 1, "math_algebra": 1
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
    "num_fewshot": 5,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
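Read against the 0-shot dump above, the 5-shot run drops the MATH subtask accuracies to (near) zero while mathqa improves; a quick way to see that is to diff the two files' primary metrics. A minimal sketch, assuming both dumps live under results/llama/llama-30B/ as the paths above indicate:

```python
import json

base = "results/llama/llama-30B/llama-30B_mathematical_reasoning"

def load_results(path):
    with open(path) as f:
        return json.load(f)["results"]

zero_shot = load_results(f"{base}_0-shot.json")
five_shot = load_results(f"{base}_few_shot_5-shot.json")

# Compare the primary metric for every task present in both runs
# (drop reports "em"/"f1", the other tasks report "acc"/"acc_stderr").
for task in sorted(set(zero_shot) & set(five_shot)):
    metric = "em" if "em" in zero_shot[task] else "acc"
    print(f"{task:26s} {metric:3s}  0-shot={zero_shot[task][metric]:.4f}  "
          f"5-shot={five_shot[task][metric]:.4f}")
```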
results/llama/llama-30B/llama-30B_mmlu_5-shot.json
{
  "results": {
    "hendrycksTest-high_school_world_history": {"acc": 0.6962025316455697, "acc_stderr": 0.029936696387138598, "acc_norm": 0.569620253164557, "acc_norm_stderr": 0.032230171959375976},
    "hendrycksTest-formal_logic": {"acc": 0.42063492063492064, "acc_stderr": 0.04415438226743743, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.043758884927270605},
    "hendrycksTest-human_aging": {"acc": 0.672645739910314, "acc_stderr": 0.03149384670994131, "acc_norm": 0.3632286995515695, "acc_norm_stderr": 0.032277904428505},
    "hendrycksTest-international_law": {"acc": 0.7024793388429752, "acc_stderr": 0.04173349148083499, "acc_norm": 0.768595041322314, "acc_norm_stderr": 0.03849856098794088},
    "hendrycksTest-security_studies": {"acc": 0.5714285714285714, "acc_stderr": 0.031680911612338825, "acc_norm": 0.40408163265306124, "acc_norm_stderr": 0.0314147080258659},
    "hendrycksTest-medical_genetics": {"acc": 0.6, "acc_stderr": 0.049236596391733084, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332},
    "hendrycksTest-econometrics": {"acc": 0.3508771929824561, "acc_stderr": 0.044895393502707, "acc_norm": 0.3157894736842105, "acc_norm_stderr": 0.043727482902780064},
    "hendrycksTest-high_school_macroeconomics": {"acc": 0.5153846153846153, "acc_stderr": 0.025339003010106515, "acc_norm": 0.4153846153846154, "acc_norm_stderr": 0.024985354923102332},
    "hendrycksTest-us_foreign_policy": {"acc": 0.79, "acc_stderr": 0.040936018074033256, "acc_norm": 0.59, "acc_norm_stderr": 0.049431107042371025},
    "hendrycksTest-logical_fallacies": {"acc": 0.6993865030674846, "acc_stderr": 0.03602511318806771, "acc_norm": 0.5398773006134969, "acc_norm_stderr": 0.039158572914369714},
    "hendrycksTest-prehistory": {"acc": 0.6635802469135802, "acc_stderr": 0.026289734945952926, "acc_norm": 0.42901234567901236, "acc_norm_stderr": 0.027538925613470867},
    "hendrycksTest-professional_psychology": {"acc": 0.5882352941176471, "acc_stderr": 0.019910377463105932, "acc_norm": 0.43300653594771243, "acc_norm_stderr": 0.02004544247332422},
    "hendrycksTest-professional_accounting": {"acc": 0.3971631205673759, "acc_stderr": 0.029189805673587105, "acc_norm": 0.33687943262411346, "acc_norm_stderr": 0.02819553487396673},
    "hendrycksTest-college_biology": {"acc": 0.6111111111111112, "acc_stderr": 0.04076663253918567, "acc_norm": 0.4236111111111111, "acc_norm_stderr": 0.04132125019723369},
    "hendrycksTest-high_school_biology": {"acc": 0.6709677419354839, "acc_stderr": 0.02672949906834996, "acc_norm": 0.5451612903225806, "acc_norm_stderr": 0.028327743091561074},
    "hendrycksTest-philosophy": {"acc": 0.6752411575562701, "acc_stderr": 0.02659678228769704, "acc_norm": 0.5016077170418006, "acc_norm_stderr": 0.02839794490780661},
    "hendrycksTest-high_school_european_history": {"acc": 0.696969696969697, "acc_stderr": 0.03588624800091707, "acc_norm": 0.5636363636363636, "acc_norm_stderr": 0.03872592983524754},
    "hendrycksTest-college_medicine": {"acc": 0.5144508670520231, "acc_stderr": 0.03810871630454764, "acc_norm": 0.43352601156069365, "acc_norm_stderr": 0.03778621079092055},
    "hendrycksTest-professional_medicine": {"acc": 0.5551470588235294, "acc_stderr": 0.03018753206032938, "acc_norm": 0.35661764705882354, "acc_norm_stderr": 0.02909720956841195},
    "hendrycksTest-moral_scenarios": {"acc": 0.34301675977653634, "acc_stderr": 0.015876912673057724, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588},
    "hendrycksTest-high_school_chemistry": {"acc": 0.39901477832512317, "acc_stderr": 0.03445487686264716, "acc_norm": 0.3694581280788177, "acc_norm_stderr": 0.03395970381998573},
    "hendrycksTest-high_school_physics": {"acc": 0.31788079470198677, "acc_stderr": 0.038020397601079024, "acc_norm": 0.31125827814569534, "acc_norm_stderr": 0.03780445850526733},
    "hendrycksTest-high_school_government_and_politics": {"acc": 0.8082901554404145, "acc_stderr": 0.028408953626245282, "acc_norm": 0.6113989637305699, "acc_norm_stderr": 0.03517739796373132},
    "hendrycksTest-high_school_geography": {"acc": 0.7575757575757576, "acc_stderr": 0.030532892233932026, "acc_norm": 0.5505050505050505, "acc_norm_stderr": 0.0354413249194797},
    "hendrycksTest-global_facts": {"acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099},
    "hendrycksTest-professional_law": {"acc": 0.4002607561929596, "acc_stderr": 0.012513582529136213, "acc_norm": 0.3435462842242503, "acc_norm_stderr": 0.012128961174190158},
    "hendrycksTest-college_mathematics": {"acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814},
    "hendrycksTest-college_physics": {"acc": 0.23529411764705882, "acc_stderr": 0.04220773659171452, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.04533838195929774},
    "hendrycksTest-high_school_statistics": {"acc": 0.4351851851851852, "acc_stderr": 0.03381200005643525, "acc_norm": 0.35648148148148145, "acc_norm_stderr": 0.032664783315272714},
    "hendrycksTest-machine_learning": {"acc": 0.4017857142857143, "acc_stderr": 0.04653333146973646, "acc_norm": 0.30357142857142855, "acc_norm_stderr": 0.04364226155841044},
    "hendrycksTest-public_relations": {"acc": 0.6454545454545455, "acc_stderr": 0.045820048415054174, "acc_norm": 0.4090909090909091, "acc_norm_stderr": 0.047093069786618966},
    "hendrycksTest-high_school_computer_science": {"acc": 0.61, "acc_stderr": 0.04902071300001974, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919},
    "hendrycksTest-high_school_psychology": {"acc": 0.7706422018348624, "acc_stderr": 0.018025349724618684, "acc_norm": 0.5541284403669725, "acc_norm_stderr": 0.021311335009708582},
    "hendrycksTest-virology": {"acc": 0.4939759036144578, "acc_stderr": 0.03892212195333045, "acc_norm": 0.3433734939759036, "acc_norm_stderr": 0.03696584317010601},
    "hendrycksTest-marketing": {"acc": 0.8461538461538461, "acc_stderr": 0.023636873317489294, "acc_norm": 0.7649572649572649, "acc_norm_stderr": 0.027778835904935437},
    "hendrycksTest-human_sexuality": {"acc": 0.7022900763358778, "acc_stderr": 0.04010358942462203, "acc_norm": 0.46564885496183206, "acc_norm_stderr": 0.04374928560599738},
    "hendrycksTest-sociology": {"acc": 0.7611940298507462, "acc_stderr": 0.03014777593540922, "acc_norm": 0.6616915422885572, "acc_norm_stderr": 0.033455630703391914},
    "hendrycksTest-college_computer_science": {"acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236},
    "hendrycksTest-conceptual_physics": {"acc": 0.5106382978723404, "acc_stderr": 0.03267862331014063, "acc_norm": 0.3276595744680851, "acc_norm_stderr": 0.030683020843231004},
    "hendrycksTest-anatomy": {"acc": 0.5185185185185185, "acc_stderr": 0.043163785995113245, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.04244633238353228},
    "hendrycksTest-miscellaneous": {"acc": 0.8186462324393359, "acc_stderr": 0.013778693778464062, "acc_norm": 0.6143039591315453, "acc_norm_stderr": 0.017406476619212907},
    "hendrycksTest-jurisprudence": {"acc": 0.6666666666666666, "acc_stderr": 0.04557239513497751, "acc_norm": 0.5555555555555556, "acc_norm_stderr": 0.04803752235190193},
    "hendrycksTest-moral_disputes": {"acc": 0.6184971098265896, "acc_stderr": 0.026152198619726792, "acc_norm": 0.4595375722543353, "acc_norm_stderr": 0.026830805998952236},
    "hendrycksTest-high_school_us_history": {"acc": 0.7205882352941176, "acc_stderr": 0.031493281045079556, "acc_norm": 0.553921568627451, "acc_norm_stderr": 0.03488845451304974},
    "hendrycksTest-high_school_mathematics": {"acc": 0.25925925925925924, "acc_stderr": 0.026719240783712177, "acc_norm": 0.3148148148148148, "acc_norm_stderr": 0.02831753349606648},
    "hendrycksTest-high_school_microeconomics": {"acc": 0.5840336134453782, "acc_stderr": 0.032016501007396114, "acc_norm": 0.4831932773109244, "acc_norm_stderr": 0.03246013680375308},
    "hendrycksTest-astronomy": {"acc": 0.5723684210526315, "acc_stderr": 0.04026097083296564, "acc_norm": 0.5657894736842105, "acc_norm_stderr": 0.04033565667848319},
    "hendrycksTest-world_religions": {"acc": 0.8128654970760234, "acc_stderr": 0.029913127232368043, "acc_norm": 0.7660818713450293, "acc_norm_stderr": 0.03246721765117825},
    "hendrycksTest-clinical_knowledge": {"acc": 0.5320754716981132, "acc_stderr": 0.03070948699255654, "acc_norm": 0.4641509433962264, "acc_norm_stderr": 0.030693675018458003},
    "hendrycksTest-college_chemistry": {"acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034},
    "hendrycksTest-abstract_algebra": {"acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684},
    "hendrycksTest-business_ethics": {"acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795},
    "hendrycksTest-elementary_mathematics": {"acc": 0.4417989417989418, "acc_stderr": 0.02557625706125384, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.024870815251057075},
    "hendrycksTest-management": {"acc": 0.7184466019417476, "acc_stderr": 0.044532548363264673, "acc_norm": 0.5533980582524272, "acc_norm_stderr": 0.04922424153458933},
    "hendrycksTest-electrical_engineering": {"acc": 0.5172413793103449, "acc_stderr": 0.04164188720169375, "acc_norm": 0.38620689655172413, "acc_norm_stderr": 0.040573247344190336},
    "hendrycksTest-nutrition": {"acc": 0.6111111111111112, "acc_stderr": 0.02791405551046801, "acc_norm": 0.5032679738562091, "acc_norm_stderr": 0.028629305194003543},
    "hendrycksTest-computer_security": {"acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836}
  },
  "versions": {
    "hendrycksTest-high_school_world_history": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-human_aging": 0,
    "hendrycksTest-international_law": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-medical_genetics": 0,
    "hendrycksTest-econometrics": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-us_foreign_policy": 0,
    "hendrycksTest-logical_fallacies": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-professional_psychology": 0,
    "hendrycksTest-professional_accounting": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-high_school_biology": 0,
    "hendrycksTest-philosophy": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-college_medicine": 0,
    "hendrycksTest-professional_medicine": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-high_school_chemistry": 0,
    "hendrycksTest-high_school_physics": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-high_school_geography": 0,
    "hendrycksTest-global_facts": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-college_mathematics": 0,
    "hendrycksTest-college_physics": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-machine_learning": 0,
    "hendrycksTest-public_relations": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-high_school_psychology": 0,
    "hendrycksTest-virology": 0, "hendrycksTest-marketing": 0, "hendrycksTest-human_sexuality": 0,
    "hendrycksTest-sociology": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-conceptual_physics": 0,
    "hendrycksTest-anatomy": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-jurisprudence": 0,
    "hendrycksTest-moral_disputes": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_mathematics": 0,
    "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-world_religions": 0,
    "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-abstract_algebra": 0,
    "hendrycksTest-business_ethics": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-management": 0,
    "hendrycksTest-electrical_engineering": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-computer_security": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
    "num_fewshot": 5,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
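The harness reports each of the 57 MMLU subjects separately and does not write an aggregate score into this file, so a summary number has to be computed downstream. A small sketch, assuming a plain unweighted mean over the hendrycksTest-* subtasks (a size-weighted mean would need the per-subject example counts, which are not stored here):

```python
import json

path = "results/llama/llama-30B/llama-30B_mmlu_5-shot.json"  # file shown above

with open(path) as f:
    report = json.load(f)

mmlu = {task: m for task, m in report["results"].items()
        if task.startswith("hendrycksTest-")}

# Unweighted macro-average over subjects; per-subject stderrs are not combined here.
mean_acc = sum(m["acc"] for m in mmlu.values()) / len(mmlu)
mean_acc_norm = sum(m["acc_norm"] for m in mmlu.values()) / len(mmlu)

print(f"{len(mmlu)} subjects  mean acc={mean_acc:.4f}  mean acc_norm={mean_acc_norm:.4f}")
```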