Commit 4a0b0d6e authored by lintangsutawika

Merge branch 'gakada-big-refactor-merge' into big-refactor

parents 6ae376e3 c490f165
{
"results": {
"lambada_openai": {
"ppl": 1279051.053451683,
"ppl_stderr": 60995.62964377304,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_mt_de": {
"ppl": 1310285.4433720284,
"ppl_stderr": 71395.90633942866,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_mt_it": {
"ppl": 4091504.352954044,
"ppl_stderr": 218020.965277226,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_standard": {
"ppl": 1409047.9981006894,
"ppl_stderr": 47832.883755899915,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_standard_cloze": {
"ppl": 4235345.031433833,
"ppl_stderr": 132892.5654001927,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_mt_fr": {
"ppl": 2461448.491005768,
"ppl_stderr": 128013.98724687536,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_cloze": {
"ppl": 204515.38946166556,
"ppl_stderr": 9705.341358126625,
"acc": 0.00019406171162429653,
"acc_stderr": 0.00019406171162430135
},
"lambada_openai_mt_en": {
"ppl": 1279051.053451683,
"ppl_stderr": 60995.62964377304,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_mt_es": {
"ppl": 1980241.7718905837,
"ppl_stderr": 101614.2034914904,
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"lambada_openai": 0,
"lambada_openai_mt_de": 0,
"lambada_openai_mt_it": 0,
"lambada_standard": 0,
"lambada_standard_cloze": 0,
"lambada_openai_mt_fr": 0,
"lambada_openai_cloze": 0,
"lambada_openai_mt_en": 0,
"lambada_openai_mt_es": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
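The block above is one of the raw lm-evaluation-harness result files added in this merge; its `config` section records how the run was launched (the `hf-causal-experimental` backend, `model_args` pointing at the local LLaMA-13B checkpoint, zero-shot, automatic batch size). As an illustrative sketch only (the filename below is hypothetical and not part of this commit), a file in this format can be loaded and summarized with a few lines of Python:

```python
import json

# Hypothetical filename; each results file in this commit is one JSON object.
with open("llama-13B_lambada_0-shot.json") as f:
    data = json.load(f)

print("fewshot:", data["config"]["num_fewshot"])
for task, metrics in sorted(data["results"].items()):
    # Each task maps metric names to values, plus "<metric>_stderr" entries.
    summary = ", ".join(
        f"{name}={value:.4f}"
        for name, value in metrics.items()
        if not name.endswith("_stderr")
    )
    print(f"{task}: {summary}")
```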
{
"results": {
"math_prealgebra": {
"acc": 0.02870264064293915,
"acc_stderr": 0.0056607946989983855
},
"math_num_theory": {
"acc": 0.014814814814814815,
"acc_stderr": 0.005203704987512651
},
"drop": {
"em": 0.0388003355704698,
"em_stderr": 0.0019777172311177993,
"f1": 0.13990771812080444,
"f1_stderr": 0.002512880034517493
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_intermediate_algebra": {
"acc": 0.012181616832779624,
"acc_stderr": 0.0036524791938863576
},
"math_algebra": {
"acc": 0.018534119629317607,
"acc_stderr": 0.003916347676363957
},
"math_counting_and_prob": {
"acc": 0.014767932489451477,
"acc_stderr": 0.0055462385896684775
},
"math_geometry": {
"acc": 0.012526096033402923,
"acc_stderr": 0.005086941389677977
},
"math_precalc": {
"acc": 0.01098901098901099,
"acc_stderr": 0.004465618427331416
},
"mathqa": {
"acc": 0.28442211055276384,
"acc_stderr": 0.008258681628795297,
"acc_norm": 0.28676716917922945,
"acc_norm_stderr": 0.00827905882129993
},
"math_asdiv": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"math_prealgebra": 1,
"math_num_theory": 1,
"drop": 1,
"mathqa": 0,
"gsm8k": 0,
"math_intermediate_algebra": 1,
"math_algebra": 1,
"math_counting_and_prob": 1,
"math_geometry": 1,
"math_precalc": 1,
"math_asdiv": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"math_prealgebra": {
"acc": 0.001148105625717566,
"acc_stderr": 0.0011481056257175704
},
"drop": {
"em": 0.01709312080536913,
"em_stderr": 0.001327414384722433,
"f1": 0.024450503355704672,
"f1_stderr": 0.001413124400630544
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_counting_and_prob": {
"acc": 0.002109704641350211,
"acc_stderr": 0.0021097046413502104
},
"math_num_theory": {
"acc": 0.001851851851851852,
"acc_stderr": 0.0018518518518518502
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_geometry": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"mathqa": {
"acc": 0.2998324958123953,
"acc_stderr": 0.008387661895516162,
"acc_norm": 0.3035175879396985,
"acc_norm_stderr": 0.008416811454701563
}
},
"versions": {
"math_prealgebra": 1,
"drop": 1,
"mathqa": 0,
"math_intermediate_algebra": 1,
"math_counting_and_prob": 1,
"math_num_theory": 1,
"gsm8k": 0,
"math_geometry": 1,
"math_algebra": 1,
"math_precalc": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"hendrycksTest-college_biology": {
"acc": 0.4583333333333333,
"acc_stderr": 0.04166666666666665,
"acc_norm": 0.3263888888888889,
"acc_norm_stderr": 0.03921067198982266
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.46037735849056605,
"acc_stderr": 0.030676096599389188,
"acc_norm": 0.3849056603773585,
"acc_norm_stderr": 0.029946498567699948
},
"hendrycksTest-high_school_european_history": {
"acc": 0.5272727272727272,
"acc_stderr": 0.03898531605579418,
"acc_norm": 0.49696969696969695,
"acc_norm_stderr": 0.03904272341431855
},
"hendrycksTest-high_school_psychology": {
"acc": 0.6073394495412844,
"acc_stderr": 0.02093750516120109,
"acc_norm": 0.3688073394495413,
"acc_norm_stderr": 0.020686227560729537
},
"hendrycksTest-business_ethics": {
"acc": 0.53,
"acc_stderr": 0.05016135580465919,
"acc_norm": 0.44,
"acc_norm_stderr": 0.04988876515698589
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.5854922279792746,
"acc_stderr": 0.035553003195576686,
"acc_norm": 0.38860103626943004,
"acc_norm_stderr": 0.03517739796373132
},
"hendrycksTest-security_studies": {
"acc": 0.45714285714285713,
"acc_stderr": 0.03189141832421396,
"acc_norm": 0.37551020408163266,
"acc_norm_stderr": 0.03100120903989484
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.3769230769230769,
"acc_stderr": 0.024570975364225995,
"acc_norm": 0.31794871794871793,
"acc_norm_stderr": 0.02361088430892786
},
"hendrycksTest-sociology": {
"acc": 0.582089552238806,
"acc_stderr": 0.034875586404620636,
"acc_norm": 0.4577114427860697,
"acc_norm_stderr": 0.035228658640995975
},
"hendrycksTest-college_mathematics": {
"acc": 0.29,
"acc_stderr": 0.04560480215720683,
"acc_norm": 0.34,
"acc_norm_stderr": 0.04760952285695235
},
"hendrycksTest-professional_accounting": {
"acc": 0.2978723404255319,
"acc_stderr": 0.02728160834446941,
"acc_norm": 0.2801418439716312,
"acc_norm_stderr": 0.02678917235114023
},
"hendrycksTest-anatomy": {
"acc": 0.42962962962962964,
"acc_stderr": 0.04276349494376599,
"acc_norm": 0.2962962962962963,
"acc_norm_stderr": 0.03944624162501116
},
"hendrycksTest-professional_psychology": {
"acc": 0.42320261437908496,
"acc_stderr": 0.019987809769482067,
"acc_norm": 0.3300653594771242,
"acc_norm_stderr": 0.01902372616072456
},
"hendrycksTest-moral_scenarios": {
"acc": 0.28268156424581004,
"acc_stderr": 0.015060381730018082,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-conceptual_physics": {
"acc": 0.42127659574468085,
"acc_stderr": 0.03227834510146268,
"acc_norm": 0.2425531914893617,
"acc_norm_stderr": 0.028020226271200217
},
"hendrycksTest-virology": {
"acc": 0.40963855421686746,
"acc_stderr": 0.03828401115079021,
"acc_norm": 0.30120481927710846,
"acc_norm_stderr": 0.035716092300534796
},
"hendrycksTest-world_religions": {
"acc": 0.7426900584795322,
"acc_stderr": 0.03352799844161865,
"acc_norm": 0.6491228070175439,
"acc_norm_stderr": 0.03660298834049162
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.49,
"acc_stderr": 0.05024183937956911,
"acc_norm": 0.41,
"acc_norm_stderr": 0.049431107042371025
},
"hendrycksTest-abstract_algebra": {
"acc": 0.32,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-medical_genetics": {
"acc": 0.49,
"acc_stderr": 0.05024183937956911,
"acc_norm": 0.48,
"acc_norm_stderr": 0.050211673156867795
},
"hendrycksTest-nutrition": {
"acc": 0.45098039215686275,
"acc_stderr": 0.02849199358617156,
"acc_norm": 0.4673202614379085,
"acc_norm_stderr": 0.02856869975222588
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.36772486772486773,
"acc_stderr": 0.024833839825562424,
"acc_norm": 0.328042328042328,
"acc_norm_stderr": 0.024180497164376907
},
"hendrycksTest-philosophy": {
"acc": 0.45980707395498394,
"acc_stderr": 0.028306190403305696,
"acc_norm": 0.3858520900321543,
"acc_norm_stderr": 0.02764814959975146
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.42016806722689076,
"acc_stderr": 0.03206183783236152,
"acc_norm": 0.40756302521008403,
"acc_norm_stderr": 0.031918633744784645
},
"hendrycksTest-management": {
"acc": 0.6407766990291263,
"acc_stderr": 0.04750458399041696,
"acc_norm": 0.4174757281553398,
"acc_norm_stderr": 0.048828405482122375
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.68,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.52,
"acc_norm_stderr": 0.050211673156867795
},
"hendrycksTest-international_law": {
"acc": 0.5619834710743802,
"acc_stderr": 0.04529146804435792,
"acc_norm": 0.6033057851239669,
"acc_norm_stderr": 0.044658697805310094
},
"hendrycksTest-college_chemistry": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.26666666666666666,
"acc_stderr": 0.026962424325073817,
"acc_norm": 0.31851851851851853,
"acc_norm_stderr": 0.028406533090608463
},
"hendrycksTest-high_school_world_history": {
"acc": 0.4978902953586498,
"acc_stderr": 0.032546938018020076,
"acc_norm": 0.42616033755274263,
"acc_norm_stderr": 0.03219035703131774
},
"hendrycksTest-human_sexuality": {
"acc": 0.549618320610687,
"acc_stderr": 0.04363643698524779,
"acc_norm": 0.3969465648854962,
"acc_norm_stderr": 0.04291135671009224
},
"hendrycksTest-college_computer_science": {
"acc": 0.33,
"acc_stderr": 0.047258156262526045,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421276
},
"hendrycksTest-college_medicine": {
"acc": 0.4277456647398844,
"acc_stderr": 0.037724468575180255,
"acc_norm": 0.30057803468208094,
"acc_norm_stderr": 0.0349610148119118
},
"hendrycksTest-formal_logic": {
"acc": 0.3253968253968254,
"acc_stderr": 0.041905964388711366,
"acc_norm": 0.3412698412698413,
"acc_norm_stderr": 0.04240799327574925
},
"hendrycksTest-high_school_physics": {
"acc": 0.271523178807947,
"acc_stderr": 0.03631329803969653,
"acc_norm": 0.25165562913907286,
"acc_norm_stderr": 0.035433042343899844
},
"hendrycksTest-marketing": {
"acc": 0.7264957264957265,
"acc_stderr": 0.029202540153431173,
"acc_norm": 0.6153846153846154,
"acc_norm_stderr": 0.03187195347942466
},
"hendrycksTest-jurisprudence": {
"acc": 0.48148148148148145,
"acc_stderr": 0.04830366024635331,
"acc_norm": 0.5,
"acc_norm_stderr": 0.04833682445228318
},
"hendrycksTest-computer_security": {
"acc": 0.57,
"acc_stderr": 0.049756985195624284,
"acc_norm": 0.44,
"acc_norm_stderr": 0.04988876515698589
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.3103448275862069,
"acc_stderr": 0.03255086769970103,
"acc_norm": 0.32019704433497537,
"acc_norm_stderr": 0.032826493853041504
},
"hendrycksTest-prehistory": {
"acc": 0.49691358024691357,
"acc_stderr": 0.02782021415859437,
"acc_norm": 0.345679012345679,
"acc_norm_stderr": 0.026462487777001876
},
"hendrycksTest-machine_learning": {
"acc": 0.2857142857142857,
"acc_stderr": 0.04287858751340455,
"acc_norm": 0.29464285714285715,
"acc_norm_stderr": 0.043270409325787296
},
"hendrycksTest-professional_medicine": {
"acc": 0.39338235294117646,
"acc_stderr": 0.02967428828131118,
"acc_norm": 0.33088235294117646,
"acc_norm_stderr": 0.028582709753898452
},
"hendrycksTest-global_facts": {
"acc": 0.34,
"acc_stderr": 0.04760952285695235,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"hendrycksTest-high_school_us_history": {
"acc": 0.5245098039215687,
"acc_stderr": 0.03505093194348798,
"acc_norm": 0.37254901960784315,
"acc_norm_stderr": 0.033933885849584046
},
"hendrycksTest-high_school_geography": {
"acc": 0.5757575757575758,
"acc_stderr": 0.03521224908841586,
"acc_norm": 0.42424242424242425,
"acc_norm_stderr": 0.03521224908841583
},
"hendrycksTest-human_aging": {
"acc": 0.5739910313901345,
"acc_stderr": 0.033188332862172806,
"acc_norm": 0.336322869955157,
"acc_norm_stderr": 0.03170882426845501
},
"hendrycksTest-high_school_biology": {
"acc": 0.4967741935483871,
"acc_stderr": 0.028443414226438316,
"acc_norm": 0.36129032258064514,
"acc_norm_stderr": 0.027327548447957553
},
"hendrycksTest-public_relations": {
"acc": 0.5454545454545454,
"acc_stderr": 0.04769300568972744,
"acc_norm": 0.2909090909090909,
"acc_norm_stderr": 0.04350271442923243
},
"hendrycksTest-professional_law": {
"acc": 0.30378096479791394,
"acc_stderr": 0.011745787720472483,
"acc_norm": 0.3089960886571056,
"acc_norm_stderr": 0.011801729777239246
},
"hendrycksTest-electrical_engineering": {
"acc": 0.41379310344827586,
"acc_stderr": 0.041042692118062316,
"acc_norm": 0.3448275862068966,
"acc_norm_stderr": 0.039609335494512087
},
"hendrycksTest-logical_fallacies": {
"acc": 0.4539877300613497,
"acc_stderr": 0.0391170190467718,
"acc_norm": 0.36809815950920244,
"acc_norm_stderr": 0.03789213935838396
},
"hendrycksTest-moral_disputes": {
"acc": 0.4479768786127168,
"acc_stderr": 0.026772990653361816,
"acc_norm": 0.3815028901734104,
"acc_norm_stderr": 0.0261521986197268
},
"hendrycksTest-high_school_statistics": {
"acc": 0.38425925925925924,
"acc_stderr": 0.03317354514310742,
"acc_norm": 0.375,
"acc_norm_stderr": 0.033016908987210894
},
"hendrycksTest-college_physics": {
"acc": 0.28431372549019607,
"acc_stderr": 0.04488482852329017,
"acc_norm": 0.35294117647058826,
"acc_norm_stderr": 0.04755129616062947
},
"hendrycksTest-econometrics": {
"acc": 0.2719298245614035,
"acc_stderr": 0.04185774424022056,
"acc_norm": 0.2631578947368421,
"acc_norm_stderr": 0.041424397194893624
},
"hendrycksTest-miscellaneous": {
"acc": 0.6960408684546615,
"acc_stderr": 0.016448321686769043,
"acc_norm": 0.48531289910600256,
"acc_norm_stderr": 0.01787224802442912
},
"hendrycksTest-astronomy": {
"acc": 0.48026315789473684,
"acc_stderr": 0.04065771002562603,
"acc_norm": 0.48026315789473684,
"acc_norm_stderr": 0.040657710025626036
}
},
"versions": {
"hendrycksTest-college_biology": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-management": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-astronomy": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"pawsx_fr": {
"acc": 0.545,
"acc_stderr": 0.011137752231145222
},
"pawsx_en": {
"acc": 0.537,
"acc_stderr": 0.011152474561478174
},
"pawsx_ko": {
"acc": 0.4705,
"acc_stderr": 0.011163654804511664
},
"pawsx_ja": {
"acc": 0.45,
"acc_stderr": 0.011127079848413735
},
"pawsx_es": {
"acc": 0.521,
"acc_stderr": 0.011173268141438304
},
"pawsx_de": {
"acc": 0.5295,
"acc_stderr": 0.011163654804511655
},
"pawsx_zh": {
"acc": 0.452,
"acc_stderr": 0.01113148485052578
}
},
"versions": {
"pawsx_fr": 0,
"pawsx_en": 0,
"pawsx_ko": 0,
"pawsx_ja": 0,
"pawsx_es": 0,
"pawsx_de": 0,
"pawsx_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"triviaqa": {
"acc": 0.0,
"acc_stderr": 0.0
},
"headqa_es": {
"acc": 0.3056163384390955,
"acc_stderr": 0.008799003959214539,
"acc_norm": 0.3515681983953319,
"acc_norm_stderr": 0.009119739372039878
},
"logiqa": {
"acc": 0.2642089093701997,
"acc_stderr": 0.017293954549744514,
"acc_norm": 0.3210445468509985,
"acc_norm_stderr": 0.018312456701476108
},
"headqa_en": {
"acc": 0.34427425237053244,
"acc_stderr": 0.009075255747504299,
"acc_norm": 0.38584974471188915,
"acc_norm_stderr": 0.009298050684004381
},
"truthfulqa_mc": {
"mc1": 0.2582619339045288,
"mc1_stderr": 0.0153218216884762,
"mc2": 0.39884734031519786,
"mc2_stderr": 0.013703865869126058
},
"squad2": {
"exact": 16.440663690726858,
"f1": 24.060945088960178,
"HasAns_exact": 21.086369770580298,
"HasAns_f1": 36.34878560074651,
"NoAns_exact": 11.808242220353238,
"NoAns_f1": 11.808242220353238,
"best_exact": 50.07159100480081,
"best_f1": 50.073888042388
},
"webqs": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"triviaqa": 1,
"headqa_es": 0,
"logiqa": 0,
"headqa_en": 0,
"truthfulqa_mc": 1,
"squad2": 1,
"webqs": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"coqa": {
"f1": 0.7704068983762044,
"f1_stderr": 0.014191975492335083,
"em": 0.637,
"em_stderr": 0.01847461201879917
},
"drop": {
"em": 0.035864093959731544,
"em_stderr": 0.0019043146639119552,
"f1": 0.13376153523489834,
"f1_stderr": 0.002439665460318613
},
"race": {
"acc": 0.39330143540669854,
"acc_stderr": 0.01511816218614914
}
},
"versions": {
"coqa": 1,
"drop": 1,
"race": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"boolq": {
"acc": 0.6844036697247706,
"acc_stderr": 0.008128579858785895
},
"wic": {
"acc": 0.49843260188087773,
"acc_stderr": 0.019810623954060382
},
"copa": {
"acc": 0.9,
"acc_stderr": 0.030151134457776348
},
"wsc": {
"acc": 0.3557692307692308,
"acc_stderr": 0.04717221961050337
},
"cb": {
"acc": 0.48214285714285715,
"acc_stderr": 0.0673769750864465,
"f1": 0.3881876266167991
},
"record": {
"f1": 0.9231828571428571,
"f1_stderr": 0.0026119602574627677,
"em": 0.9154,
"em_stderr": 0.002782994521347745
},
"multirc": {
"acc": 0.015739769150052464,
"acc_stderr": 0.00403399795659578
}
},
"versions": {
"boolq": 1,
"wic": 0,
"copa": 0,
"wsc": 0,
"cb": 1,
"record": 0,
"multirc": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xcopa_it": {
"acc": 0.672,
"acc_stderr": 0.021017027165175485
},
"xcopa_vi": {
"acc": 0.538,
"acc_stderr": 0.02231833811987053
},
"xcopa_zh": {
"acc": 0.584,
"acc_stderr": 0.02206494331392886
},
"xcopa_ta": {
"acc": 0.544,
"acc_stderr": 0.022296238348407053
},
"xcopa_sw": {
"acc": 0.512,
"acc_stderr": 0.02237662679792717
},
"xcopa_id": {
"acc": 0.578,
"acc_stderr": 0.022109039310618552
},
"xcopa_tr": {
"acc": 0.53,
"acc_stderr": 0.02234274819250285
},
"xcopa_ht": {
"acc": 0.528,
"acc_stderr": 0.02234794983266809
},
"xcopa_qu": {
"acc": 0.502,
"acc_stderr": 0.02238289498648353
},
"xcopa_th": {
"acc": 0.546,
"acc_stderr": 0.022288147591176945
},
"xcopa_et": {
"acc": 0.482,
"acc_stderr": 0.02236856511738799
}
},
"versions": {
"xcopa_it": 0,
"xcopa_vi": 0,
"xcopa_zh": 0,
"xcopa_ta": 0,
"xcopa_sw": 0,
"xcopa_id": 0,
"xcopa_tr": 0,
"xcopa_ht": 0,
"xcopa_qu": 0,
"xcopa_th": 0,
"xcopa_et": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xnli_ru": {
"acc": 0.3379241516966068,
"acc_stderr": 0.006683254094065008
},
"xnli_vi": {
"acc": 0.34211576846307384,
"acc_stderr": 0.006703255428996599
},
"xnli_zh": {
"acc": 0.3447105788423154,
"acc_stderr": 0.006715345603576115
},
"xnli_bg": {
"acc": 0.34211576846307384,
"acc_stderr": 0.0067032554289965995
},
"xnli_el": {
"acc": 0.3469061876247505,
"acc_stderr": 0.0067254026681375706
},
"xnli_fr": {
"acc": 0.3349301397205589,
"acc_stderr": 0.006668608672768922
},
"xnli_ur": {
"acc": 0.34211576846307384,
"acc_stderr": 0.006703255428996604
},
"xnli_hi": {
"acc": 0.35588822355289423,
"acc_stderr": 0.00676490827777005
},
"xnli_es": {
"acc": 0.3349301397205589,
"acc_stderr": 0.006668608672768919
},
"xnli_sw": {
"acc": 0.3315369261477046,
"acc_stderr": 0.006651646309907708
},
"xnli_th": {
"acc": 0.34830339321357284,
"acc_stderr": 0.006731720358995404
},
"xnli_ar": {
"acc": 0.3407185628742515,
"acc_stderr": 0.006696653153866837
},
"xnli_en": {
"acc": 0.3562874251497006,
"acc_stderr": 0.006766603483662201
},
"xnli_de": {
"acc": 0.3524950099800399,
"acc_stderr": 0.006750291549188483
},
"xnli_tr": {
"acc": 0.3399201596806387,
"acc_stderr": 0.006692851356332768
}
},
"versions": {
"xnli_ru": 0,
"xnli_vi": 0,
"xnli_zh": 0,
"xnli_bg": 0,
"xnli_el": 0,
"xnli_fr": 0,
"xnli_ur": 0,
"xnli_hi": 0,
"xnli_es": 0,
"xnli_sw": 0,
"xnli_th": 0,
"xnli_ar": 0,
"xnli_en": 0,
"xnli_de": 0,
"xnli_tr": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xstory_cloze_zh": {
"acc": 0.5645268034414295,
"acc_stderr": 0.012759525506489228
},
"xstory_cloze_my": {
"acc": 0.47782925215089345,
"acc_stderr": 0.012854469625936085
},
"xstory_cloze_id": {
"acc": 0.5526141628060887,
"acc_stderr": 0.012795688167385315
},
"xstory_cloze_te": {
"acc": 0.5334215751158173,
"acc_stderr": 0.012838347934731667
},
"xstory_cloze_ar": {
"acc": 0.49702183984116477,
"acc_stderr": 0.012866897066011233
},
"xstory_cloze_sw": {
"acc": 0.4990072799470549,
"acc_stderr": 0.01286709995542293
},
"xstory_cloze_hi": {
"acc": 0.5234943745863666,
"acc_stderr": 0.012852912530051748
},
"xstory_cloze_eu": {
"acc": 0.5069490403706155,
"acc_stderr": 0.012865882570960722
},
"xstory_cloze_en": {
"acc": 0.7729980145598941,
"acc_stderr": 0.010779920137756025
},
"xstory_cloze_es": {
"acc": 0.6942422236929185,
"acc_stderr": 0.011856480568871262
},
"xstory_cloze_ru": {
"acc": 0.6340172071475844,
"acc_stderr": 0.012396308684399372
}
},
"versions": {
"xstory_cloze_zh": 0,
"xstory_cloze_my": 0,
"xstory_cloze_id": 0,
"xstory_cloze_te": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_eu": 0,
"xstory_cloze_en": 0,
"xstory_cloze_es": 0,
"xstory_cloze_ru": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_pt": {
"acc": 0.714828897338403,
"acc_stderr": 0.02789350966043832
},
"xwinograd_jp": {
"acc": 0.5985401459854015,
"acc_stderr": 0.01583743878453324
},
"xwinograd_en": {
"acc": 0.8675268817204301,
"acc_stderr": 0.007032136436579812
},
"xwinograd_ru": {
"acc": 0.707936507936508,
"acc_stderr": 0.02566084582577463
},
"xwinograd_zh": {
"acc": 0.7003968253968254,
"acc_stderr": 0.020424963888406065
},
"xwinograd_fr": {
"acc": 0.6867469879518072,
"acc_stderr": 0.051219942106581456
}
},
"versions": {
"xwinograd_pt": 0,
"xwinograd_jp": 0,
"xwinograd_en": 0,
"xwinograd_ru": 0,
"xwinograd_zh": 0,
"xwinograd_fr": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
# llama-30B
## llama-30B_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|57.37|± | 3.60|
|bigbench_date_understanding | 0|multiple_choice_grade|69.92|± | 2.39|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|54.26|± | 3.11|
|bigbench_dyck_languages | 0|multiple_choice_grade|21.20|± | 1.29|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.58|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|27.86|± | 2.37|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|51.52|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|36.80|± | 2.16|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|25.29|± | 1.64|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|53.00|± | 2.89|
|bigbench_movie_recommendation | 0|multiple_choice_grade|63.20|± | 2.16|
|bigbench_navigate | 0|multiple_choice_grade|49.00|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|55.65|± | 1.11|
|bigbench_ruin_names | 0|multiple_choice_grade|39.73|± | 2.31|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.84|± | 1.26|
|bigbench_snarks | 0|multiple_choice_grade|46.96|± | 3.72|
|bigbench_sports_understanding | 0|multiple_choice_grade|62.37|± | 1.54|
|bigbench_temporal_sequences | 0|multiple_choice_grade|14.60|± | 1.12|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|21.28|± | 1.16|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|15.49|± | 0.87|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|53.00|± | 2.89|
## llama-30B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |46.76|± | 1.46|
| | |acc_norm|45.48|± | 1.46|
|arc_easy | 0|acc |75.34|± | 0.88|
| | |acc_norm|58.96|± | 1.01|
|boolq | 1|acc |68.41|± | 0.81|
|copa | 0|acc |90.00|± | 3.02|
|hellaswag | 0|acc |62.65|± | 0.48|
| | |acc_norm|79.24|± | 0.40|
|mc_taco | 0|em |11.41| | |
| | |f1 |48.36| | |
|openbookqa | 0|acc |29.40|± | 2.04|
| | |acc_norm|42.00|± | 2.21|
|piqa | 0|acc |80.96|± | 0.92|
| | |acc_norm|80.09|± | 0.93|
|prost | 0|acc |25.99|± | 0.32|
| | |acc_norm|29.11|± | 0.33|
|swag | 0|acc |58.61|± | 0.35|
| | |acc_norm|70.36|± | 0.32|
|winogrande | 0|acc |72.77|± | 1.25|
|wsc273 | 0|acc |86.81|± | 2.05|
## llama-30B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc |30.48|± | 1.27|
## llama-30B_human_alignment_0-shot.json
| Task |Version| Metric | Value | |Stderr|
|---------------------------------------|------:|---------------------|------:|---|-----:|
|crows_pairs_english_age | 0|likelihood_difference| 512.91|± | 58.13|
| | |pct_stereotype | 58.24|± | 5.20|
|crows_pairs_english_autre | 0|likelihood_difference|1138.07|± |348.77|
| | |pct_stereotype | 63.64|± | 15.21|
|crows_pairs_english_disability | 0|likelihood_difference| 888.65|± |103.42|
| | |pct_stereotype | 53.85|± | 6.23|
|crows_pairs_english_gender | 0|likelihood_difference| 666.15|± | 42.85|
| | |pct_stereotype | 54.06|± | 2.79|
|crows_pairs_english_nationality | 0|likelihood_difference| 587.28|± | 39.94|
| | |pct_stereotype | 53.24|± | 3.40|
|crows_pairs_english_physical_appearance| 0|likelihood_difference| 540.10|± | 59.14|
| | |pct_stereotype | 52.78|± | 5.92|
|crows_pairs_english_race_color | 0|likelihood_difference| 768.21|± | 39.14|
| | |pct_stereotype | 56.10|± | 2.20|
|crows_pairs_english_religion | 0|likelihood_difference| 807.57|± | 94.38|
| | |pct_stereotype | 62.16|± | 4.62|
|crows_pairs_english_sexual_orientation | 0|likelihood_difference| 754.77|± | 76.83|
| | |pct_stereotype | 63.44|± | 5.02|
|crows_pairs_english_socioeconomic | 0|likelihood_difference| 730.39|± | 54.63|
| | |pct_stereotype | 53.68|± | 3.63|
|crows_pairs_french_age | 0|likelihood_difference| 892.50|± |101.09|
| | |pct_stereotype | 40.00|± | 5.19|
|crows_pairs_french_autre | 0|likelihood_difference| 637.98|± |165.68|
| | |pct_stereotype | 61.54|± | 14.04|
|crows_pairs_french_disability | 0|likelihood_difference|1020.27|± |126.17|
| | |pct_stereotype | 56.06|± | 6.16|
|crows_pairs_french_gender | 0|likelihood_difference|1373.28|± |110.30|
| | |pct_stereotype | 50.16|± | 2.80|
|crows_pairs_french_nationality | 0|likelihood_difference| 985.10|± | 89.08|
| | |pct_stereotype | 38.74|± | 3.07|
|crows_pairs_french_physical_appearance | 0|likelihood_difference| 821.79|± |132.68|
| | |pct_stereotype | 56.94|± | 5.88|
|crows_pairs_french_race_color | 0|likelihood_difference|1061.17|± | 76.68|
| | |pct_stereotype | 41.74|± | 2.30|
|crows_pairs_french_religion | 0|likelihood_difference| 794.02|± | 93.89|
| | |pct_stereotype | 56.52|± | 4.64|
|crows_pairs_french_sexual_orientation | 0|likelihood_difference| 989.08|± |161.13|
| | |pct_stereotype | 71.43|± | 4.76|
|crows_pairs_french_socioeconomic | 0|likelihood_difference| 831.29|± | 87.37|
| | |pct_stereotype | 52.55|± | 3.58|
|ethics_cm | 0|acc | 57.50|± | 0.79|
|ethics_deontology | 0|acc | 54.17|± | 0.83|
| | |em | 6.12| | |
|ethics_justice | 0|acc | 51.70|± | 0.96|
| | |em | 1.33| | |
|ethics_utilitarianism | 0|acc | 50.12|± | 0.72|
|ethics_utilitarianism_original | 0|acc | 93.97|± | 0.34|
|ethics_virtue | 0|acc | 51.82|± | 0.71|
| | |em | 8.14| | |
|toxigen | 0|acc | 42.66|± | 1.61|
| | |acc_norm | 43.19|± | 1.62|
## llama-30B_mathematical_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 3.83|± | 0.20|
| | |f1 |13.91|± | 0.25|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 2.95|± | 0.49|
|math_asdiv | 0|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 4.01|± | 0.90|
|math_geometry | 1|acc | 1.46|± | 0.55|
|math_intermediate_algebra| 1|acc | 0.89|± | 0.31|
|math_num_theory | 1|acc | 2.96|± | 0.73|
|math_prealgebra | 1|acc | 4.13|± | 0.67|
|math_precalc | 1|acc | 1.83|± | 0.57|
|mathqa | 0|acc |30.59|± | 0.84|
| | |acc_norm|30.89|± | 0.85|
## llama-30B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 0.84|± | 0.09|
| | |f1 | 1.65|± | 0.10|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |34.74|± | 0.87|
| | |acc_norm|34.54|± | 0.87|
## llama-30B_mmlu_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|hendrycksTest-abstract_algebra | 0|acc |26.00|± | 4.41|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-anatomy | 0|acc |51.85|± | 4.32|
| | |acc_norm|40.74|± | 4.24|
|hendrycksTest-astronomy | 0|acc |57.24|± | 4.03|
| | |acc_norm|56.58|± | 4.03|
|hendrycksTest-business_ethics | 0|acc |67.00|± | 4.73|
| | |acc_norm|48.00|± | 5.02|
|hendrycksTest-clinical_knowledge | 0|acc |53.21|± | 3.07|
| | |acc_norm|46.42|± | 3.07|
|hendrycksTest-college_biology | 0|acc |61.11|± | 4.08|
| | |acc_norm|42.36|± | 4.13|
|hendrycksTest-college_chemistry | 0|acc |31.00|± | 4.65|
| | |acc_norm|32.00|± | 4.69|
|hendrycksTest-college_computer_science | 0|acc |43.00|± | 4.98|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-college_mathematics | 0|acc |37.00|± | 4.85|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-college_medicine | 0|acc |51.45|± | 3.81|
| | |acc_norm|43.35|± | 3.78|
|hendrycksTest-college_physics | 0|acc |23.53|± | 4.22|
| | |acc_norm|29.41|± | 4.53|
|hendrycksTest-computer_security | 0|acc |66.00|± | 4.76|
| | |acc_norm|58.00|± | 4.96|
|hendrycksTest-conceptual_physics | 0|acc |51.06|± | 3.27|
| | |acc_norm|32.77|± | 3.07|
|hendrycksTest-econometrics | 0|acc |35.09|± | 4.49|
| | |acc_norm|31.58|± | 4.37|
|hendrycksTest-electrical_engineering | 0|acc |51.72|± | 4.16|
| | |acc_norm|38.62|± | 4.06|
|hendrycksTest-elementary_mathematics | 0|acc |44.18|± | 2.56|
| | |acc_norm|37.04|± | 2.49|
|hendrycksTest-formal_logic | 0|acc |42.06|± | 4.42|
| | |acc_norm|39.68|± | 4.38|
|hendrycksTest-global_facts | 0|acc |47.00|± | 5.02|
| | |acc_norm|37.00|± | 4.85|
|hendrycksTest-high_school_biology | 0|acc |67.10|± | 2.67|
| | |acc_norm|54.52|± | 2.83|
|hendrycksTest-high_school_chemistry | 0|acc |39.90|± | 3.45|
| | |acc_norm|36.95|± | 3.40|
|hendrycksTest-high_school_computer_science | 0|acc |61.00|± | 4.90|
| | |acc_norm|47.00|± | 5.02|
|hendrycksTest-high_school_european_history | 0|acc |69.70|± | 3.59|
| | |acc_norm|56.36|± | 3.87|
|hendrycksTest-high_school_geography | 0|acc |75.76|± | 3.05|
| | |acc_norm|55.05|± | 3.54|
|hendrycksTest-high_school_government_and_politics| 0|acc |80.83|± | 2.84|
| | |acc_norm|61.14|± | 3.52|
|hendrycksTest-high_school_macroeconomics | 0|acc |51.54|± | 2.53|
| | |acc_norm|41.54|± | 2.50|
|hendrycksTest-high_school_mathematics | 0|acc |25.93|± | 2.67|
| | |acc_norm|31.48|± | 2.83|
|hendrycksTest-high_school_microeconomics | 0|acc |58.40|± | 3.20|
| | |acc_norm|48.32|± | 3.25|
|hendrycksTest-high_school_physics | 0|acc |31.79|± | 3.80|
| | |acc_norm|31.13|± | 3.78|
|hendrycksTest-high_school_psychology | 0|acc |77.06|± | 1.80|
| | |acc_norm|55.41|± | 2.13|
|hendrycksTest-high_school_statistics | 0|acc |43.52|± | 3.38|
| | |acc_norm|35.65|± | 3.27|
|hendrycksTest-high_school_us_history | 0|acc |72.06|± | 3.15|
| | |acc_norm|55.39|± | 3.49|
|hendrycksTest-high_school_world_history | 0|acc |69.62|± | 2.99|
| | |acc_norm|56.96|± | 3.22|
|hendrycksTest-human_aging | 0|acc |67.26|± | 3.15|
| | |acc_norm|36.32|± | 3.23|
|hendrycksTest-human_sexuality | 0|acc |70.23|± | 4.01|
| | |acc_norm|46.56|± | 4.37|
|hendrycksTest-international_law | 0|acc |70.25|± | 4.17|
| | |acc_norm|76.86|± | 3.85|
|hendrycksTest-jurisprudence | 0|acc |66.67|± | 4.56|
| | |acc_norm|55.56|± | 4.80|
|hendrycksTest-logical_fallacies | 0|acc |69.94|± | 3.60|
| | |acc_norm|53.99|± | 3.92|
|hendrycksTest-machine_learning | 0|acc |40.18|± | 4.65|
| | |acc_norm|30.36|± | 4.36|
|hendrycksTest-management | 0|acc |71.84|± | 4.45|
| | |acc_norm|55.34|± | 4.92|
|hendrycksTest-marketing | 0|acc |84.62|± | 2.36|
| | |acc_norm|76.50|± | 2.78|
|hendrycksTest-medical_genetics | 0|acc |60.00|± | 4.92|
| | |acc_norm|54.00|± | 5.01|
|hendrycksTest-miscellaneous | 0|acc |81.86|± | 1.38|
| | |acc_norm|61.43|± | 1.74|
|hendrycksTest-moral_disputes | 0|acc |61.85|± | 2.62|
| | |acc_norm|45.95|± | 2.68|
|hendrycksTest-moral_scenarios | 0|acc |34.30|± | 1.59|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |61.11|± | 2.79|
| | |acc_norm|50.33|± | 2.86|
|hendrycksTest-philosophy | 0|acc |67.52|± | 2.66|
| | |acc_norm|50.16|± | 2.84|
|hendrycksTest-prehistory | 0|acc |66.36|± | 2.63|
| | |acc_norm|42.90|± | 2.75|
|hendrycksTest-professional_accounting | 0|acc |39.72|± | 2.92|
| | |acc_norm|33.69|± | 2.82|
|hendrycksTest-professional_law | 0|acc |40.03|± | 1.25|
| | |acc_norm|34.35|± | 1.21|
|hendrycksTest-professional_medicine | 0|acc |55.51|± | 3.02|
| | |acc_norm|35.66|± | 2.91|
|hendrycksTest-professional_psychology | 0|acc |58.82|± | 1.99|
| | |acc_norm|43.30|± | 2.00|
|hendrycksTest-public_relations | 0|acc |64.55|± | 4.58|
| | |acc_norm|40.91|± | 4.71|
|hendrycksTest-security_studies | 0|acc |57.14|± | 3.17|
| | |acc_norm|40.41|± | 3.14|
|hendrycksTest-sociology | 0|acc |76.12|± | 3.01|
| | |acc_norm|66.17|± | 3.35|
|hendrycksTest-us_foreign_policy | 0|acc |79.00|± | 4.09|
| | |acc_norm|59.00|± | 4.94|
|hendrycksTest-virology | 0|acc |49.40|± | 3.89|
| | |acc_norm|34.34|± | 3.70|
|hendrycksTest-world_religions | 0|acc |81.29|± | 2.99|
| | |acc_norm|76.61|± | 3.25|
## llama-30B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |58.20|± | 1.10|
|pawsx_en| 0|acc |58.75|± | 1.10|
|pawsx_es| 0|acc |55.80|± | 1.11|
|pawsx_fr| 0|acc |52.85|± | 1.12|
|pawsx_ja| 0|acc |46.75|± | 1.12|
|pawsx_ko| 0|acc |45.70|± | 1.11|
|pawsx_zh| 0|acc |45.90|± | 1.11|
## llama-30B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 47.2|± | 2.23|
|xcopa_ht| 0|acc | 51.8|± | 2.24|
|xcopa_id| 0|acc | 60.6|± | 2.19|
|xcopa_it| 0|acc | 71.4|± | 2.02|
|xcopa_qu| 0|acc | 49.4|± | 2.24|
|xcopa_sw| 0|acc | 52.4|± | 2.24|
|xcopa_ta| 0|acc | 53.2|± | 2.23|
|xcopa_th| 0|acc | 54.6|± | 2.23|
|xcopa_tr| 0|acc | 52.2|± | 2.24|
|xcopa_vi| 0|acc | 52.4|± | 2.24|
|xcopa_zh| 0|acc | 62.2|± | 2.17|
## llama-30B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |34.49|± | 0.67|
|xnli_bg| 0|acc |38.52|± | 0.69|
|xnli_de| 0|acc |43.87|± | 0.70|
|xnli_el| 0|acc |34.91|± | 0.67|
|xnli_en| 0|acc |48.18|± | 0.71|
|xnli_es| 0|acc |40.24|± | 0.69|
|xnli_fr| 0|acc |42.95|± | 0.70|
|xnli_hi| 0|acc |36.47|± | 0.68|
|xnli_ru| 0|acc |38.12|± | 0.69|
|xnli_sw| 0|acc |34.09|± | 0.67|
|xnli_th| 0|acc |33.97|± | 0.67|
|xnli_tr| 0|acc |36.53|± | 0.68|
|xnli_ur| 0|acc |34.31|± | 0.67|
|xnli_vi| 0|acc |35.67|± | 0.68|
|xnli_zh| 0|acc |33.51|± | 0.67|
## llama-30B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |50.89|± | 1.29|
|xstory_cloze_en| 0|acc |78.16|± | 1.06|
|xstory_cloze_es| 0|acc |70.81|± | 1.17|
|xstory_cloze_eu| 0|acc |51.36|± | 1.29|
|xstory_cloze_hi| 0|acc |56.65|± | 1.28|
|xstory_cloze_id| 0|acc |59.23|± | 1.26|
|xstory_cloze_my| 0|acc |48.78|± | 1.29|
|xstory_cloze_ru| 0|acc |66.71|± | 1.21|
|xstory_cloze_sw| 0|acc |50.63|± | 1.29|
|xstory_cloze_te| 0|acc |53.21|± | 1.28|
|xstory_cloze_zh| 0|acc |58.57|± | 1.27|
## llama-30B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |87.40|± | 0.69|
|xwinograd_fr| 0|acc |73.49|± | 4.87|
|xwinograd_jp| 0|acc |67.36|± | 1.51|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |66.98|± | 2.65|
|xwinograd_zh| 0|acc |71.23|± | 2.02|
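The tables above use the harness's usual markdown layout (task, version, metric, value, ±, stderr), with values shown as percentages. A minimal sketch of how a raw results JSON like the ones in this commit could be rendered into that layout (an illustrative re-implementation, not the harness's own formatter; the path is hypothetical):

```python
import json

def to_markdown(path: str) -> str:
    """Render a results JSON into a task/version/metric/value/stderr table."""
    with open(path) as f:
        data = json.load(f)
    rows = ["| Task |Version|Metric|Value| |Stderr|",
            "|------|------:|------|----:|---|-----:|"]
    for task, metrics in sorted(data["results"].items()):
        version = data["versions"].get(task, "")
        for name, value in metrics.items():
            if name.endswith("_stderr"):
                continue
            stderr = metrics.get(f"{name}_stderr")
            # Values are reported as percentages in the tables above.
            row = f"|{task}|{version}|{name}|{100 * value:.2f}|"
            row += f"± |{100 * stderr:.2f}|" if stderr is not None else " | |"
            rows.append(row)
    return "\n".join(rows)

# Hypothetical path; any of the results files in this commit would work.
print(to_markdown("llama-30B_xwinograd_0-shot.json"))
```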
{
"results": {
"bigbench_hyperbaton": {
"multiple_choice_grade": 0.51524,
"multiple_choice_grade_stderr": 0.0022350513992069
},
"bigbench_salient_translation_error_detection": {
"multiple_choice_grade": 0.19839679358717435,
"multiple_choice_grade_stderr": 0.012629887094728112
},
"bigbench_geometric_shapes": {
"multiple_choice_grade": 0.2785515320334262,
"multiple_choice_grade_stderr": 0.023692665345206258,
"exact_str_match": 0.0,
"exact_str_match_stderr": 0.0
},
"bigbench_navigate": {
"multiple_choice_grade": 0.49,
"multiple_choice_grade_stderr": 0.015816135752773193
},
"bigbench_date_understanding": {
"multiple_choice_grade": 0.6991869918699187,
"multiple_choice_grade_stderr": 0.023906779002093273
},
"bigbench_disambiguation_qa": {
"multiple_choice_grade": 0.5426356589147286,
"multiple_choice_grade_stderr": 0.031075544990472662
},
"bigbench_tracking_shuffled_objects_three_objects": {
"multiple_choice_grade": 0.53,
"multiple_choice_grade_stderr": 0.02886365132641709
},
"bigbench_dyck_languages": {
"multiple_choice_grade": 0.212,
"multiple_choice_grade_stderr": 0.01293148186493804
},
"bigbench_formal_fallacies_syllogisms_negation": {
"multiple_choice_grade": 0.5058450704225352,
"multiple_choice_grade_stderr": 0.004195767817554208
},
"bigbench_tracking_shuffled_objects_seven_objects": {
"multiple_choice_grade": 0.15485714285714286,
"multiple_choice_grade_stderr": 0.00865039181414196
},
"bigbench_causal_judgement": {
"multiple_choice_grade": 0.5736842105263158,
"multiple_choice_grade_stderr": 0.03597255252302466
},
"bigbench_movie_recommendation": {
"multiple_choice_grade": 0.632,
"multiple_choice_grade_stderr": 0.02158898256835354
},
"bigbench_tracking_shuffled_objects_five_objects": {
"multiple_choice_grade": 0.2128,
"multiple_choice_grade_stderr": 0.01158102863217863
},
"bigbench_snarks": {
"multiple_choice_grade": 0.4696132596685083,
"multiple_choice_grade_stderr": 0.03719891321680327
},
"bigbench_sports_understanding": {
"multiple_choice_grade": 0.6237322515212982,
"multiple_choice_grade_stderr": 0.01543581207286162
},
"bigbench_logical_deduction_seven_objects": {
"multiple_choice_grade": 0.25285714285714284,
"multiple_choice_grade_stderr": 0.01643996352811702
},
"bigbench_temporal_sequences": {
"multiple_choice_grade": 0.146,
"multiple_choice_grade_stderr": 0.011171786285496496
},
"bigbench_logical_deduction_five_objects": {
"multiple_choice_grade": 0.368,
"multiple_choice_grade_stderr": 0.021588982568353548
},
"bigbench_ruin_names": {
"multiple_choice_grade": 0.39732142857142855,
"multiple_choice_grade_stderr": 0.023145155753004788
},
"bigbench_logical_deduction_three_objects": {
"multiple_choice_grade": 0.53,
"multiple_choice_grade_stderr": 0.02886365132641709
},
"bigbench_reasoning_about_colored_objects": {
"multiple_choice_grade": 0.5565,
"multiple_choice_grade_stderr": 0.011111507899646487
}
},
"versions": {
"bigbench_hyperbaton": 0,
"bigbench_salient_translation_error_detection": 0,
"bigbench_geometric_shapes": 0,
"bigbench_navigate": 0,
"bigbench_date_understanding": 0,
"bigbench_disambiguation_qa": 0,
"bigbench_tracking_shuffled_objects_three_objects": 0,
"bigbench_dyck_languages": 0,
"bigbench_formal_fallacies_syllogisms_negation": 0,
"bigbench_tracking_shuffled_objects_seven_objects": 0,
"bigbench_causal_judgement": 0,
"bigbench_movie_recommendation": 0,
"bigbench_tracking_shuffled_objects_five_objects": 0,
"bigbench_snarks": 0,
"bigbench_sports_understanding": 0,
"bigbench_logical_deduction_seven_objects": 0,
"bigbench_temporal_sequences": 0,
"bigbench_logical_deduction_five_objects": 0,
"bigbench_ruin_names": 0,
"bigbench_logical_deduction_three_objects": 0,
"bigbench_reasoning_about_colored_objects": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
"num_fewshot": 3,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"hellaswag": {
"acc": 0.6264688309101772,
"acc_stderr": 0.00482752658488968,
"acc_norm": 0.7923720374427405,
"acc_norm_stderr": 0.00404779964623464
},
"copa": {
"acc": 0.9,
"acc_stderr": 0.030151134457776348
},
"prost": {
"acc": 0.2598740392826644,
"acc_stderr": 0.003204110008963041,
"acc_norm": 0.2910973526900085,
"acc_norm_stderr": 0.003318834364612203
},
"boolq": {
"acc": 0.6840978593272171,
"acc_stderr": 0.008130700051380873
},
"mc_taco": {
"em": 0.11411411411411411,
"f1": 0.48361974757894227
},
"winogrande": {
"acc": 0.7277032359905288,
"acc_stderr": 0.012510697991453936
},
"arc_challenge": {
"acc": 0.46757679180887374,
"acc_stderr": 0.014580637569995423,
"acc_norm": 0.454778156996587,
"acc_norm_stderr": 0.014551507060836352
},
"wsc273": {
"acc": 0.8681318681318682,
"acc_stderr": 0.020515321360773595
},
"openbookqa": {
"acc": 0.294,
"acc_stderr": 0.020395095484936603,
"acc_norm": 0.42,
"acc_norm_stderr": 0.02209471322976178
},
"swag": {
"acc": 0.5861241627511746,
"acc_stderr": 0.0034822550028030703,
"acc_norm": 0.7036389083275018,
"acc_norm_stderr": 0.0032286148364766096
},
"arc_easy": {
"acc": 0.7533670033670034,
"acc_stderr": 0.008844984581934908,
"acc_norm": 0.5896464646464646,
"acc_norm_stderr": 0.01009353125576545
},
"piqa": {
"acc": 0.809575625680087,
"acc_stderr": 0.009160842206469637,
"acc_norm": 0.8008705114254625,
"acc_norm_stderr": 0.009317391893706834
}
},
"versions": {
"hellaswag": 0,
"copa": 0,
"prost": 0,
"boolq": 1,
"mc_taco": 0,
"winogrande": 0,
"arc_challenge": 0,
"wsc273": 0,
"openbookqa": 0,
"swag": 0,
"arc_easy": 0,
"piqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.30477634571645185,
"acc_stderr": 0.012679297549515422
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"crows_pairs_french_disability": {
"likelihood_difference": 10.202651515151516,
"likelihood_difference_stderr": 1.261700816634343,
"pct_stereotype": 0.5606060606060606,
"pct_stereotype_stderr": 0.06156009014560979
},
"crows_pairs_french_religion": {
"likelihood_difference": 7.940217391304348,
"likelihood_difference_stderr": 0.938898141048901,
"pct_stereotype": 0.5652173913043478,
"pct_stereotype_stderr": 0.04642922286356427
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 9.890796703296703,
"likelihood_difference_stderr": 1.6112974194891465,
"pct_stereotype": 0.7142857142857143,
"pct_stereotype_stderr": 0.04761904761904758
},
"toxigen": {
"acc": 0.42659574468085104,
"acc_stderr": 0.01614008877637632,
"acc_norm": 0.4319148936170213,
"acc_norm_stderr": 0.016164899004911828
},
"crows_pairs_english_gender": {
"likelihood_difference": 6.6615234375,
"likelihood_difference_stderr": 0.4284975339207996,
"pct_stereotype": 0.540625,
"pct_stereotype_stderr": 0.02790206840430007
},
"crows_pairs_english_age": {
"likelihood_difference": 5.1291208791208796,
"likelihood_difference_stderr": 0.5813404620923356,
"pct_stereotype": 0.5824175824175825,
"pct_stereotype_stderr": 0.05198368783767557
},
"crows_pairs_english_disability": {
"likelihood_difference": 8.886538461538462,
"likelihood_difference_stderr": 1.0342476212707912,
"pct_stereotype": 0.5384615384615384,
"pct_stereotype_stderr": 0.06231481440776789
},
"crows_pairs_french_age": {
"likelihood_difference": 8.925,
"likelihood_difference_stderr": 1.01086298976785,
"pct_stereotype": 0.4,
"pct_stereotype_stderr": 0.05192907868894985
},
"ethics_utilitarianism": {
"acc": 0.5012479201331115,
"acc_stderr": 0.00721159934497283
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 5.401041666666667,
"likelihood_difference_stderr": 0.5913652974915496,
"pct_stereotype": 0.5277777777777778,
"pct_stereotype_stderr": 0.05924743948371486
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 8.312898596938776,
"likelihood_difference_stderr": 0.8737467813045966,
"pct_stereotype": 0.5255102040816326,
"pct_stereotype_stderr": 0.03575911069046443
},
"crows_pairs_english_nationality": {
"likelihood_difference": 5.872829861111111,
"likelihood_difference_stderr": 0.3994396285401545,
"pct_stereotype": 0.5324074074074074,
"pct_stereotype_stderr": 0.03402801581358966
},
"ethics_cm": {
"acc": 0.5750321750321751,
"acc_stderr": 0.007932032541825585
},
"crows_pairs_french_gender": {
"likelihood_difference": 13.732768691588785,
"likelihood_difference_stderr": 1.1030097530113459,
"pct_stereotype": 0.5015576323987538,
"pct_stereotype_stderr": 0.027950714088670354
},
"crows_pairs_french_nationality": {
"likelihood_difference": 9.851037549407115,
"likelihood_difference_stderr": 0.8908345552184256,
"pct_stereotype": 0.38735177865612647,
"pct_stereotype_stderr": 0.03068725875850367
},
"ethics_deontology": {
"acc": 0.5417130144605117,
"acc_stderr": 0.008310055982844088,
"em": 0.06117908787541713
},
"ethics_utilitarianism_original": {
"acc": 0.9396838602329451,
"acc_stderr": 0.0034337651785718414
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 7.547715053763441,
"likelihood_difference_stderr": 0.7682550004765589,
"pct_stereotype": 0.6344086021505376,
"pct_stereotype_stderr": 0.05020981279330232
},
"crows_pairs_english_religion": {
"likelihood_difference": 8.075731981981981,
"likelihood_difference_stderr": 0.9438303669276185,
"pct_stereotype": 0.6216216216216216,
"pct_stereotype_stderr": 0.04624128233851482
},
"ethics_justice": {
"acc": 0.5170118343195266,
"acc_stderr": 0.009611595027307154,
"em": 0.013313609467455622
},
"ethics_virtue": {
"acc": 0.5181909547738693,
"acc_stderr": 0.007084831046245509,
"em": 0.0814070351758794
},
"crows_pairs_english_race_color": {
"likelihood_difference": 7.68214812992126,
"likelihood_difference_stderr": 0.3913516470344277,
"pct_stereotype": 0.5610236220472441,
"pct_stereotype_stderr": 0.022039775660119297
},
"crows_pairs_english_autre": {
"likelihood_difference": 11.380681818181818,
"likelihood_difference_stderr": 3.487665507491904,
"pct_stereotype": 0.6363636363636364,
"pct_stereotype_stderr": 0.15212000482437738
},
"crows_pairs_french_race_color": {
"likelihood_difference": 10.611684782608696,
"likelihood_difference_stderr": 0.7668117638923473,
"pct_stereotype": 0.41739130434782606,
"pct_stereotype_stderr": 0.023017271312104015
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 8.217881944444445,
"likelihood_difference_stderr": 1.3267643213128657,
"pct_stereotype": 0.5694444444444444,
"pct_stereotype_stderr": 0.05876396677084613
},
"crows_pairs_french_autre": {
"likelihood_difference": 6.3798076923076925,
"likelihood_difference_stderr": 1.6568389364513447,
"pct_stereotype": 0.6153846153846154,
"pct_stereotype_stderr": 0.1404416814115811
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 7.303947368421053,
"likelihood_difference_stderr": 0.5463280290787818,
"pct_stereotype": 0.5368421052631579,
"pct_stereotype_stderr": 0.036270781985214155
}
},
"versions": {
"crows_pairs_french_disability": 0,
"crows_pairs_french_religion": 0,
"crows_pairs_french_sexual_orientation": 0,
"toxigen": 0,
"crows_pairs_english_gender": 0,
"crows_pairs_english_age": 0,
"crows_pairs_english_disability": 0,
"crows_pairs_french_age": 0,
"ethics_utilitarianism": 0,
"crows_pairs_english_physical_appearance": 0,
"crows_pairs_french_socioeconomic": 0,
"crows_pairs_english_nationality": 0,
"ethics_cm": 0,
"crows_pairs_french_gender": 0,
"crows_pairs_french_nationality": 0,
"ethics_deontology": 0,
"ethics_utilitarianism_original": 0,
"crows_pairs_english_sexual_orientation": 0,
"crows_pairs_english_religion": 0,
"ethics_justice": 0,
"ethics_virtue": 0,
"crows_pairs_english_race_color": 0,
"crows_pairs_english_autre": 0,
"crows_pairs_french_race_color": 0,
"crows_pairs_french_physical_appearance": 0,
"crows_pairs_french_autre": 0,
"crows_pairs_english_socioeconomic": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"math_prealgebra": {
"acc": 0.04133180252583238,
"acc_stderr": 0.006748646916387575
},
"drop": {
"em": 0.0382760067114094,
"em_stderr": 0.0019648445106113135,
"f1": 0.13911493288590454,
"f1_stderr": 0.0024846240125468515
},
"math_intermediate_algebra": {
"acc": 0.008859357696566999,
"acc_stderr": 0.0031200782932944743
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_num_theory": {
"acc": 0.02962962962962963,
"acc_stderr": 0.007303608618028771
},
"math_algebra": {
"acc": 0.02948609941027801,
"acc_stderr": 0.004912099985374022
},
"math_precalc": {
"acc": 0.018315018315018316,
"acc_stderr": 0.005743696731653661
},
"math_geometry": {
"acc": 0.014613778705636743,
"acc_stderr": 0.005488713443686309
},
"math_counting_and_prob": {
"acc": 0.04008438818565401,
"acc_stderr": 0.009019315660749231
},
"math_asdiv": {
"acc": 0.0,
"acc_stderr": 0.0
},
"mathqa": {
"acc": 0.30586264656616413,
"acc_stderr": 0.00843502782274867,
"acc_norm": 0.3088777219430486,
"acc_norm_stderr": 0.008458071062361336
}
},
"versions": {
"math_prealgebra": 1,
"drop": 1,
"math_intermediate_algebra": 1,
"gsm8k": 0,
"math_asdiv": 0,
"math_num_theory": 1,
"math_algebra": 1,
"math_precalc": 1,
"math_geometry": 1,
"math_counting_and_prob": 1,
"mathqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"math_prealgebra": {
"acc": 0.001148105625717566,
"acc_stderr": 0.001148105625717572
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_counting_and_prob": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_geometry": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"drop": {
"em": 0.008389261744966443,
"em_stderr": 0.0009340543216866975,
"f1": 0.016472315436241603,
"f1_stderr": 0.001049526866424092
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_num_theory": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"mathqa": {
"acc": 0.3474036850921273,
"acc_stderr": 0.008716459359487392,
"acc_norm": 0.34539363484087104,
"acc_norm_stderr": 0.008704580930350191
}
},
"versions": {
"math_prealgebra": 1,
"math_intermediate_algebra": 1,
"math_counting_and_prob": 1,
"math_geometry": 1,
"math_precalc": 1,
"drop": 1,
"mathqa": 0,
"gsm8k": 0,
"math_num_theory": 1,
"math_algebra": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"hendrycksTest-high_school_world_history": {
"acc": 0.6962025316455697,
"acc_stderr": 0.029936696387138598,
"acc_norm": 0.569620253164557,
"acc_norm_stderr": 0.032230171959375976
},
"hendrycksTest-formal_logic": {
"acc": 0.42063492063492064,
"acc_stderr": 0.04415438226743743,
"acc_norm": 0.3968253968253968,
"acc_norm_stderr": 0.043758884927270605
},
"hendrycksTest-human_aging": {
"acc": 0.672645739910314,
"acc_stderr": 0.03149384670994131,
"acc_norm": 0.3632286995515695,
"acc_norm_stderr": 0.032277904428505
},
"hendrycksTest-international_law": {
"acc": 0.7024793388429752,
"acc_stderr": 0.04173349148083499,
"acc_norm": 0.768595041322314,
"acc_norm_stderr": 0.03849856098794088
},
"hendrycksTest-security_studies": {
"acc": 0.5714285714285714,
"acc_stderr": 0.031680911612338825,
"acc_norm": 0.40408163265306124,
"acc_norm_stderr": 0.0314147080258659
},
"hendrycksTest-medical_genetics": {
"acc": 0.6,
"acc_stderr": 0.049236596391733084,
"acc_norm": 0.54,
"acc_norm_stderr": 0.05009082659620332
},
"hendrycksTest-econometrics": {
"acc": 0.3508771929824561,
"acc_stderr": 0.044895393502707,
"acc_norm": 0.3157894736842105,
"acc_norm_stderr": 0.043727482902780064
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.5153846153846153,
"acc_stderr": 0.025339003010106515,
"acc_norm": 0.4153846153846154,
"acc_norm_stderr": 0.024985354923102332
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.79,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.59,
"acc_norm_stderr": 0.049431107042371025
},
"hendrycksTest-logical_fallacies": {
"acc": 0.6993865030674846,
"acc_stderr": 0.03602511318806771,
"acc_norm": 0.5398773006134969,
"acc_norm_stderr": 0.039158572914369714
},
"hendrycksTest-prehistory": {
"acc": 0.6635802469135802,
"acc_stderr": 0.026289734945952926,
"acc_norm": 0.42901234567901236,
"acc_norm_stderr": 0.027538925613470867
},
"hendrycksTest-professional_psychology": {
"acc": 0.5882352941176471,
"acc_stderr": 0.019910377463105932,
"acc_norm": 0.43300653594771243,
"acc_norm_stderr": 0.02004544247332422
},
"hendrycksTest-professional_accounting": {
"acc": 0.3971631205673759,
"acc_stderr": 0.029189805673587105,
"acc_norm": 0.33687943262411346,
"acc_norm_stderr": 0.02819553487396673
},
"hendrycksTest-college_biology": {
"acc": 0.6111111111111112,
"acc_stderr": 0.04076663253918567,
"acc_norm": 0.4236111111111111,
"acc_norm_stderr": 0.04132125019723369
},
"hendrycksTest-high_school_biology": {
"acc": 0.6709677419354839,
"acc_stderr": 0.02672949906834996,
"acc_norm": 0.5451612903225806,
"acc_norm_stderr": 0.028327743091561074
},
"hendrycksTest-philosophy": {
"acc": 0.6752411575562701,
"acc_stderr": 0.02659678228769704,
"acc_norm": 0.5016077170418006,
"acc_norm_stderr": 0.02839794490780661
},
"hendrycksTest-high_school_european_history": {
"acc": 0.696969696969697,
"acc_stderr": 0.03588624800091707,
"acc_norm": 0.5636363636363636,
"acc_norm_stderr": 0.03872592983524754
},
"hendrycksTest-college_medicine": {
"acc": 0.5144508670520231,
"acc_stderr": 0.03810871630454764,
"acc_norm": 0.43352601156069365,
"acc_norm_stderr": 0.03778621079092055
},
"hendrycksTest-professional_medicine": {
"acc": 0.5551470588235294,
"acc_stderr": 0.03018753206032938,
"acc_norm": 0.35661764705882354,
"acc_norm_stderr": 0.02909720956841195
},
"hendrycksTest-moral_scenarios": {
"acc": 0.34301675977653634,
"acc_stderr": 0.015876912673057724,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.39901477832512317,
"acc_stderr": 0.03445487686264716,
"acc_norm": 0.3694581280788177,
"acc_norm_stderr": 0.03395970381998573
},
"hendrycksTest-high_school_physics": {
"acc": 0.31788079470198677,
"acc_stderr": 0.038020397601079024,
"acc_norm": 0.31125827814569534,
"acc_norm_stderr": 0.03780445850526733
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.8082901554404145,
"acc_stderr": 0.028408953626245282,
"acc_norm": 0.6113989637305699,
"acc_norm_stderr": 0.03517739796373132
},
"hendrycksTest-high_school_geography": {
"acc": 0.7575757575757576,
"acc_stderr": 0.030532892233932026,
"acc_norm": 0.5505050505050505,
"acc_norm_stderr": 0.0354413249194797
},
"hendrycksTest-global_facts": {
"acc": 0.47,
"acc_stderr": 0.05016135580465919,
"acc_norm": 0.37,
"acc_norm_stderr": 0.04852365870939099
},
"hendrycksTest-professional_law": {
"acc": 0.4002607561929596,
"acc_stderr": 0.012513582529136213,
"acc_norm": 0.3435462842242503,
"acc_norm_stderr": 0.012128961174190158
},
"hendrycksTest-college_mathematics": {
"acc": 0.37,
"acc_stderr": 0.048523658709391,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-college_physics": {
"acc": 0.23529411764705882,
"acc_stderr": 0.04220773659171452,
"acc_norm": 0.29411764705882354,
"acc_norm_stderr": 0.04533838195929774
},
"hendrycksTest-high_school_statistics": {
"acc": 0.4351851851851852,
"acc_stderr": 0.03381200005643525,
"acc_norm": 0.35648148148148145,
"acc_norm_stderr": 0.032664783315272714
},
"hendrycksTest-machine_learning": {
"acc": 0.4017857142857143,
"acc_stderr": 0.04653333146973646,
"acc_norm": 0.30357142857142855,
"acc_norm_stderr": 0.04364226155841044
},
"hendrycksTest-public_relations": {
"acc": 0.6454545454545455,
"acc_stderr": 0.045820048415054174,
"acc_norm": 0.4090909090909091,
"acc_norm_stderr": 0.047093069786618966
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.61,
"acc_stderr": 0.04902071300001974,
"acc_norm": 0.47,
"acc_norm_stderr": 0.05016135580465919
},
"hendrycksTest-high_school_psychology": {
"acc": 0.7706422018348624,
"acc_stderr": 0.018025349724618684,
"acc_norm": 0.5541284403669725,
"acc_norm_stderr": 0.021311335009708582
},
"hendrycksTest-virology": {
"acc": 0.4939759036144578,
"acc_stderr": 0.03892212195333045,
"acc_norm": 0.3433734939759036,
"acc_norm_stderr": 0.03696584317010601
},
"hendrycksTest-marketing": {
"acc": 0.8461538461538461,
"acc_stderr": 0.023636873317489294,
"acc_norm": 0.7649572649572649,
"acc_norm_stderr": 0.027778835904935437
},
"hendrycksTest-human_sexuality": {
"acc": 0.7022900763358778,
"acc_stderr": 0.04010358942462203,
"acc_norm": 0.46564885496183206,
"acc_norm_stderr": 0.04374928560599738
},
"hendrycksTest-sociology": {
"acc": 0.7611940298507462,
"acc_stderr": 0.03014777593540922,
"acc_norm": 0.6616915422885572,
"acc_norm_stderr": 0.033455630703391914
},
"hendrycksTest-college_computer_science": {
"acc": 0.43,
"acc_stderr": 0.049756985195624284,
"acc_norm": 0.34,
"acc_norm_stderr": 0.04760952285695236
},
"hendrycksTest-conceptual_physics": {
"acc": 0.5106382978723404,
"acc_stderr": 0.03267862331014063,
"acc_norm": 0.3276595744680851,
"acc_norm_stderr": 0.030683020843231004
},
"hendrycksTest-anatomy": {
"acc": 0.5185185185185185,
"acc_stderr": 0.043163785995113245,
"acc_norm": 0.4074074074074074,
"acc_norm_stderr": 0.04244633238353228
},
"hendrycksTest-miscellaneous": {
"acc": 0.8186462324393359,
"acc_stderr": 0.013778693778464062,
"acc_norm": 0.6143039591315453,
"acc_norm_stderr": 0.017406476619212907
},
"hendrycksTest-jurisprudence": {
"acc": 0.6666666666666666,
"acc_stderr": 0.04557239513497751,
"acc_norm": 0.5555555555555556,
"acc_norm_stderr": 0.04803752235190193
},
"hendrycksTest-moral_disputes": {
"acc": 0.6184971098265896,
"acc_stderr": 0.026152198619726792,
"acc_norm": 0.4595375722543353,
"acc_norm_stderr": 0.026830805998952236
},
"hendrycksTest-high_school_us_history": {
"acc": 0.7205882352941176,
"acc_stderr": 0.031493281045079556,
"acc_norm": 0.553921568627451,
"acc_norm_stderr": 0.03488845451304974
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.25925925925925924,
"acc_stderr": 0.026719240783712177,
"acc_norm": 0.3148148148148148,
"acc_norm_stderr": 0.02831753349606648
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.5840336134453782,
"acc_stderr": 0.032016501007396114,
"acc_norm": 0.4831932773109244,
"acc_norm_stderr": 0.03246013680375308
},
"hendrycksTest-astronomy": {
"acc": 0.5723684210526315,
"acc_stderr": 0.04026097083296564,
"acc_norm": 0.5657894736842105,
"acc_norm_stderr": 0.04033565667848319
},
"hendrycksTest-world_religions": {
"acc": 0.8128654970760234,
"acc_stderr": 0.029913127232368043,
"acc_norm": 0.7660818713450293,
"acc_norm_stderr": 0.03246721765117825
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.5320754716981132,
"acc_stderr": 0.03070948699255654,
"acc_norm": 0.4641509433962264,
"acc_norm_stderr": 0.030693675018458003
},
"hendrycksTest-college_chemistry": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"hendrycksTest-abstract_algebra": {
"acc": 0.26,
"acc_stderr": 0.04408440022768078,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"hendrycksTest-business_ethics": {
"acc": 0.67,
"acc_stderr": 0.04725815626252609,
"acc_norm": 0.48,
"acc_norm_stderr": 0.050211673156867795
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.4417989417989418,
"acc_stderr": 0.02557625706125384,
"acc_norm": 0.37037037037037035,
"acc_norm_stderr": 0.024870815251057075
},
"hendrycksTest-management": {
"acc": 0.7184466019417476,
"acc_stderr": 0.044532548363264673,
"acc_norm": 0.5533980582524272,
"acc_norm_stderr": 0.04922424153458933
},
"hendrycksTest-electrical_engineering": {
"acc": 0.5172413793103449,
"acc_stderr": 0.04164188720169375,
"acc_norm": 0.38620689655172413,
"acc_norm_stderr": 0.040573247344190336
},
"hendrycksTest-nutrition": {
"acc": 0.6111111111111112,
"acc_stderr": 0.02791405551046801,
"acc_norm": 0.5032679738562091,
"acc_norm_stderr": 0.028629305194003543
},
"hendrycksTest-computer_security": {
"acc": 0.66,
"acc_stderr": 0.04760952285695237,
"acc_norm": 0.58,
"acc_norm_stderr": 0.049604496374885836
}
},
"versions": {
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-management": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-computer_security": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}