{ "results": { "hendrycksTest-nutrition": { "acc": 0.29411764705882354, "acc_stderr": 0.02609016250427904, "acc_norm": 0.369281045751634, "acc_norm_stderr": 0.02763417668960266 }, "hendrycksTest-clinical_knowledge": { "acc": 0.24150943396226415, "acc_stderr": 0.02634148037111837, "acc_norm": 0.3018867924528302, "acc_norm_stderr": 0.02825420034443865 }, "hendrycksTest-high_school_us_history": { "acc": 0.23039215686274508, "acc_stderr": 0.02955429260569507, "acc_norm": 0.27941176470588236, "acc_norm_stderr": 0.031493281045079556 }, "hendrycksTest-public_relations": { "acc": 0.2727272727272727, "acc_stderr": 0.04265792110940588, "acc_norm": 0.2, "acc_norm_stderr": 0.03831305140884601 }, "hendrycksTest-high_school_european_history": { "acc": 0.2545454545454545, "acc_stderr": 0.0340150671524904, "acc_norm": 0.296969696969697, "acc_norm_stderr": 0.03567969772268047 }, "hendrycksTest-moral_scenarios": { "acc": 0.27262569832402234, "acc_stderr": 0.014893391735249588, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588 }, "hendrycksTest-anatomy": { "acc": 0.26666666666666666, "acc_stderr": 0.038201699145179055, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506 }, "hendrycksTest-international_law": { "acc": 0.2809917355371901, "acc_stderr": 0.04103203830514512, "acc_norm": 0.4380165289256198, "acc_norm_stderr": 0.045291468044357915 }, "hendrycksTest-management": { "acc": 0.3300970873786408, "acc_stderr": 0.0465614711001235, "acc_norm": 0.32038834951456313, "acc_norm_stderr": 0.0462028408228004 }, "winogrande": { "acc": 0.6519337016574586, "acc_stderr": 0.013388004531086047 }, "hendrycksTest-college_mathematics": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "wsc": { "acc": 0.6057692307692307, "acc_stderr": 0.04815154775990711 }, "hendrycksTest-high_school_chemistry": { "acc": 0.23645320197044334, "acc_stderr": 0.02989611429173355, "acc_norm": 0.3251231527093596, "acc_norm_stderr": 0.03295797566311271 }, "hendrycksTest-college_biology": { "acc": 0.20833333333333334, "acc_stderr": 0.03396116205845334, "acc_norm": 0.2152777777777778, "acc_norm_stderr": 0.034370793441061344 }, "hendrycksTest-college_chemistry": { "acc": 0.24, "acc_stderr": 0.042923469599092816, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605 }, "hendrycksTest-human_aging": { "acc": 0.2914798206278027, "acc_stderr": 0.030500283176545906, "acc_norm": 0.24663677130044842, "acc_norm_stderr": 0.028930413120910877 }, "hendrycksTest-computer_security": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "hendrycksTest-high_school_computer_science": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "hendrycksTest-electrical_engineering": { "acc": 0.31724137931034485, "acc_stderr": 0.038783523721386215, "acc_norm": 0.31724137931034485, "acc_norm_stderr": 0.038783523721386215 }, "lambada_openai": { "ppl": 4.037871980255364, "ppl_stderr": 0.08695582970269694, "acc": 0.6863962740151368, "acc_stderr": 0.006463833164285201 }, "hendrycksTest-high_school_statistics": { "acc": 0.2962962962962963, "acc_stderr": 0.03114144782353603, "acc_norm": 0.2916666666666667, "acc_norm_stderr": 0.03099866630456052 }, "hendrycksTest-miscellaneous": { "acc": 0.3231162196679438, "acc_stderr": 0.016723726512343048, "acc_norm": 0.28991060025542786, "acc_norm_stderr": 0.01622501794477096 }, "hendrycksTest-human_sexuality": { "acc": 0.32061068702290074, "acc_stderr": 0.04093329229834278, "acc_norm": 0.31297709923664124, "acc_norm_stderr": 0.04066962905677697 }, "hendrycksTest-global_facts": { "acc": 0.24, "acc_stderr": 0.042923469599092816, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, "hendrycksTest-marketing": { "acc": 0.2905982905982906, "acc_stderr": 0.029745048572674047, "acc_norm": 0.29914529914529914, "acc_norm_stderr": 0.029996951858349483 }, "hendrycksTest-college_computer_science": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.27, "acc_norm_stderr": 0.04461960433384741 }, "hendrycksTest-professional_accounting": { "acc": 0.2730496453900709, "acc_stderr": 0.026577860943307847, "acc_norm": 0.28368794326241137, "acc_norm_stderr": 0.026891709428343957 }, "hendrycksTest-college_physics": { "acc": 0.20588235294117646, "acc_stderr": 0.040233822736177455, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.042207736591714534 }, "hendrycksTest-professional_law": { "acc": 0.2711864406779661, "acc_stderr": 0.011354581451622985, "acc_norm": 0.28096479791395046, "acc_norm_stderr": 0.011479684550077683 }, "hendrycksTest-elementary_mathematics": { "acc": 0.24074074074074073, "acc_stderr": 0.022019080012217904, "acc_norm": 0.2671957671957672, "acc_norm_stderr": 0.022789673145776578 }, "hendrycksTest-medical_genetics": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "hendrycksTest-sociology": { "acc": 0.2835820895522388, "acc_stderr": 0.03187187537919797, "acc_norm": 0.2835820895522388, "acc_norm_stderr": 0.031871875379197986 }, "hendrycksTest-security_studies": { "acc": 0.3673469387755102, "acc_stderr": 0.03086214492108756, "acc_norm": 0.2897959183673469, "acc_norm_stderr": 0.029043088683304328 }, "hendrycksTest-abstract_algebra": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909284 }, "hendrycksTest-professional_psychology": { "acc": 0.2696078431372549, "acc_stderr": 0.017952449196987862, "acc_norm": 0.2696078431372549, "acc_norm_stderr": 0.017952449196987862 }, "hendrycksTest-high_school_world_history": { "acc": 0.25738396624472576, "acc_stderr": 0.02845882099146029, "acc_norm": 0.29535864978902954, "acc_norm_stderr": 0.029696338713422882 }, "hendrycksTest-jurisprudence": { "acc": 0.2962962962962963, "acc_stderr": 0.044143436668549335, "acc_norm": 0.4351851851851852, "acc_norm_stderr": 0.04792898170907062 }, "hendrycksTest-machine_learning": { "acc": 0.2767857142857143, "acc_stderr": 0.042466243366976256, "acc_norm": 0.24107142857142858, "acc_norm_stderr": 0.04059867246952688 }, "hendrycksTest-world_religions": { "acc": 0.3391812865497076, "acc_stderr": 0.036310534964889056, "acc_norm": 0.39766081871345027, "acc_norm_stderr": 0.0375363895576169 }, "sciq": { "acc": 0.908, "acc_stderr": 0.009144376393151103, "acc_norm": 0.866, "acc_norm_stderr": 0.01077776229836969 }, "piqa": { "acc": 0.7595212187159956, "acc_stderr": 0.009971345364651073, "acc_norm": 0.7682263329706203, "acc_norm_stderr": 0.009845143772794041 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2948717948717949, "acc_stderr": 0.023119362758232294, "acc_norm": 0.2948717948717949, "acc_norm_stderr": 0.02311936275823228 }, "hendrycksTest-professional_medicine": { "acc": 0.2647058823529412, "acc_stderr": 0.026799562024887657, "acc_norm": 0.26838235294117646, "acc_norm_stderr": 0.026917481224377215 }, "arc_challenge": { "acc": 0.3293515358361775, "acc_stderr": 0.013734057652635474, "acc_norm": 0.3575085324232082, "acc_norm_stderr": 0.01400549427591657 }, "hendrycksTest-high_school_physics": { "acc": 0.24503311258278146, "acc_stderr": 0.035118075718047245, "acc_norm": 0.2251655629139073, "acc_norm_stderr": 0.03410435282008937 }, "hendrycksTest-college_medicine": { "acc": 0.2543352601156069, "acc_stderr": 0.0332055644308557, "acc_norm": 0.2254335260115607, "acc_norm_stderr": 0.03186209851641143 }, "hendrycksTest-high_school_biology": { "acc": 0.25161290322580643, "acc_stderr": 0.024685979286239963, "acc_norm": 0.2838709677419355, "acc_norm_stderr": 0.025649381063029258 }, "hendrycksTest-prehistory": { "acc": 0.25308641975308643, "acc_stderr": 0.024191808600713002, "acc_norm": 0.2037037037037037, "acc_norm_stderr": 0.022409674547304168 }, "hendrycksTest-high_school_geography": { "acc": 0.2878787878787879, "acc_stderr": 0.03225883512300992, "acc_norm": 0.30303030303030304, "acc_norm_stderr": 0.03274287914026867 }, "hendrycksTest-conceptual_physics": { "acc": 0.2425531914893617, "acc_stderr": 0.02802022627120022, "acc_norm": 0.2, "acc_norm_stderr": 0.026148818018424506 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.29533678756476683, "acc_stderr": 0.03292296639155137, "acc_norm": 0.2538860103626943, "acc_norm_stderr": 0.03141024780565318 }, "hendrycksTest-philosophy": { "acc": 0.2540192926045016, "acc_stderr": 0.02472386150477169, "acc_norm": 0.3183279742765273, "acc_norm_stderr": 0.026457225067811025 }, "arc_easy": { "acc": 0.6708754208754208, "acc_stderr": 0.009642048058060987, "acc_norm": 0.6178451178451179, "acc_norm_stderr": 0.009970747281292424 }, "hendrycksTest-high_school_psychology": { "acc": 0.27522935779816515, "acc_stderr": 0.0191490937431552, "acc_norm": 0.24403669724770644, "acc_norm_stderr": 0.018415286351416416 }, "hendrycksTest-moral_disputes": { "acc": 0.30346820809248554, "acc_stderr": 0.024752411960917212, "acc_norm": 0.3092485549132948, "acc_norm_stderr": 0.02488314057007176 }, "hendrycksTest-logical_fallacies": { "acc": 0.31901840490797545, "acc_stderr": 0.03661997551073836, "acc_norm": 0.3006134969325153, "acc_norm_stderr": 0.03602511318806771 }, "hendrycksTest-econometrics": { "acc": 0.3508771929824561, "acc_stderr": 0.044895393502707, "acc_norm": 0.2982456140350877, "acc_norm_stderr": 0.04303684033537315 }, "hendrycksTest-astronomy": { "acc": 0.3157894736842105, "acc_stderr": 0.0378272898086547, "acc_norm": 0.3815789473684211, "acc_norm_stderr": 0.039531733777491945 }, "hendrycksTest-us_foreign_policy": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-virology": { "acc": 0.3373493975903614, "acc_stderr": 0.03680783690727581, "acc_norm": 0.2469879518072289, "acc_norm_stderr": 0.03357351982064536 }, "hendrycksTest-formal_logic": { "acc": 0.2857142857142857, "acc_stderr": 0.04040610178208841, "acc_norm": 0.23809523809523808, "acc_norm_stderr": 0.038095238095238106 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.2605042016806723, "acc_stderr": 0.02851025151234193, "acc_norm": 0.3277310924369748, "acc_norm_stderr": 0.030489911417673227 }, "hendrycksTest-high_school_mathematics": { "acc": 0.2740740740740741, "acc_stderr": 0.027195934804085626, "acc_norm": 0.3, "acc_norm_stderr": 0.027940457136228412 }, "logiqa": { "acc": 0.22734254992319508, "acc_stderr": 0.01643906767511774, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.017719247798458293 }, "hendrycksTest-business_ethics": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 } }, "versions": { "hendrycksTest-nutrition": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-international_law": 0, "hendrycksTest-management": 0, "winogrande": 0, "hendrycksTest-college_mathematics": 0, "wsc": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-electrical_engineering": 0, "lambada_openai": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-marketing": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-sociology": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-world_religions": 0, "sciq": 0, "piqa": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-professional_medicine": 0, "arc_challenge": 0, "hendrycksTest-high_school_physics": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-philosophy": 0, "arc_easy": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-virology": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-high_school_mathematics": 0, "logiqa": 0, "hendrycksTest-business_ethics": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=facebook/opt-13b,use_accelerate=True,device_map_option=sequential", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }