# opt-1.3b ## opt-1.3b.json | Task |Version| Metric |Value| |Stderr| |-------------------------------------------------|------:|--------|----:|---|-----:| |arc_challenge | 0|acc |23.12|± | 1.23| | | |acc_norm|29.44|± | 1.33| |arc_easy | 0|acc |57.03|± | 1.02| | | |acc_norm|50.93|± | 1.03| |hendrycksTest-abstract_algebra | 0|acc |26.00|± | 4.41| | | |acc_norm|25.00|± | 4.35| |hendrycksTest-anatomy | 0|acc |22.96|± | 3.63| | | |acc_norm|21.48|± | 3.55| |hendrycksTest-astronomy | 0|acc |23.68|± | 3.46| | | |acc_norm|34.21|± | 3.86| |hendrycksTest-business_ethics | 0|acc |34.00|± | 4.76| | | |acc_norm|33.00|± | 4.73| |hendrycksTest-clinical_knowledge | 0|acc |19.25|± | 2.43| | | |acc_norm|25.28|± | 2.67| |hendrycksTest-college_biology | 0|acc |26.39|± | 3.69| | | |acc_norm|27.78|± | 3.75| |hendrycksTest-college_chemistry | 0|acc |28.00|± | 4.51| | | |acc_norm|34.00|± | 4.76| |hendrycksTest-college_computer_science | 0|acc |33.00|± | 4.73| | | |acc_norm|30.00|± | 4.61| |hendrycksTest-college_mathematics | 0|acc |19.00|± | 3.94| | | |acc_norm|30.00|± | 4.61| |hendrycksTest-college_medicine | 0|acc |17.92|± | 2.92| | | |acc_norm|23.70|± | 3.24| |hendrycksTest-college_physics | 0|acc |27.45|± | 4.44| | | |acc_norm|30.39|± | 4.58| |hendrycksTest-computer_security | 0|acc |29.00|± | 4.56| | | |acc_norm|38.00|± | 4.88| |hendrycksTest-conceptual_physics | 0|acc |21.70|± | 2.69| | | |acc_norm|20.43|± | 2.64| |hendrycksTest-econometrics | 0|acc |24.56|± | 4.05| | | |acc_norm|23.68|± | 4.00| |hendrycksTest-electrical_engineering | 0|acc |25.52|± | 3.63| | | |acc_norm|28.97|± | 3.78| |hendrycksTest-elementary_mathematics | 0|acc |19.58|± | 2.04| | | |acc_norm|24.87|± | 2.23| |hendrycksTest-formal_logic | 0|acc |29.37|± | 4.07| | | |acc_norm|26.98|± | 3.97| |hendrycksTest-global_facts | 0|acc |16.00|± | 3.68| | | |acc_norm|18.00|± | 3.86| |hendrycksTest-high_school_biology | 0|acc |20.97|± | 2.32| | | |acc_norm|26.77|± | 2.52| |hendrycksTest-high_school_chemistry | 0|acc |24.63|± | 3.03| | | |acc_norm|30.54|± | 3.24| |hendrycksTest-high_school_computer_science | 0|acc |27.00|± | 4.46| | | |acc_norm|32.00|± | 4.69| |hendrycksTest-high_school_european_history | 0|acc |24.24|± | 3.35| | | |acc_norm|27.27|± | 3.48| |hendrycksTest-high_school_geography | 0|acc |22.22|± | 2.96| | | |acc_norm|28.28|± | 3.21| |hendrycksTest-high_school_government_and_politics| 0|acc |20.73|± | 2.93| | | |acc_norm|23.83|± | 3.07| |hendrycksTest-high_school_macroeconomics | 0|acc |29.23|± | 2.31| | | |acc_norm|29.23|± | 2.31| |hendrycksTest-high_school_mathematics | 0|acc |21.85|± | 2.52| | | |acc_norm|28.89|± | 2.76| |hendrycksTest-high_school_microeconomics | 0|acc |21.43|± | 2.67| | | |acc_norm|30.25|± | 2.98| |hendrycksTest-high_school_physics | 0|acc |22.52|± | 3.41| | | |acc_norm|25.17|± | 3.54| |hendrycksTest-high_school_psychology | 0|acc |22.57|± | 1.79| | | |acc_norm|24.22|± | 1.84| |hendrycksTest-high_school_statistics | 0|acc |25.46|± | 2.97| | | |acc_norm|27.78|± | 3.05| |hendrycksTest-high_school_us_history | 0|acc |25.00|± | 3.04| | | |acc_norm|25.49|± | 3.06| |hendrycksTest-high_school_world_history | 0|acc |26.58|± | 2.88| | | |acc_norm|27.85|± | 2.92| |hendrycksTest-human_aging | 0|acc |35.43|± | 3.21| | | |acc_norm|29.15|± | 3.05| |hendrycksTest-human_sexuality | 0|acc |40.46|± | 4.30| | | |acc_norm|31.30|± | 4.07| |hendrycksTest-international_law | 0|acc |17.36|± | 3.46| | | |acc_norm|47.93|± | 4.56| |hendrycksTest-jurisprudence | 0|acc |28.70|± | 4.37| | | |acc_norm|39.81|± | 4.73| |hendrycksTest-logical_fallacies | 0|acc |19.02|± | 3.08| | | |acc_norm|28.83|± | 3.56| |hendrycksTest-machine_learning | 0|acc |27.68|± | 4.25| | | |acc_norm|27.68|± | 4.25| |hendrycksTest-management | 0|acc |20.39|± | 3.99| | | |acc_norm|27.18|± | 4.41| |hendrycksTest-marketing | 0|acc |28.21|± | 2.95| | | |acc_norm|33.76|± | 3.10| |hendrycksTest-medical_genetics | 0|acc |27.00|± | 4.46| | | |acc_norm|36.00|± | 4.82| |hendrycksTest-miscellaneous | 0|acc |28.35|± | 1.61| | | |acc_norm|28.74|± | 1.62| |hendrycksTest-moral_disputes | 0|acc |27.17|± | 2.39| | | |acc_norm|30.35|± | 2.48| |hendrycksTest-moral_scenarios | 0|acc |23.80|± | 1.42| | | |acc_norm|27.26|± | 1.49| |hendrycksTest-nutrition | 0|acc |29.41|± | 2.61| | | |acc_norm|39.54|± | 2.80| |hendrycksTest-philosophy | 0|acc |23.79|± | 2.42| | | |acc_norm|30.87|± | 2.62| |hendrycksTest-prehistory | 0|acc |24.07|± | 2.38| | | |acc_norm|21.60|± | 2.29| |hendrycksTest-professional_accounting | 0|acc |25.89|± | 2.61| | | |acc_norm|26.24|± | 2.62| |hendrycksTest-professional_law | 0|acc |26.01|± | 1.12| | | |acc_norm|28.03|± | 1.15| |hendrycksTest-professional_medicine | 0|acc |24.63|± | 2.62| | | |acc_norm|22.43|± | 2.53| |hendrycksTest-professional_psychology | 0|acc |23.69|± | 1.72| | | |acc_norm|25.49|± | 1.76| |hendrycksTest-public_relations | 0|acc |25.45|± | 4.17| | | |acc_norm|19.09|± | 3.76| |hendrycksTest-security_studies | 0|acc |32.24|± | 2.99| | | |acc_norm|26.53|± | 2.83| |hendrycksTest-sociology | 0|acc |33.83|± | 3.35| | | |acc_norm|34.33|± | 3.36| |hendrycksTest-us_foreign_policy | 0|acc |32.00|± | 4.69| | | |acc_norm|27.00|± | 4.46| |hendrycksTest-virology | 0|acc |34.34|± | 3.70| | | |acc_norm|30.12|± | 3.57| |hendrycksTest-world_religions | 0|acc |34.50|± | 3.65| | | |acc_norm|33.92|± | 3.63| |lambada_openai | 0|ppl | 6.64|± | 0.17| | | |acc |57.93|± | 0.69| |logiqa | 0|acc |22.27|± | 1.63| | | |acc_norm|27.19|± | 1.75| |piqa | 0|acc |71.71|± | 1.05| | | |acc_norm|72.47|± | 1.04| |sciq | 0|acc |84.50|± | 1.15| | | |acc_norm|76.50|± | 1.34| |winogrande | 0|acc |59.75|± | 1.38| |wsc | 0|acc |38.46|± | 4.79|