# opt-125m

## opt-125m.json

|                      Task                       |Version| Metric |Value|   |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|arc_challenge                                    |      0|acc     |18.94|±  |  1.15|
|                                                 |       |acc_norm|22.78|±  |  1.23|
|arc_easy                                         |      0|acc     |43.52|±  |  1.02|
|                                                 |       |acc_norm|39.98|±  |  1.01|
|hendrycksTest-abstract_algebra                   |      0|acc     |17.00|±  |  3.78|
|                                                 |       |acc_norm|20.00|±  |  4.02|
|hendrycksTest-anatomy                            |      0|acc     |24.44|±  |  3.71|
|                                                 |       |acc_norm|21.48|±  |  3.55|
|hendrycksTest-astronomy                          |      0|acc     |20.39|±  |  3.28|
|                                                 |       |acc_norm|33.55|±  |  3.84|
|hendrycksTest-business_ethics                    |      0|acc     |35.00|±  |  4.79|
|                                                 |       |acc_norm|23.00|±  |  4.23|
|hendrycksTest-clinical_knowledge                 |      0|acc     |18.87|±  |  2.41|
|                                                 |       |acc_norm|27.92|±  |  2.76|
|hendrycksTest-college_biology                    |      0|acc     |22.92|±  |  3.51|
|                                                 |       |acc_norm|23.61|±  |  3.55|
|hendrycksTest-college_chemistry                  |      0|acc     |28.00|±  |  4.51|
|                                                 |       |acc_norm|27.00|±  |  4.46|
|hendrycksTest-college_computer_science           |      0|acc     |27.00|±  |  4.46|
|                                                 |       |acc_norm|22.00|±  |  4.16|
|hendrycksTest-college_mathematics                |      0|acc     |16.00|±  |  3.68|
|                                                 |       |acc_norm|25.00|±  |  4.35|
|hendrycksTest-college_medicine                   |      0|acc     |24.28|±  |  3.27|
|                                                 |       |acc_norm|26.59|±  |  3.37|
|hendrycksTest-college_physics                    |      0|acc     |28.43|±  |  4.49|
|                                                 |       |acc_norm|23.53|±  |  4.22|
|hendrycksTest-computer_security                  |      0|acc     |31.00|±  |  4.65|
|                                                 |       |acc_norm|33.00|±  |  4.73|
|hendrycksTest-conceptual_physics                 |      0|acc     |26.38|±  |  2.88|
|                                                 |       |acc_norm|17.45|±  |  2.48|
|hendrycksTest-econometrics                       |      0|acc     |31.58|±  |  4.37|
|                                                 |       |acc_norm|28.95|±  |  4.27|
|hendrycksTest-electrical_engineering             |      0|acc     |24.14|±  |  3.57|
|                                                 |       |acc_norm|30.34|±  |  3.83|
|hendrycksTest-elementary_mathematics             |      0|acc     |24.87|±  |  2.23|
|                                                 |       |acc_norm|25.13|±  |  2.23|
|hendrycksTest-formal_logic                       |      0|acc     |30.95|±  |  4.13|
|                                                 |       |acc_norm|28.57|±  |  4.04|
|hendrycksTest-global_facts                       |      0|acc     |19.00|±  |  3.94|
|                                                 |       |acc_norm|22.00|±  |  4.16|
|hendrycksTest-high_school_biology                |      0|acc     |24.84|±  |  2.46|
|                                                 |       |acc_norm|29.03|±  |  2.58|
|hendrycksTest-high_school_chemistry              |      0|acc     |17.24|±  |  2.66|
|                                                 |       |acc_norm|24.63|±  |  3.03|
|hendrycksTest-high_school_computer_science       |      0|acc     |21.00|±  |  4.09|
|                                                 |       |acc_norm|25.00|±  |  4.35|
|hendrycksTest-high_school_european_history       |      0|acc     |24.85|±  |  3.37|
|                                                 |       |acc_norm|31.52|±  |  3.63|
|hendrycksTest-high_school_geography              |      0|acc     |22.73|±  |  2.99|
|                                                 |       |acc_norm|28.79|±  |  3.23|
|hendrycksTest-high_school_government_and_politics|      0|acc     |24.87|±  |  3.12|
|                                                 |       |acc_norm|30.05|±  |  3.31|
|hendrycksTest-high_school_macroeconomics         |      0|acc     |26.15|±  |  2.23|
|                                                 |       |acc_norm|24.87|±  |  2.19|
|hendrycksTest-high_school_mathematics            |      0|acc     |17.04|±  |  2.29|
|                                                 |       |acc_norm|23.33|±  |  2.58|
|hendrycksTest-high_school_microeconomics         |      0|acc     |23.53|±  |  2.76|
|                                                 |       |acc_norm|29.83|±  |  2.97|
|hendrycksTest-high_school_physics                |      0|acc     |22.52|±  |  3.41|
|                                                 |       |acc_norm|21.19|±  |  3.34|
|hendrycksTest-high_school_psychology             |      0|acc     |25.14|±  |  1.86|
|                                                 |       |acc_norm|25.69|±  |  1.87|
|hendrycksTest-high_school_statistics             |      0|acc     |24.54|±  |  2.93|
|                                                 |       |acc_norm|27.78|±  |  3.05|
|hendrycksTest-high_school_us_history             |      0|acc     |24.51|±  |  3.02|
|                                                 |       |acc_norm|26.47|±  |  3.10|
|hendrycksTest-high_school_world_history          |      0|acc     |26.16|±  |  2.86|
|                                                 |       |acc_norm|28.69|±  |  2.94|
|hendrycksTest-human_aging                        |      0|acc     |36.32|±  |  3.23|
|                                                 |       |acc_norm|26.01|±  |  2.94|
|hendrycksTest-human_sexuality                    |      0|acc     |32.82|±  |  4.12|
|                                                 |       |acc_norm|32.06|±  |  4.09|
|hendrycksTest-international_law                  |      0|acc     |14.88|±  |  3.25|
|                                                 |       |acc_norm|38.02|±  |  4.43|
|hendrycksTest-jurisprudence                      |      0|acc     |15.74|±  |  3.52|
|                                                 |       |acc_norm|37.04|±  |  4.67|
|hendrycksTest-logical_fallacies                  |      0|acc     |26.38|±  |  3.46|
|                                                 |       |acc_norm|30.06|±  |  3.60|
|hendrycksTest-machine_learning                   |      0|acc     |24.11|±  |  4.06|
|                                                 |       |acc_norm|27.68|±  |  4.25|
|hendrycksTest-management                         |      0|acc     |17.48|±  |  3.76|
|                                                 |       |acc_norm|27.18|±  |  4.41|
|hendrycksTest-marketing                          |      0|acc     |33.76|±  |  3.10|
|                                                 |       |acc_norm|35.90|±  |  3.14|
|hendrycksTest-medical_genetics                   |      0|acc     |29.00|±  |  4.56|
|                                                 |       |acc_norm|38.00|±  |  4.88|
|hendrycksTest-miscellaneous                      |      0|acc     |27.46|±  |  1.60|
|                                                 |       |acc_norm|26.18|±  |  1.57|
|hendrycksTest-moral_disputes                     |      0|acc     |28.32|±  |  2.43|
|                                                 |       |acc_norm|29.48|±  |  2.45|
|hendrycksTest-moral_scenarios                    |      0|acc     |23.80|±  |  1.42|
|                                                 |       |acc_norm|27.26|±  |  1.49|
|hendrycksTest-nutrition                          |      0|acc     |28.76|±  |  2.59|
|                                                 |       |acc_norm|34.97|±  |  2.73|
|hendrycksTest-philosophy                         |      0|acc     |20.90|±  |  2.31|
|                                                 |       |acc_norm|31.19|±  |  2.63|
|hendrycksTest-prehistory                         |      0|acc     |27.47|±  |  2.48|
|                                                 |       |acc_norm|21.91|±  |  2.30|
|hendrycksTest-professional_accounting            |      0|acc     |25.89|±  |  2.61|
|                                                 |       |acc_norm|26.24|±  |  2.62|
|hendrycksTest-professional_law                   |      0|acc     |25.23|±  |  1.11|
|                                                 |       |acc_norm|27.38|±  |  1.14|
|hendrycksTest-professional_medicine              |      0|acc     |25.00|±  |  2.63|
|                                                 |       |acc_norm|22.79|±  |  2.55|
|hendrycksTest-professional_psychology            |      0|acc     |23.86|±  |  1.72|
|                                                 |       |acc_norm|24.84|±  |  1.75|
|hendrycksTest-public_relations                   |      0|acc     |29.09|±  |  4.35|
|                                                 |       |acc_norm|25.45|±  |  4.17|
|hendrycksTest-security_studies                   |      0|acc     |33.47|±  |  3.02|
|                                                 |       |acc_norm|25.71|±  |  2.80|
|hendrycksTest-sociology                          |      0|acc     |27.86|±  |  3.17|
|                                                 |       |acc_norm|24.88|±  |  3.06|
|hendrycksTest-us_foreign_policy                  |      0|acc     |28.00|±  |  4.51|
|                                                 |       |acc_norm|34.00|±  |  4.76|
|hendrycksTest-virology                           |      0|acc     |31.93|±  |  3.63|
|                                                 |       |acc_norm|27.71|±  |  3.48|
|hendrycksTest-world_religions                    |      0|acc     |28.65|±  |  3.47|
|                                                 |       |acc_norm|32.16|±  |  3.58|
|lambada_openai                                   |      0|ppl     |26.02|±  |  0.94|
|                                                 |       |acc     |37.90|±  |  0.68|
|logiqa                                           |      0|acc     |22.73|±  |  1.64|
|                                                 |       |acc_norm|27.96|±  |  1.76|
|piqa                                             |      0|acc     |63.00|±  |  1.13|
|                                                 |       |acc_norm|62.02|±  |  1.13|
|sciq                                             |      0|acc     |75.10|±  |  1.37|
|                                                 |       |acc_norm|66.90|±  |  1.49|
|winogrande                                       |      0|acc     |50.28|±  |  1.41|
|wsc                                              |      0|acc     |36.54|±  |  4.74|