# opt-66b

## opt-66b.json

| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|arc_challenge | 0|acc |37.20|± | 1.41|
| | |acc_norm|40.10|± | 1.43|
|arc_easy | 0|acc |71.68|± | 0.92|
| | |acc_norm|67.30|± | 0.96|
|hendrycksTest-abstract_algebra | 0|acc |23.00|± | 4.23|
| | |acc_norm|24.00|± | 4.29|
|hendrycksTest-anatomy | 0|acc |27.41|± | 3.85|
| | |acc_norm|26.67|± | 3.82|
|hendrycksTest-astronomy | 0|acc |28.95|± | 3.69|
| | |acc_norm|40.13|± | 3.99|
|hendrycksTest-business_ethics | 0|acc |29.00|± | 4.56|
| | |acc_norm|28.00|± | 4.51|
|hendrycksTest-clinical_knowledge | 0|acc |24.15|± | 2.63|
| | |acc_norm|27.55|± | 2.75|
|hendrycksTest-college_biology | 0|acc |24.31|± | 3.59|
| | |acc_norm|25.00|± | 3.62|
|hendrycksTest-college_chemistry | 0|acc |30.00|± | 4.61|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-college_computer_science | 0|acc |23.00|± | 4.23|
| | |acc_norm|28.00|± | 4.51|
|hendrycksTest-college_mathematics | 0|acc |23.00|± | 4.23|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-college_medicine | 0|acc |23.70|± | 3.24|
| | |acc_norm|24.86|± | 3.30|
|hendrycksTest-college_physics | 0|acc |28.43|± | 4.49|
| | |acc_norm|26.47|± | 4.39|
|hendrycksTest-computer_security | 0|acc |32.00|± | 4.69|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-conceptual_physics | 0|acc |25.53|± | 2.85|
| | |acc_norm|22.98|± | 2.75|
|hendrycksTest-econometrics | 0|acc |28.07|± | 4.23|
| | |acc_norm|20.18|± | 3.78|
|hendrycksTest-electrical_engineering | 0|acc |35.86|± | 4.00|
| | |acc_norm|38.62|± | 4.06|
|hendrycksTest-elementary_mathematics | 0|acc |26.46|± | 2.27|
| | |acc_norm|27.25|± | 2.29|
|hendrycksTest-formal_logic | 0|acc |30.16|± | 4.10|
| | |acc_norm|28.57|± | 4.04|
|hendrycksTest-global_facts | 0|acc |29.00|± | 4.56|
| | |acc_norm|26.00|± | 4.41|
|hendrycksTest-high_school_biology | 0|acc |26.13|± | 2.50|
| | |acc_norm|31.94|± | 2.65|
|hendrycksTest-high_school_chemistry | 0|acc |24.14|± | 3.01|
| | |acc_norm|34.48|± | 3.34|
|hendrycksTest-high_school_computer_science | 0|acc |31.00|± | 4.65|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-high_school_european_history | 0|acc |29.09|± | 3.55|
| | |acc_norm|29.70|± | 3.57|
|hendrycksTest-high_school_geography | 0|acc |26.26|± | 3.14|
| | |acc_norm|31.82|± | 3.32|
|hendrycksTest-high_school_government_and_politics| 0|acc |26.42|± | 3.18|
| | |acc_norm|26.42|± | 3.18|
|hendrycksTest-high_school_macroeconomics | 0|acc |29.49|± | 2.31|
| | |acc_norm|26.67|± | 2.24|
|hendrycksTest-high_school_mathematics | 0|acc |21.85|± | 2.52|
| | |acc_norm|32.22|± | 2.85|
|hendrycksTest-high_school_microeconomics | 0|acc |29.83|± | 2.97|
| | |acc_norm|36.13|± | 3.12|
|hendrycksTest-high_school_physics | 0|acc |22.52|± | 3.41|
| | |acc_norm|23.18|± | 3.45|
|hendrycksTest-high_school_psychology | 0|acc |28.44|± | 1.93|
| | |acc_norm|25.87|± | 1.88|
|hendrycksTest-high_school_statistics | 0|acc |29.17|± | 3.10|
| | |acc_norm|33.33|± | 3.21|
|hendrycksTest-high_school_us_history | 0|acc |27.45|± | 3.13|
| | |acc_norm|30.39|± | 3.23|
|hendrycksTest-high_school_world_history | 0|acc |30.80|± | 3.01|
| | |acc_norm|32.49|± | 3.05|
|hendrycksTest-human_aging | 0|acc |28.70|± | 3.04|
| | |acc_norm|22.42|± | 2.80|
|hendrycksTest-human_sexuality | 0|acc |36.64|± | 4.23|
| | |acc_norm|32.82|± | 4.12|
|hendrycksTest-international_law | 0|acc |25.62|± | 3.98|
| | |acc_norm|49.59|± | 4.56|
|hendrycksTest-jurisprudence | 0|acc |30.56|± | 4.45|
| | |acc_norm|42.59|± | 4.78|
|hendrycksTest-logical_fallacies | 0|acc |23.93|± | 3.35|
| | |acc_norm|28.83|± | 3.56|
|hendrycksTest-machine_learning | 0|acc |24.11|± | 4.06|
| | |acc_norm|23.21|± | 4.01|
|hendrycksTest-management | 0|acc |29.13|± | 4.50|
| | |acc_norm|33.98|± | 4.69|
|hendrycksTest-marketing | 0|acc |29.06|± | 2.97|
| | |acc_norm|29.06|± | 2.97|
|hendrycksTest-medical_genetics | 0|acc |34.00|± | 4.76|
| | |acc_norm|45.00|± | 5.00|
|hendrycksTest-miscellaneous | 0|acc |32.69|± | 1.68|
| | |acc_norm|29.37|± | 1.63|
|hendrycksTest-moral_disputes | 0|acc |32.08|± | 2.51|
| | |acc_norm|31.79|± | 2.51|
|hendrycksTest-moral_scenarios | 0|acc |22.12|± | 1.39|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |33.33|± | 2.70|
| | |acc_norm|39.54|± | 2.80|
|hendrycksTest-philosophy | 0|acc |25.08|± | 2.46|
| | |acc_norm|35.05|± | 2.71|
|hendrycksTest-prehistory | 0|acc |24.38|± | 2.39|
| | |acc_norm|21.30|± | 2.28|
|hendrycksTest-professional_accounting | 0|acc |20.21|± | 2.40|
| | |acc_norm|22.70|± | 2.50|
|hendrycksTest-professional_law | 0|acc |27.90|± | 1.15|
| | |acc_norm|29.53|± | 1.17|
|hendrycksTest-professional_medicine | 0|acc |26.47|± | 2.68|
| | |acc_norm|29.78|± | 2.78|
|hendrycksTest-professional_psychology | 0|acc |26.14|± | 1.78|
| | |acc_norm|28.59|± | 1.83|
|hendrycksTest-public_relations | 0|acc |34.55|± | 4.55|
| | |acc_norm|15.45|± | 3.46|
|hendrycksTest-security_studies | 0|acc |38.37|± | 3.11|
| | |acc_norm|32.65|± | 3.00|
|hendrycksTest-sociology | 0|acc |28.86|± | 3.20|
| | |acc_norm|27.36|± | 3.15|
|hendrycksTest-us_foreign_policy | 0|acc |37.00|± | 4.85|
| | |acc_norm|36.00|± | 4.82|
|hendrycksTest-virology | 0|acc |32.53|± | 3.65|
| | |acc_norm|30.12|± | 3.57|
|hendrycksTest-world_religions | 0|acc |33.92|± | 3.63|
| | |acc_norm|37.43|± | 3.71|
|lambada_openai | 0|ppl | 3.29|± | 0.06|
| | |acc |73.90|± | 0.61|
|logiqa | 0|acc |22.73|± | 1.64|
| | |acc_norm|28.73|± | 1.77|
|piqa | 0|acc |78.78|± | 0.95|
| | |acc_norm|79.87|± | 0.94|
|sciq | 0|acc |92.60|± | 0.83|
| | |acc_norm|87.30|± | 1.05|
|winogrande | 0|acc |68.75|± | 1.30|
|wsc | 0|acc |54.81|± | 4.90|