llama-7B_mmlu_5-shot.json
{
  "results": {
    "hendrycksTest-high_school_geography": {
      "acc": 0.4292929292929293,
      "acc_stderr": 0.035265527246011986,
      "acc_norm": 0.36363636363636365,
      "acc_norm_stderr": 0.03427308652999934
    },
    "hendrycksTest-philosophy": {
      "acc": 0.40192926045016075,
      "acc_stderr": 0.027846476005930477,
      "acc_norm": 0.3536977491961415,
      "acc_norm_stderr": 0.02715520810320088
    },
    "hendrycksTest-world_religions": {
      "acc": 0.6257309941520468,
      "acc_stderr": 0.03711601185389481,
      "acc_norm": 0.5146198830409356,
      "acc_norm_stderr": 0.038331852752130254
    },
    "hendrycksTest-college_biology": {
      "acc": 0.3194444444444444,
      "acc_stderr": 0.03899073687357335,
      "acc_norm": 0.2916666666666667,
      "acc_norm_stderr": 0.03800968060554858
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.3586206896551724,
      "acc_stderr": 0.03996629574876719,
      "acc_norm": 0.32413793103448274,
      "acc_norm_stderr": 0.03900432069185554
    },
    "hendrycksTest-global_facts": {
      "acc": 0.32,
      "acc_stderr": 0.046882617226215034,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.045604802157206824
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.48186528497409326,
      "acc_stderr": 0.03606065001832917,
      "acc_norm": 0.37305699481865284,
      "acc_norm_stderr": 0.03490205592048573
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.2759776536312849,
      "acc_stderr": 0.014950103002475353,
      "acc_norm": 0.27262569832402234,
      "acc_norm_stderr": 0.014893391735249588
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2894736842105263,
      "acc_stderr": 0.04266339443159394,
      "acc_norm": 0.2631578947368421,
      "acc_norm_stderr": 0.0414243971948936
    },
    "hendrycksTest-international_law": {
      "acc": 0.3884297520661157,
      "acc_stderr": 0.04449270350068382,
      "acc_norm": 0.5785123966942148,
      "acc_norm_stderr": 0.045077322787750874
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.56,
      "acc_stderr": 0.049888765156985884,
      "acc_norm": 0.45,
      "acc_norm_stderr": 0.05
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.31794871794871793,
      "acc_stderr": 0.02361088430892786,
      "acc_norm": 0.30256410256410254,
      "acc_norm_stderr": 0.023290888053772742
    },
    "hendrycksTest-virology": {
      "acc": 0.39759036144578314,
      "acc_stderr": 0.038099730845402184,
      "acc_norm": 0.2891566265060241,
      "acc_norm_stderr": 0.035294868015111155
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.22592592592592592,
      "acc_stderr": 0.025497532639609542,
      "acc_norm": 0.3074074074074074,
      "acc_norm_stderr": 0.02813325257881564
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.3886792452830189,
      "acc_stderr": 0.03000048544867599,
      "acc_norm": 0.38113207547169814,
      "acc_norm_stderr": 0.029890609686286627
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.3839869281045752,
      "acc_stderr": 0.01967580813528152,
      "acc_norm": 0.29901960784313725,
      "acc_norm_stderr": 0.01852175621542302
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.30952380952380953,
      "acc_stderr": 0.04134913018303316,
      "acc_norm": 0.3492063492063492,
      "acc_norm_stderr": 0.042639068927951315
    },
    "hendrycksTest-management": {
      "acc": 0.4854368932038835,
      "acc_stderr": 0.04948637324026637,
      "acc_norm": 0.36893203883495146,
      "acc_norm_stderr": 0.0477761518115674
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.5114503816793893,
      "acc_stderr": 0.043841400240780176,
      "acc_norm": 0.366412213740458,
      "acc_norm_stderr": 0.042258754519696386
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.3924050632911392,
      "acc_stderr": 0.03178471874564729,
      "acc_norm": 0.33755274261603374,
      "acc_norm_stderr": 0.030781549102026216
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.44,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.4,
      "acc_norm_stderr": 0.04923659639173309
    },
    "hendrycksTest-computer_security": {
      "acc": 0.37,
      "acc_stderr": 0.048523658709391,
      "acc_norm": 0.44,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.5836526181353767,
      "acc_stderr": 0.0176279480304303,
      "acc_norm": 0.3895274584929757,
      "acc_norm_stderr": 0.017438082556264597
    },
    "hendrycksTest-public_relations": {
      "acc": 0.39090909090909093,
      "acc_stderr": 0.046737523336702384,
      "acc_norm": 0.22727272727272727,
      "acc_norm_stderr": 0.040139645540727735
    },
    "hendrycksTest-college_physics": {
      "acc": 0.23529411764705882,
      "acc_stderr": 0.04220773659171453,
      "acc_norm": 0.3235294117647059,
      "acc_norm_stderr": 0.046550104113196177
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.30141843971631205,
      "acc_stderr": 0.02737412888263115,
      "acc_norm": 0.29432624113475175,
      "acc_norm_stderr": 0.027187127011503793
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.3803680981595092,
      "acc_stderr": 0.03814269893261837,
      "acc_norm": 0.3496932515337423,
      "acc_norm_stderr": 0.037466683254700206
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.53,
      "acc_stderr": 0.050161355804659205,
      "acc_norm": 0.46,
      "acc_norm_stderr": 0.05009082659620332
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.2512315270935961,
      "acc_stderr": 0.030516530732694436,
      "acc_norm": 0.2955665024630542,
      "acc_norm_stderr": 0.03210494433751458
    },
    "hendrycksTest-astronomy": {
      "acc": 0.45394736842105265,
      "acc_stderr": 0.04051646342874143,
      "acc_norm": 0.4605263157894737,
      "acc_norm_stderr": 0.04056242252249033
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.4264705882352941,
      "acc_stderr": 0.03471157907953424,
      "acc_norm": 0.3137254901960784,
      "acc_norm_stderr": 0.032566854844603886
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.33,
      "acc_stderr": 0.047258156262526045,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.046056618647183814
    },
    "hendrycksTest-abstract_algebra": {
      "acc": 0.23,
      "acc_stderr": 0.042295258468165065,
      "acc_norm": 0.26,
      "acc_norm_stderr": 0.0440844002276808
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.36416184971098264,
      "acc_stderr": 0.025906632631016117,
      "acc_norm": 0.33236994219653176,
      "acc_norm_stderr": 0.02536116874968821
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.33,
      "acc_stderr": 0.04725815626252605,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.04512608598542128
    },
    "hendrycksTest-professional_law": {
      "acc": 0.2966101694915254,
      "acc_stderr": 0.011665946586082849,
      "acc_norm": 0.28552803129074317,
      "acc_norm_stderr": 0.011535751586665664
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.32,
      "acc_stderr": 0.046882617226215034,
      "acc_norm": 0.32,
      "acc_norm_stderr": 0.04688261722621505
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.3865546218487395,
      "acc_stderr": 0.0316314580755238,
      "acc_norm": 0.36554621848739494,
      "acc_norm_stderr": 0.0312821770636846
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.40606060606060607,
      "acc_stderr": 0.03834816355401181,
      "acc_norm": 0.3696969696969697,
      "acc_norm_stderr": 0.03769430314512568
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.3580645161290323,
      "acc_stderr": 0.027273890594300642,
      "acc_norm": 0.3580645161290323,
      "acc_norm_stderr": 0.02727389059430063
    },
    "hendrycksTest-security_studies": {
      "acc": 0.40816326530612246,
      "acc_stderr": 0.03146465712827424,
      "acc_norm": 0.31020408163265306,
      "acc_norm_stderr": 0.029613459872484375
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.46605504587155966,
      "acc_stderr": 0.02138786335035399,
      "acc_norm": 0.30825688073394497,
      "acc_norm_stderr": 0.01979836669836726
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.3276595744680851,
      "acc_stderr": 0.030683020843231004,
      "acc_norm": 0.2170212765957447,
      "acc_norm_stderr": 0.026947483121496228
    },
    "hendrycksTest-human_aging": {
      "acc": 0.3721973094170404,
      "acc_stderr": 0.03244305283008731,
      "acc_norm": 0.25112107623318386,
      "acc_norm_stderr": 0.02910522083322462
    },
    "hendrycksTest-prehistory": {
      "acc": 0.4012345679012346,
      "acc_stderr": 0.0272725828498398,
      "acc_norm": 0.2777777777777778,
      "acc_norm_stderr": 0.02492200116888633
    },
    "hendrycksTest-sociology": {
      "acc": 0.47761194029850745,
      "acc_stderr": 0.035319879302087305,
      "acc_norm": 0.42786069651741293,
      "acc_norm_stderr": 0.03498541988407795
    },
    "hendrycksTest-marketing": {
      "acc": 0.6111111111111112,
      "acc_stderr": 0.031937057262002924,
      "acc_norm": 0.5042735042735043,
      "acc_norm_stderr": 0.03275489264382132
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.41,
      "acc_stderr": 0.049431107042371025,
      "acc_norm": 0.34,
      "acc_norm_stderr": 0.047609522856952365
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.30357142857142855,
      "acc_stderr": 0.04364226155841044,
      "acc_norm": 0.26785714285714285,
      "acc_norm_stderr": 0.04203277291467762
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.3201058201058201,
      "acc_stderr": 0.024026846392873506,
      "acc_norm": 0.291005291005291,
      "acc_norm_stderr": 0.023393826500484865
    },
    "hendrycksTest-nutrition": {
      "acc": 0.3954248366013072,
      "acc_stderr": 0.027996723180631435,
      "acc_norm": 0.43790849673202614,
      "acc_norm_stderr": 0.028408302020332694
    },
    "hendrycksTest-anatomy": {
      "acc": 0.3851851851851852,
      "acc_stderr": 0.042039210401562783,
      "acc_norm": 0.2814814814814815,
      "acc_norm_stderr": 0.03885004245800254
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.4351851851851852,
      "acc_stderr": 0.04792898170907062,
      "acc_norm": 0.5,
      "acc_norm_stderr": 0.04833682445228318
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.37572254335260113,
      "acc_stderr": 0.036928207672648664,
      "acc_norm": 0.3063583815028902,
      "acc_norm_stderr": 0.03514942551267439
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.3425925925925926,
      "acc_stderr": 0.03236585252602156,
      "acc_norm": 0.3425925925925926,
      "acc_norm_stderr": 0.03236585252602156
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.2052980132450331,
      "acc_stderr": 0.03297986648473834,
      "acc_norm": 0.271523178807947,
      "acc_norm_stderr": 0.036313298039696525
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.3382352941176471,
      "acc_stderr": 0.028739328513983576,
      "acc_norm": 0.27941176470588236,
      "acc_norm_stderr": 0.027257202606114948
    }
  },
  "versions": {
    "hendrycksTest-high_school_geography": 0,
    "hendrycksTest-philosophy": 0,
    "hendrycksTest-world_religions": 0,
    "hendrycksTest-college_biology": 0,
    "hendrycksTest-electrical_engineering": 0,
    "hendrycksTest-global_facts": 0,
    "hendrycksTest-high_school_government_and_politics": 0,
    "hendrycksTest-moral_scenarios": 0,
    "hendrycksTest-econometrics": 0,
    "hendrycksTest-international_law": 0,
    "hendrycksTest-us_foreign_policy": 0,
    "hendrycksTest-high_school_macroeconomics": 0,
    "hendrycksTest-virology": 0,
    "hendrycksTest-high_school_mathematics": 0,
    "hendrycksTest-clinical_knowledge": 0,
    "hendrycksTest-professional_psychology": 0,
    "hendrycksTest-formal_logic": 0,
    "hendrycksTest-management": 0,
    "hendrycksTest-human_sexuality": 0,
    "hendrycksTest-high_school_world_history": 0,
    "hendrycksTest-medical_genetics": 0,
    "hendrycksTest-computer_security": 0,
    "hendrycksTest-miscellaneous": 0,
    "hendrycksTest-public_relations": 0,
    "hendrycksTest-college_physics": 0,
    "hendrycksTest-professional_accounting": 0,
    "hendrycksTest-logical_fallacies": 0,
    "hendrycksTest-business_ethics": 0,
    "hendrycksTest-high_school_chemistry": 0,
    "hendrycksTest-astronomy": 0,
    "hendrycksTest-high_school_us_history": 0,
    "hendrycksTest-college_chemistry": 0,
    "hendrycksTest-abstract_algebra": 0,
    "hendrycksTest-moral_disputes": 0,
    "hendrycksTest-college_computer_science": 0,
    "hendrycksTest-professional_law": 0,
    "hendrycksTest-college_mathematics": 0,
    "hendrycksTest-high_school_microeconomics": 0,
    "hendrycksTest-high_school_european_history": 0,
    "hendrycksTest-high_school_biology": 0,
    "hendrycksTest-security_studies": 0,
    "hendrycksTest-high_school_psychology": 0,
    "hendrycksTest-conceptual_physics": 0,
    "hendrycksTest-human_aging": 0,
    "hendrycksTest-prehistory": 0,
    "hendrycksTest-sociology": 0,
    "hendrycksTest-marketing": 0,
    "hendrycksTest-high_school_computer_science": 0,
    "hendrycksTest-machine_learning": 0,
    "hendrycksTest-elementary_mathematics": 0,
    "hendrycksTest-nutrition": 0,
    "hendrycksTest-anatomy": 0,
    "hendrycksTest-jurisprudence": 0,
    "hendrycksTest-college_medicine": 0,
    "hendrycksTest-high_school_statistics": 0,
    "hendrycksTest-high_school_physics": 0,
    "hendrycksTest-professional_medicine": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/7B,use_accelerate=True",
    "num_fewshot": 5,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
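
A minimal sketch (not part of the original results file) of how one might load this JSON and summarize it: it reads the file under an assumed local filename, filters the hendrycksTest-* subtasks (57 in this file), and prints the unweighted macro-average of acc and acc_norm. A question-count-weighted micro-average would need per-subtask sizes, which this file does not record.

    # load_mmlu_results.py -- hedged example; the filename below is an assumption
    import json
    from statistics import mean

    # Assumed path: the results file shown above, saved next to this script.
    with open("llama-7B_mmlu_5-shot.json") as f:
        data = json.load(f)

    # Keep only the MMLU (hendrycksTest-*) subtask entries.
    tasks = {k: v for k, v in data["results"].items() if k.startswith("hendrycksTest-")}

    # Unweighted macro-average across subtasks.
    avg_acc = mean(t["acc"] for t in tasks.values())
    avg_acc_norm = mean(t["acc_norm"] for t in tasks.values())

    print(f"MMLU subtasks: {len(tasks)}")
    print(f"macro-average acc      : {avg_acc:.4f}")
    print(f"macro-average acc_norm : {avg_acc_norm:.4f}")

The "config" block above suggests the run was produced with the older lm-evaluation-harness interface (model "hf-causal-experimental", num_fewshot 5, batch_size "auto", no_cache); reproducing it would use that harness's CLI with matching --model, --model_args, --num_fewshot, --batch_size, and --device options, though the exact invocation is not recorded in this file.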