llama-13B_bbh_3-shot.json 4.47 KB
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
{
  "results": {
    "bigbench_sports_understanding": {
      "multiple_choice_grade": 0.5811359026369168,
      "multiple_choice_grade_stderr": 0.015720172474974117
    },
    "bigbench_salient_translation_error_detection": {
      "multiple_choice_grade": 0.1933867735470942,
      "multiple_choice_grade_stderr": 0.012508305339715512
    },
    "bigbench_date_understanding": {
      "multiple_choice_grade": 0.6395663956639567,
      "multiple_choice_grade_stderr": 0.025028311208714224
    },
    "bigbench_navigate": {
      "multiple_choice_grade": 0.517,
      "multiple_choice_grade_stderr": 0.015810153729833434
    },
    "bigbench_dyck_languages": {
      "multiple_choice_grade": 0.201,
      "multiple_choice_grade_stderr": 0.012679107214617324
    },
    "bigbench_movie_recommendation": {
      "multiple_choice_grade": 0.436,
      "multiple_choice_grade_stderr": 0.022198954641476802
    },
    "bigbench_snarks": {
      "multiple_choice_grade": 0.4696132596685083,
      "multiple_choice_grade_stderr": 0.03719891321680327
    },
    "bigbench_disambiguation_qa": {
      "multiple_choice_grade": 0.4573643410852713,
      "multiple_choice_grade_stderr": 0.03107554499047266
    },
    "bigbench_reasoning_about_colored_objects": {
      "multiple_choice_grade": 0.3705,
      "multiple_choice_grade_stderr": 0.010801537464907349
    },
    "bigbench_geometric_shapes": {
      "multiple_choice_grade": 0.23119777158774374,
      "multiple_choice_grade_stderr": 0.02228217728550543,
      "exact_str_match": 0.0,
      "exact_str_match_stderr": 0.0
    },
    "bigbench_tracking_shuffled_objects_five_objects": {
      "multiple_choice_grade": 0.2144,
      "multiple_choice_grade_stderr": 0.011612665292522431
    },
    "bigbench_formal_fallacies_syllogisms_negation": {
      "multiple_choice_grade": 0.5113380281690141,
      "multiple_choice_grade_stderr": 0.004194975590734721
    },
    "bigbench_tracking_shuffled_objects_three_objects": {
      "multiple_choice_grade": 0.4166666666666667,
      "multiple_choice_grade_stderr": 0.028511310643917567
    },
    "bigbench_hyperbaton": {
      "multiple_choice_grade": 0.5038,
      "multiple_choice_grade_stderr": 0.0022360257592931206
    },
    "bigbench_temporal_sequences": {
      "multiple_choice_grade": 0.28,
      "multiple_choice_grade_stderr": 0.014205696104091493
    },
    "bigbench_logical_deduction_three_objects": {
      "multiple_choice_grade": 0.4166666666666667,
      "multiple_choice_grade_stderr": 0.028511310643917567
    },
    "bigbench_causal_judgement": {
      "multiple_choice_grade": 0.49473684210526314,
      "multiple_choice_grade_stderr": 0.036367633377878836
    },
    "bigbench_tracking_shuffled_objects_seven_objects": {
      "multiple_choice_grade": 0.14457142857142857,
      "multiple_choice_grade_stderr": 0.008408881015830339
    },
    "bigbench_logical_deduction_seven_objects": {
      "multiple_choice_grade": 0.22285714285714286,
      "multiple_choice_grade_stderr": 0.015740739118727993
    },
    "bigbench_logical_deduction_five_objects": {
      "multiple_choice_grade": 0.3,
      "multiple_choice_grade_stderr": 0.020514426225628046
    },
    "bigbench_ruin_names": {
      "multiple_choice_grade": 0.34598214285714285,
      "multiple_choice_grade_stderr": 0.02249924183068251
    }
  },
  "versions": {
    "bigbench_sports_understanding": 0,
    "bigbench_salient_translation_error_detection": 0,
    "bigbench_date_understanding": 0,
    "bigbench_navigate": 0,
    "bigbench_dyck_languages": 0,
    "bigbench_movie_recommendation": 0,
    "bigbench_snarks": 0,
    "bigbench_disambiguation_qa": 0,
    "bigbench_reasoning_about_colored_objects": 0,
    "bigbench_geometric_shapes": 0,
    "bigbench_tracking_shuffled_objects_five_objects": 0,
    "bigbench_formal_fallacies_syllogisms_negation": 0,
    "bigbench_tracking_shuffled_objects_three_objects": 0,
    "bigbench_hyperbaton": 0,
    "bigbench_temporal_sequences": 0,
    "bigbench_logical_deduction_three_objects": 0,
    "bigbench_causal_judgement": 0,
    "bigbench_tracking_shuffled_objects_seven_objects": 0,
    "bigbench_logical_deduction_seven_objects": 0,
    "bigbench_logical_deduction_five_objects": 0,
    "bigbench_ruin_names": 0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 3,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}