gaoqiong / lm-evaluation-harness · Commits

Commit 29f12dd9 (Unverified)
Authored Aug 01, 2023 by Lintang Sutawika; committed by GitHub on Aug 01, 2023

Merge branch 'big-refactor' into benchmark-scripts

Parents: e37698df, 4168c05f
Changes: 222

Showing 20 changed files with 0 additions and 1489 deletions (+0 -1489):

results/bloom/bloom-3b/bloom-3b_question_answering_0-shot.json  +0 -66
results/bloom/bloom-3b/bloom-3b_reading_comprehension_0-shot.json  +0 -36
results/bloom/bloom-3b/bloom-3b_xcopa_0-shot.json  +0 -72
results/bloom/bloom-3b/bloom-3b_xnli_0-shot.json  +0 -92
results/bloom/bloom-3b/bloom-3b_xstory_cloze_0-shot.json  +0 -72
results/bloom/bloom-3b/bloom-3b_xwinograd_0-shot.json  +0 -47
results/bloom/bloom-560m/README.md  +0 -155
results/bloom/bloom-560m/bloom-560m_common_sense_reasoning_0-shot.json  +0 -91
results/bloom/bloom-560m/bloom-560m_gsm8k_8-shot.json  +0 -22
results/bloom/bloom-560m/bloom-560m_lambada_openai_0-shot.json  +0 -31
results/bloom/bloom-560m/bloom-560m_mathematical_reasoning_few_shot_5-shot.json  +0 -71
results/bloom/bloom-560m/bloom-560m_pawsx_0-shot.json  +0 -52
results/bloom/bloom-560m/bloom-560m_question_answering_0-shot.json  +0 -66
results/bloom/bloom-560m/bloom-560m_reading_comprehension_0-shot.json  +0 -36
results/bloom/bloom-560m/bloom-560m_xcopa_0-shot.json  +0 -72
results/bloom/bloom-560m/bloom-560m_xnli_0-shot.json  +0 -92
results/bloom/bloom-560m/bloom-560m_xstory_cloze_0-shot.json  +0 -72
results/bloom/bloom-560m/bloom-560m_xwinograd_0-shot.json  +0 -47
results/bloom/bloom-7b1/README.md  +0 -173
results/bloom/bloom-7b1/bloom-7b1_bbh_3-shot.json  +0 -124
results/bloom/bloom-3b/bloom-3b_question_answering_0-shot.json deleted 100644 → 0

{
  "results": {
    "squad2": {"exact": 6.914848816642803, "f1": 11.511512971067512, "HasAns_exact": 11.099865047233468, "HasAns_f1": 20.306375422652543, "NoAns_exact": 2.7417998317914214, "NoAns_f1": 2.7417998317914214, "best_exact": 50.07159100480081, "best_f1": 50.08024690773861},
    "logiqa": {"acc": 0.2073732718894009, "acc_stderr": 0.015902084913876333, "acc_norm": 0.29185867895545314, "acc_norm_stderr": 0.017831570553971925},
    "headqa_en": {"acc": 0.2840991976659373, "acc_stderr": 0.008614040521644994, "acc_norm": 0.3336980306345733, "acc_norm_stderr": 0.009006537310888562},
    "truthfulqa_mc": {"mc1": 0.23255813953488372, "mc1_stderr": 0.014789157531080503, "mc2": 0.40572206357204965, "mc2_stderr": 0.014390512893375817},
    "webqs": {"acc": 0.01673228346456693, "acc_stderr": 0.0028461549169432184},
    "triviaqa": {"acc": 0.04154512507734465, "acc_stderr": 0.0018761872163031025},
    "headqa_es": {"acc": 0.26440554339897887, "acc_stderr": 0.008423643607316284, "acc_norm": 0.3099927060539752, "acc_norm_stderr": 0.008833810133604958}
  },
  "versions": {"squad2": 1, "logiqa": 0, "headqa_en": 0, "truthfulqa_mc": 1, "webqs": 0, "triviaqa": 1, "headqa_es": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-3b,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
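
Each deleted results file shares the same schema: a "results" map from task name to metric values, a "versions" map recording each task's version, and a "config" block recording how the run was launched. As a rough sketch, not documented by this commit, of how a file like the one above could be regenerated with the pre-refactor harness, assuming `lm_eval.evaluator.simple_evaluate` accepts arguments mirroring the stored "config" block:

```python
# Hedged sketch: assumes the pre-refactor lm-evaluation-harness API
# (lm_eval.evaluator.simple_evaluate); the arguments below simply mirror
# the "config" block stored in each results JSON. Verify against the
# branch you actually use.
import json

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    model_args="pretrained=bigscience/bloom-3b,use_accelerate=True",
    tasks=["squad2", "logiqa", "headqa_en", "truthfulqa_mc",
           "webqs", "triviaqa", "headqa_es"],
    num_fewshot=0,
    batch_size="auto",
    device="cuda:0",
    no_cache=True,
    limit=None,
    bootstrap_iters=100000,
    description_dict={},
)

# Write the results/versions/config dictionary in the same layout as above.
with open("bloom-3b_question_answering_0-shot.json", "w") as f:
    json.dump(results, f, indent=4)
```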
results/bloom/bloom-3b/bloom-3b_reading_comprehension_0-shot.json deleted 100644 → 0

{
  "results": {
    "race": {"acc": 0.3521531100478469, "acc_stderr": 0.014782629897202266},
    "coqa": {"f1": 0.6149953283852732, "f1_stderr": 0.017671101287646335, "em": 0.4606666666666665, "em_stderr": 0.020185214533701167},
    "drop": {"em": 0.0194001677852349, "em_stderr": 0.0014124994962717904, "f1": 0.08879089765100667, "f1_stderr": 0.002002230574295575}
  },
  "versions": {"race": 1, "coqa": 1, "drop": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-3b,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-3b/bloom-3b_xcopa_0-shot.json deleted 100644 → 0

{
  "results": {
    "xcopa_qu": {"acc": 0.506, "acc_stderr": 0.022381462412439324},
    "xcopa_ta": {"acc": 0.58, "acc_stderr": 0.02209471322976178},
    "xcopa_it": {"acc": 0.516, "acc_stderr": 0.0223716109825804},
    "xcopa_vi": {"acc": 0.688, "acc_stderr": 0.02074059653648807},
    "xcopa_id": {"acc": 0.692, "acc_stderr": 0.0206670329874661},
    "xcopa_sw": {"acc": 0.514, "acc_stderr": 0.02237429816635319},
    "xcopa_et": {"acc": 0.492, "acc_stderr": 0.022380208834928028},
    "xcopa_tr": {"acc": 0.534, "acc_stderr": 0.02233126442325838},
    "xcopa_th": {"acc": 0.526, "acc_stderr": 0.02235279165091416},
    "xcopa_ht": {"acc": 0.502, "acc_stderr": 0.022382894986483524},
    "xcopa_zh": {"acc": 0.62, "acc_stderr": 0.021728881438701702}
  },
  "versions": {"xcopa_qu": 0, "xcopa_ta": 0, "xcopa_it": 0, "xcopa_vi": 0, "xcopa_id": 0, "xcopa_sw": 0, "xcopa_et": 0, "xcopa_tr": 0, "xcopa_th": 0, "xcopa_ht": 0, "xcopa_zh": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-3b", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-3b/bloom-3b_xnli_0-shot.json deleted 100644 → 0

{
  "results": {
    "xnli_ar": {"acc": 0.3343313373253493, "acc_stderr": 0.006665643509474758},
    "xnli_bg": {"acc": 0.3790419161676647, "acc_stderr": 0.006854870366766951},
    "xnli_de": {"acc": 0.4039920159680639, "acc_stderr": 0.00693325085793088},
    "xnli_el": {"acc": 0.3321357285429142, "acc_stderr": 0.006654667909419873},
    "xnli_en": {"acc": 0.5341317365269461, "acc_stderr": 0.007048232775587604},
    "xnli_es": {"acc": 0.4908183632734531, "acc_stderr": 0.007063521183707481},
    "xnli_fr": {"acc": 0.49181636726546907, "acc_stderr": 0.007063766092285604},
    "xnli_hi": {"acc": 0.4554890219560878, "acc_stderr": 0.007036663146398998},
    "xnli_ru": {"acc": 0.41397205588822356, "acc_stderr": 0.006959357713092714},
    "xnli_sw": {"acc": 0.3582834331337325, "acc_stderr": 0.006775002711732832},
    "xnli_th": {"acc": 0.3339321357285429, "acc_stderr": 0.006663660032909966},
    "xnli_tr": {"acc": 0.33812375249501, "acc_stderr": 0.006684219795088851},
    "xnli_ur": {"acc": 0.4, "acc_stderr": 0.006921976252528393},
    "xnli_vi": {"acc": 0.46506986027944114, "acc_stderr": 0.0070474518252208835},
    "xnli_zh": {"acc": 0.37425149700598803, "acc_stderr": 0.006837638981887555}
  },
  "versions": {"xnli_ar": 0, "xnli_bg": 0, "xnli_de": 0, "xnli_el": 0, "xnli_en": 0, "xnli_es": 0, "xnli_fr": 0, "xnli_hi": 0, "xnli_ru": 0, "xnli_sw": 0, "xnli_th": 0, "xnli_tr": 0, "xnli_ur": 0, "xnli_vi": 0, "xnli_zh": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-3b", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-3b/bloom-3b_xstory_cloze_0-shot.json deleted 100644 → 0

{
  "results": {
    "xstory_cloze_eu": {"acc": 0.5565850430178689, "acc_stderr": 0.012784462136657198},
    "xstory_cloze_en": {"acc": 0.6677696889477167, "acc_stderr": 0.01212116892354459},
    "xstory_cloze_hi": {"acc": 0.5757776307081403, "acc_stderr": 0.012718494399531051},
    "xstory_cloze_sw": {"acc": 0.5301125082726671, "acc_stderr": 0.012843769248432169},
    "xstory_cloze_id": {"acc": 0.6082064857710126, "acc_stderr": 0.012562199063960642},
    "xstory_cloze_my": {"acc": 0.46591661151555264, "acc_stderr": 0.012837195610619434},
    "xstory_cloze_ru": {"acc": 0.5069490403706155, "acc_stderr": 0.01286588257096072},
    "xstory_cloze_te": {"acc": 0.5817339510258107, "acc_stderr": 0.012694045150564688},
    "xstory_cloze_zh": {"acc": 0.6088682991396426, "acc_stderr": 0.012558411693622684},
    "xstory_cloze_ar": {"acc": 0.5658504301786896, "acc_stderr": 0.01275504628991221},
    "xstory_cloze_es": {"acc": 0.6412971542025149, "acc_stderr": 0.012342655113112371}
  },
  "versions": {"xstory_cloze_eu": 0, "xstory_cloze_en": 0, "xstory_cloze_hi": 0, "xstory_cloze_sw": 0, "xstory_cloze_id": 0, "xstory_cloze_my": 0, "xstory_cloze_ru": 0, "xstory_cloze_te": 0, "xstory_cloze_zh": 0, "xstory_cloze_ar": 0, "xstory_cloze_es": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-3b", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-3b/bloom-3b_xwinograd_0-shot.json deleted 100644 → 0

{
  "results": {
    "xwinograd_en": {"acc": 0.7909677419354839, "acc_stderr": 0.008434667753827384},
    "xwinograd_pt": {"acc": 0.7034220532319392, "acc_stderr": 0.028218035233491295},
    "xwinograd_jp": {"acc": 0.5662148070907195, "acc_stderr": 0.016011986615263228},
    "xwinograd_fr": {"acc": 0.7108433734939759, "acc_stderr": 0.050066428050419186},
    "xwinograd_zh": {"acc": 0.7361111111111112, "acc_stderr": 0.019651614961528674},
    "xwinograd_ru": {"acc": 0.5365079365079365, "acc_stderr": 0.028141315964997568}
  },
  "versions": {"xwinograd_en": 0, "xwinograd_pt": 0, "xwinograd_jp": 0, "xwinograd_fr": 0, "xwinograd_zh": 0, "xwinograd_ru": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-3b", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/README.md deleted 100644 → 0
# bloom-560m
## bloom-560m_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |22.44|± | 1.22|
| | |acc_norm|23.98|± | 1.25|
|arc_easy | 0|acc |47.35|± | 1.02|
| | |acc_norm|41.67|± | 1.01|
|boolq | 1|acc |55.14|± | 0.87|
|copa | 0|acc |61.00|± | 4.90|
|hellaswag | 0|acc |31.56|± | 0.46|
| | |acc_norm|36.56|± | 0.48|
|mc_taco | 0|em |17.42| | |
| | |f1 |31.43| | |
|openbookqa | 0|acc |17.20|± | 1.69|
| | |acc_norm|28.20|± | 2.01|
|piqa | 0|acc |64.09|± | 1.12|
| | |acc_norm|65.13|± | 1.11|
|prost | 0|acc |22.08|± | 0.30|
| | |acc_norm|32.08|± | 0.34|
|swag | 0|acc |40.35|± | 0.35|
| | |acc_norm|52.96|± | 0.35|
|winogrande | 0|acc |52.80|± | 1.40|
|wsc273 | 0|acc |66.67|± | 2.86|
## bloom-560m_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.53|± | 0.2|
## bloom-560m_lambada_openai_0-shot.json
| Task |Version|Metric| Value | |Stderr|
|--------------------|------:|------|------:|---|-----:|
|lambada_openai | 0|ppl | 28.68|± | 1.08|
| | |acc | 35.40|± | 0.67|
|lambada_openai_cloze| 0|ppl |6212.81|± |267.17|
| | |acc | 0.45|± | 0.09|
## bloom-560m_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.26|± | 0.11|
| | |f1 | 3.50|± | 0.14|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |22.51|± | 0.76|
| | |acc_norm|22.35|± | 0.76|
## bloom-560m_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.80|± | 1.12|
|pawsx_en| 0|acc |52.00|± | 1.12|
|pawsx_es| 0|acc |53.25|± | 1.12|
|pawsx_fr| 0|acc |47.95|± | 1.12|
|pawsx_ja| 0|acc |44.90|± | 1.11|
|pawsx_ko| 0|acc |51.90|± | 1.12|
|pawsx_zh| 0|acc |45.20|± | 1.11|
## bloom-560m_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |25.67|± | 0.83|
| | |acc_norm |29.58|± | 0.87|
|headqa_es | 0|acc |23.96|± | 0.82|
| | |acc_norm |27.17|± | 0.85|
|logiqa | 0|acc |22.58|± | 1.64|
| | |acc_norm |27.19|± | 1.75|
|squad2 | 1|exact | 0.43| | |
| | |f1 | 1.86| | |
| | |HasAns_exact| 0.76| | |
| | |HasAns_f1 | 3.62| | |
| | |NoAns_exact | 0.10| | |
| | |NoAns_f1 | 0.10| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 1.44|± | 0.11|
|truthfulqa_mc| 1|mc1 |24.48|± | 1.51|
| | |mc2 |42.43|± | 1.51|
|webqs | 0|acc | 0.84|± | 0.20|
## bloom-560m_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |22.71|± | 1.67|
| | |em |17.40|± | 1.62|
|drop| 1|em | 1.50|± | 0.12|
| | |f1 | 6.21|± | 0.17|
|race| 1|acc |30.24|± | 1.42|
## bloom-560m_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 49.0|± | 2.24|
|xcopa_ht| 0|acc | 50.2|± | 2.24|
|xcopa_id| 0|acc | 59.2|± | 2.20|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 55.8|± | 2.22|
|xcopa_th| 0|acc | 54.4|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 61.0|± | 2.18|
|xcopa_zh| 0|acc | 58.6|± | 2.20|
## bloom-560m_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.35|± | 0.67|
|xnli_bg| 0|acc |33.39|± | 0.67|
|xnli_de| 0|acc |34.79|± | 0.67|
|xnli_el| 0|acc |33.33|± | 0.67|
|xnli_en| 0|acc |49.50|± | 0.71|
|xnli_es| 0|acc |45.23|± | 0.70|
|xnli_fr| 0|acc |45.29|± | 0.70|
|xnli_hi| 0|acc |40.84|± | 0.69|
|xnli_ru| 0|acc |34.01|± | 0.67|
|xnli_sw| 0|acc |33.17|± | 0.67|
|xnli_th| 0|acc |33.57|± | 0.67|
|xnli_tr| 0|acc |33.43|± | 0.67|
|xnli_ur| 0|acc |37.13|± | 0.68|
|xnli_vi| 0|acc |40.52|± | 0.69|
|xnli_zh| 0|acc |33.95|± | 0.67|
## bloom-560m_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.08|± | 1.29|
|xstory_cloze_en| 0|acc |61.22|± | 1.25|
|xstory_cloze_es| 0|acc |55.86|± | 1.28|
|xstory_cloze_eu| 0|acc |53.61|± | 1.28|
|xstory_cloze_hi| 0|acc |55.00|± | 1.28|
|xstory_cloze_id| 0|acc |55.53|± | 1.28|
|xstory_cloze_my| 0|acc |47.19|± | 1.28|
|xstory_cloze_ru| 0|acc |49.17|± | 1.29|
|xstory_cloze_sw| 0|acc |49.83|± | 1.29|
|xstory_cloze_te| 0|acc |55.72|± | 1.28|
|xstory_cloze_zh| 0|acc |54.53|± | 1.28|
## bloom-560m_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |65.89|± | 0.98|
|xwinograd_fr| 0|acc |60.24|± | 5.40|
|xwinograd_jp| 0|acc |52.97|± | 1.61|
|xwinograd_pt| 0|acc |60.08|± | 3.03|
|xwinograd_ru| 0|acc |49.21|± | 2.82|
|xwinograd_zh| 0|acc |67.66|± | 2.09|
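
The tables in these per-model READMEs match the pipe-delimited summary the harness prints for a results dictionary. A minimal sketch of rebuilding one of them from a results JSON, assuming the old `evaluator.make_table` helper (an assumption, not something this commit documents):

```python
# Hedged sketch: assumes lm_eval.evaluator.make_table exists as in the
# pre-refactor harness and renders the Task/Version/Metric/Value/Stderr
# markdown tables shown in these README files.
import json

from lm_eval import evaluator

with open("results/bloom/bloom-560m/bloom-560m_xcopa_0-shot.json") as f:
    results = json.load(f)

print(evaluator.make_table(results))
```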
results/bloom/bloom-560m/bloom-560m_common_sense_reasoning_0-shot.json deleted 100644 → 0

{
  "results": {
    "hellaswag": {"acc": 0.31557458673571004, "acc_stderr": 0.004637944965914592, "acc_norm": 0.3655646285600478, "acc_norm_stderr": 0.00480603903900897},
    "prost": {"acc": 0.22080486763450044, "acc_stderr": 0.0030304044027250577, "acc_norm": 0.3207728437233134, "acc_norm_stderr": 0.003410197007857463},
    "piqa": {"acc": 0.6409140369967355, "acc_stderr": 0.011192949073844103, "acc_norm": 0.6512513601741022, "acc_norm_stderr": 0.011119263056159595},
    "arc_easy": {"acc": 0.4734848484848485, "acc_stderr": 0.010245347015573713, "acc_norm": 0.4166666666666667, "acc_norm_stderr": 0.01011628297778124},
    "winogrande": {"acc": 0.5280189423835833, "acc_stderr": 0.014030404213405784},
    "mc_taco": {"em": 0.17417417417417416, "f1": 0.31427590778450365},
    "openbookqa": {"acc": 0.172, "acc_stderr": 0.01689386887634748, "acc_norm": 0.282, "acc_norm_stderr": 0.020143572847290795},
    "copa": {"acc": 0.61, "acc_stderr": 0.04902071300001975},
    "boolq": {"acc": 0.5513761467889908, "acc_stderr": 0.008698767182005265},
    "swag": {"acc": 0.40347895631310604, "acc_stderr": 0.003468598652499914, "acc_norm": 0.5296411076676997, "acc_norm_stderr": 0.003528874749486556},
    "arc_challenge": {"acc": 0.22440273037542663, "acc_stderr": 0.012191404938603838, "acc_norm": 0.23976109215017063, "acc_norm_stderr": 0.012476304127453947},
    "wsc273": {"acc": 0.6666666666666666, "acc_stderr": 0.028583097523751506}
  },
  "versions": {"hellaswag": 0, "prost": 0, "piqa": 0, "arc_easy": 0, "winogrande": 0, "mc_taco": 0, "openbookqa": 0, "copa": 0, "boolq": 1, "swag": 0, "arc_challenge": 0, "wsc273": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_gsm8k_8-shot.json deleted 100644 → 0

{
  "results": {
    "gsm8k": {"acc": 0.00530705079605762, "acc_stderr": 0.002001305720948044}
  },
  "versions": {"gsm8k": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True", "num_fewshot": 8, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_lambada_openai_0-shot.json deleted 100644 → 0

{
  "results": {
    "lambada_openai_cloze": {"ppl": 6212.811566448059, "ppl_stderr": 267.17480839849884, "acc": 0.00446341936735882, "acc_stderr": 0.0009286980441682362},
    "lambada_openai": {"ppl": 28.67792043529687, "ppl_stderr": 1.0838905590384336, "acc": 0.35396856200271687, "acc_stderr": 0.006662254900290991}
  },
  "versions": {"lambada_openai_cloze": 0, "lambada_openai": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_mathematical_reasoning_few_shot_5-shot.json deleted 100644 → 0

{
  "results": {
    "math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_geometry": {"acc": 0.0, "acc_stderr": 0.0},
    "math_prealgebra": {"acc": 0.002296211251435132, "acc_stderr": 0.001622733136934621},
    "drop": {"em": 0.012583892617449664, "em_stderr": 0.0011415560941551478, "f1": 0.034977978187919485, "f1_stderr": 0.0013870691301022255},
    "math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_num_theory": {"acc": 0.001851851851851852, "acc_stderr": 0.0018518518518518465},
    "math_precalc": {"acc": 0.0, "acc_stderr": 0.0},
    "math_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "mathqa": {"acc": 0.22512562814070353, "acc_stderr": 0.00764590166234271, "acc_norm": 0.2234505862646566, "acc_norm_stderr": 0.00762563278617748}
  },
  "versions": {"math_intermediate_algebra": 1, "math_geometry": 1, "math_prealgebra": 1, "drop": 1, "mathqa": 0, "math_counting_and_prob": 1, "gsm8k": 0, "math_num_theory": 1, "math_precalc": 1, "math_algebra": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True", "num_fewshot": 5, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_pawsx_0-shot.json deleted 100644 → 0

{
  "results": {
    "pawsx_zh": {"acc": 0.452, "acc_stderr": 0.011131484850525779},
    "pawsx_de": {"acc": 0.528, "acc_stderr": 0.01116558709462154},
    "pawsx_ja": {"acc": 0.449, "acc_stderr": 0.011124809242874427},
    "pawsx_en": {"acc": 0.52, "acc_stderr": 0.011174185930778313},
    "pawsx_fr": {"acc": 0.4795, "acc_stderr": 0.011173732641806813},
    "pawsx_es": {"acc": 0.5325, "acc_stderr": 0.011159486640120933},
    "pawsx_ko": {"acc": 0.519, "acc_stderr": 0.011175058879956058}
  },
  "versions": {"pawsx_zh": 0, "pawsx_de": 0, "pawsx_ja": 0, "pawsx_en": 0, "pawsx_fr": 0, "pawsx_es": 0, "pawsx_ko": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_question_answering_0-shot.json deleted 100644 → 0

{
  "results": {
    "logiqa": {"acc": 0.22580645161290322, "acc_stderr": 0.01639971378844507, "acc_norm": 0.271889400921659, "acc_norm_stderr": 0.01745171600943683},
    "webqs": {"acc": 0.008366141732283465, "acc_stderr": 0.002021079144496907},
    "triviaqa": {"acc": 0.014408202952355696, "acc_stderr": 0.001120427476389321},
    "truthfulqa_mc": {"mc1": 0.24479804161566707, "mc1_stderr": 0.01505186948671501, "mc2": 0.42431454034377986, "mc2_stderr": 0.015119647586282134},
    "headqa_en": {"acc": 0.2567469000729395, "acc_stderr": 0.008343849291869198, "acc_norm": 0.29576951130561635, "acc_norm_stderr": 0.008717251898361426},
    "headqa_es": {"acc": 0.23960612691466082, "acc_stderr": 0.008152930613263032, "acc_norm": 0.27169948942377825, "acc_norm_stderr": 0.00849660053640109},
    "squad2": {"exact": 0.4295460288048513, "f1": 1.8591225589833205, "HasAns_exact": 0.7591093117408907, "HasAns_f1": 3.622362034886802, "NoAns_exact": 0.10092514718250631, "NoAns_f1": 0.10092514718250631, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081}
  },
  "versions": {"logiqa": 0, "webqs": 0, "triviaqa": 1, "truthfulqa_mc": 1, "headqa_en": 0, "headqa_es": 0, "squad2": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_reading_comprehension_0-shot.json deleted 100644 → 0

{
  "results": {
    "drop": {"em": 0.014995805369127516, "em_stderr": 0.0012446395261788805, "f1": 0.0621245805369127, "f1_stderr": 0.001730980795797461},
    "coqa": {"f1": 0.22712736568843772, "f1_stderr": 0.01673094848597647, "em": 0.174, "em_stderr": 0.016190705499013296},
    "race": {"acc": 0.30239234449760766, "acc_stderr": 0.014214800395178313}
  },
  "versions": {"drop": 1, "race": 1, "coqa": 1},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True", "num_fewshot": 0, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_xcopa_0-shot.json deleted 100644 → 0

{
  "results": {
    "xcopa_id": {"acc": 0.592, "acc_stderr": 0.02200091089387719},
    "xcopa_ht": {"acc": 0.502, "acc_stderr": 0.022382894986483524},
    "xcopa_qu": {"acc": 0.502, "acc_stderr": 0.02238289498648353},
    "xcopa_et": {"acc": 0.49, "acc_stderr": 0.022378596989230785},
    "xcopa_th": {"acc": 0.544, "acc_stderr": 0.022296238348407056},
    "xcopa_tr": {"acc": 0.53, "acc_stderr": 0.02234274819250285},
    "xcopa_it": {"acc": 0.508, "acc_stderr": 0.02238020883492804},
    "xcopa_ta": {"acc": 0.558, "acc_stderr": 0.02223197069632112},
    "xcopa_sw": {"acc": 0.516, "acc_stderr": 0.022371610982580396},
    "xcopa_vi": {"acc": 0.61, "acc_stderr": 0.021834685869369208},
    "xcopa_zh": {"acc": 0.586, "acc_stderr": 0.022049497969827865}
  },
  "versions": {"xcopa_id": 0, "xcopa_ht": 0, "xcopa_qu": 0, "xcopa_et": 0, "xcopa_th": 0, "xcopa_tr": 0, "xcopa_it": 0, "xcopa_ta": 0, "xcopa_sw": 0, "xcopa_vi": 0, "xcopa_zh": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_xnli_0-shot.json deleted 100644 → 0

{
  "results": {
    "xnli_sw": {"acc": 0.3317365269461078, "acc_stderr": 0.006652654857813421},
    "xnli_th": {"acc": 0.33572854291417165, "acc_stderr": 0.0066725434859242665},
    "xnli_ur": {"acc": 0.3712574850299401, "acc_stderr": 0.0068265064875345964},
    "xnli_bg": {"acc": 0.3339321357285429, "acc_stderr": 0.00666366003290998},
    "xnli_tr": {"acc": 0.3343313373253493, "acc_stderr": 0.006665643509474755},
    "xnli_zh": {"acc": 0.3395209580838323, "acc_stderr": 0.006690942515072474},
    "xnli_ar": {"acc": 0.3335329341317365, "acc_stderr": 0.006661671189931638},
    "xnli_el": {"acc": 0.3333333333333333, "acc_stderr": 0.006660674754535592},
    "xnli_hi": {"acc": 0.40838323353293415, "acc_stderr": 0.006945102706766183},
    "xnli_fr": {"acc": 0.4528942115768463, "acc_stderr": 0.007033289986695001},
    "xnli_es": {"acc": 0.45229540918163674, "acc_stderr": 0.007032484191375647},
    "xnli_vi": {"acc": 0.405189620758483, "acc_stderr": 0.006936540228025353},
    "xnli_en": {"acc": 0.49500998003992014, "acc_stderr": 0.007064360593648105},
    "xnli_de": {"acc": 0.34790419161676644, "acc_stderr": 0.006729921818907755},
    "xnli_ru": {"acc": 0.3401197604790419, "acc_stderr": 0.006693803790492355}
  },
  "versions": {"xnli_sw": 0, "xnli_th": 0, "xnli_ur": 0, "xnli_bg": 0, "xnli_tr": 0, "xnli_zh": 0, "xnli_ar": 0, "xnli_el": 0, "xnli_hi": 0, "xnli_fr": 0, "xnli_es": 0, "xnli_vi": 0, "xnli_en": 0, "xnli_de": 0, "xnli_ru": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_xstory_cloze_0-shot.json deleted 100644 → 0

{
  "results": {
    "xstory_cloze_es": {"acc": 0.5585704831237591, "acc_stderr": 0.012778538985880637},
    "xstory_cloze_hi": {"acc": 0.5499669093315684, "acc_stderr": 0.01280271359821983},
    "xstory_cloze_eu": {"acc": 0.5360688285903376, "acc_stderr": 0.012833602406620015},
    "xstory_cloze_ar": {"acc": 0.5208471211118465, "acc_stderr": 0.012855936282881267},
    "xstory_cloze_zh": {"acc": 0.5453342157511581, "acc_stderr": 0.012814127367359424},
    "xstory_cloze_te": {"acc": 0.557246856386499, "acc_stderr": 0.012782510750319236},
    "xstory_cloze_sw": {"acc": 0.4983454665784249, "acc_stderr": 0.012867054869163334},
    "xstory_cloze_ru": {"acc": 0.49172733289212445, "acc_stderr": 0.012865364020375405},
    "xstory_cloze_my": {"acc": 0.47187293183322304, "acc_stderr": 0.012846749995797694},
    "xstory_cloze_en": {"acc": 0.6121773659827928, "acc_stderr": 0.012539110696551456},
    "xstory_cloze_id": {"acc": 0.5552614162806089, "acc_stderr": 0.01278829597020778}
  },
  "versions": {"xstory_cloze_es": 0, "xstory_cloze_hi": 0, "xstory_cloze_eu": 0, "xstory_cloze_ar": 0, "xstory_cloze_zh": 0, "xstory_cloze_te": 0, "xstory_cloze_sw": 0, "xstory_cloze_ru": 0, "xstory_cloze_my": 0, "xstory_cloze_en": 0, "xstory_cloze_id": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-560m/bloom-560m_xwinograd_0-shot.json deleted 100644 → 0

{
  "results": {
    "xwinograd_en": {"acc": 0.6589247311827957, "acc_stderr": 0.009833881195698878},
    "xwinograd_pt": {"acc": 0.6007604562737643, "acc_stderr": 0.03025636835693898},
    "xwinograd_ru": {"acc": 0.49206349206349204, "acc_stderr": 0.028213077547815057},
    "xwinograd_fr": {"acc": 0.6024096385542169, "acc_stderr": 0.054045178247868114},
    "xwinograd_jp": {"acc": 0.529718456725756, "acc_stderr": 0.01612570703179889},
    "xwinograd_zh": {"acc": 0.6765873015873016, "acc_stderr": 0.020857221952855685}
  },
  "versions": {"xwinograd_en": 0, "xwinograd_pt": 0, "xwinograd_ru": 0, "xwinograd_fr": 0, "xwinograd_jp": 0, "xwinograd_zh": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-560m", "num_fewshot": 0, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}
results/bloom/bloom-7b1/README.md deleted 100644 → 0
# bloom-7b1
## bloom-7b1_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|52.11|± | 3.63|
|bigbench_date_understanding | 0|multiple_choice_grade|36.59|± | 2.51|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|26.36|± | 2.75|
|bigbench_dyck_languages | 0|multiple_choice_grade|14.40|± | 1.11|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.06|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|20.06|± | 2.12|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|48.62|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|26.00|± | 1.96|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|19.14|± | 1.49|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|37.00|± | 2.79|
|bigbench_movie_recommendation | 0|multiple_choice_grade|26.40|± | 1.97|
|bigbench_navigate | 0|multiple_choice_grade|49.90|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|24.85|± | 0.97|
|bigbench_ruin_names | 0|multiple_choice_grade|34.38|± | 2.25|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.14|± | 1.25|
|bigbench_snarks | 0|multiple_choice_grade|49.72|± | 3.73|
|bigbench_sports_understanding | 0|multiple_choice_grade|50.30|± | 1.59|
|bigbench_temporal_sequences | 0|multiple_choice_grade|24.80|± | 1.37|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|18.40|± | 1.10|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.00|± | 0.83|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|37.00|± | 2.79|
## bloom-7b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |30.38|± | 1.34|
| | |acc_norm|33.53|± | 1.38|
|arc_easy | 0|acc |64.94|± | 0.98|
| | |acc_norm|57.32|± | 1.01|
|boolq | 1|acc |62.87|± | 0.85|
|copa | 0|acc |72.00|± | 4.51|
|hellaswag | 0|acc |46.24|± | 0.50|
| | |acc_norm|59.68|± | 0.49|
|mc_taco | 0|em |13.59| | |
| | |f1 |50.53| | |
|openbookqa | 0|acc |25.20|± | 1.94|
| | |acc_norm|35.80|± | 2.15|
|piqa | 0|acc |72.74|± | 1.04|
| | |acc_norm|73.67|± | 1.03|
|prost | 0|acc |26.18|± | 0.32|
| | |acc_norm|30.57|± | 0.34|
|swag | 0|acc |50.25|± | 0.35|
| | |acc_norm|68.26|± | 0.33|
|winogrande | 0|acc |64.33|± | 1.35|
|wsc273 | 0|acc |81.32|± | 2.36|
## bloom-7b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.9|± | 0.38|
## bloom-7b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 2.51|± | 0.16|
| | |f1 | 5.09|± | 0.18|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |26.57|± | 0.81|
| | |acc_norm|26.53|± | 0.81|
## bloom-7b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.85|± | 1.12|
|pawsx_en| 0|acc |61.30|± | 1.09|
|pawsx_es| 0|acc |59.35|± | 1.10|
|pawsx_fr| 0|acc |50.90|± | 1.12|
|pawsx_ja| 0|acc |45.45|± | 1.11|
|pawsx_ko| 0|acc |45.10|± | 1.11|
|pawsx_zh| 0|acc |47.35|± | 1.12|
## bloom-7b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |31.18|± | 0.88|
| | |acc_norm |35.56|± | 0.91|
|headqa_es | 0|acc |29.54|± | 0.87|
| | |acc_norm |34.32|± | 0.91|
|logiqa | 0|acc |20.28|± | 1.58|
| | |acc_norm |28.11|± | 1.76|
|squad2 | 1|exact | 7.82| | |
| | |f1 |12.64| | |
| | |HasAns_exact|14.84| | |
| | |HasAns_f1 |24.51| | |
| | |NoAns_exact | 0.81| | |
| | |NoAns_f1 | 0.81| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 5.52|± | 0.21|
|truthfulqa_mc| 1|mc1 |22.40|± | 1.46|
| | |mc2 |38.90|± | 1.40|
|webqs | 0|acc | 2.26|± | 0.33|
## bloom-7b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |68.83|± | 1.63|
| | |em |53.87|± | 2.00|
|drop| 1|em | 2.57|± | 0.16|
| | |f1 | 9.85|± | 0.21|
|race| 1|acc |36.56|± | 1.49|
## bloom-7b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.2|± | 2.24|
|xcopa_ht| 0|acc | 50.8|± | 2.24|
|xcopa_id| 0|acc | 69.8|± | 2.06|
|xcopa_it| 0|acc | 52.8|± | 2.23|
|xcopa_qu| 0|acc | 50.8|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 59.2|± | 2.20|
|xcopa_th| 0|acc | 55.4|± | 2.23|
|xcopa_tr| 0|acc | 51.2|± | 2.24|
|xcopa_vi| 0|acc | 70.8|± | 2.04|
|xcopa_zh| 0|acc | 65.2|± | 2.13|
## bloom-7b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.83|± | 0.67|
|xnli_bg| 0|acc |39.70|± | 0.69|
|xnli_de| 0|acc |39.86|± | 0.69|
|xnli_el| 0|acc |35.75|± | 0.68|
|xnli_en| 0|acc |53.91|± | 0.70|
|xnli_es| 0|acc |48.70|± | 0.71|
|xnli_fr| 0|acc |49.68|± | 0.71|
|xnli_hi| 0|acc |46.51|± | 0.70|
|xnli_ru| 0|acc |43.05|± | 0.70|
|xnli_sw| 0|acc |37.92|± | 0.69|
|xnli_th| 0|acc |34.99|± | 0.67|
|xnli_tr| 0|acc |35.09|± | 0.67|
|xnli_ur| 0|acc |42.10|± | 0.70|
|xnli_vi| 0|acc |47.05|± | 0.71|
|xnli_zh| 0|acc |35.43|± | 0.68|
## bloom-7b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |58.57|± | 1.27|
|xstory_cloze_en| 0|acc |70.75|± | 1.17|
|xstory_cloze_es| 0|acc |66.12|± | 1.22|
|xstory_cloze_eu| 0|acc |57.18|± | 1.27|
|xstory_cloze_hi| 0|acc |60.56|± | 1.26|
|xstory_cloze_id| 0|acc |64.46|± | 1.23|
|xstory_cloze_my| 0|acc |48.97|± | 1.29|
|xstory_cloze_ru| 0|acc |52.75|± | 1.28|
|xstory_cloze_sw| 0|acc |53.94|± | 1.28|
|xstory_cloze_te| 0|acc |57.45|± | 1.27|
|xstory_cloze_zh| 0|acc |61.88|± | 1.25|
## bloom-7b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |82.15|± | 0.79|
|xwinograd_fr| 0|acc |71.08|± | 5.01|
|xwinograd_jp| 0|acc |58.50|± | 1.59|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |56.83|± | 2.80|
|xwinograd_zh| 0|acc |74.40|± | 1.95|
results/bloom/bloom-7b1/bloom-7b1_bbh_3-shot.json deleted 100644 → 0

{
  "results": {
    "bigbench_disambiguation_qa": {"multiple_choice_grade": 0.26356589147286824, "multiple_choice_grade_stderr": 0.027481788262218698},
    "bigbench_logical_deduction_three_objects": {"multiple_choice_grade": 0.37, "multiple_choice_grade_stderr": 0.027921294063982},
    "bigbench_causal_judgement": {"multiple_choice_grade": 0.5210526315789473, "multiple_choice_grade_stderr": 0.03633739504773335},
    "bigbench_date_understanding": {"multiple_choice_grade": 0.36585365853658536, "multiple_choice_grade_stderr": 0.025108717905729792},
    "bigbench_navigate": {"multiple_choice_grade": 0.499, "multiple_choice_grade_stderr": 0.015819268290576817},
    "bigbench_salient_translation_error_detection": {"multiple_choice_grade": 0.19138276553106212, "multiple_choice_grade_stderr": 0.012458774650265594},
    "bigbench_temporal_sequences": {"multiple_choice_grade": 0.248, "multiple_choice_grade_stderr": 0.013663187134877651},
    "bigbench_tracking_shuffled_objects_seven_objects": {"multiple_choice_grade": 0.14, "multiple_choice_grade_stderr": 0.00829694743648913},
    "bigbench_ruin_names": {"multiple_choice_grade": 0.34375, "multiple_choice_grade_stderr": 0.02246478414865448},
    "bigbench_reasoning_about_colored_objects": {"multiple_choice_grade": 0.2485, "multiple_choice_grade_stderr": 0.009665432493822852},
    "bigbench_dyck_languages": {"multiple_choice_grade": 0.144, "multiple_choice_grade_stderr": 0.01110798754893915},
    "bigbench_logical_deduction_five_objects": {"multiple_choice_grade": 0.26, "multiple_choice_grade_stderr": 0.019635965529725512},
    "bigbench_sports_understanding": {"multiple_choice_grade": 0.5030425963488844, "multiple_choice_grade_stderr": 0.015931029729145698},
    "bigbench_tracking_shuffled_objects_three_objects": {"multiple_choice_grade": 0.37, "multiple_choice_grade_stderr": 0.027921294063982},
    "bigbench_geometric_shapes": {"multiple_choice_grade": 0.20055710306406685, "multiple_choice_grade_stderr": 0.021162707757982353, "exact_str_match": 0.0, "exact_str_match_stderr": 0.0},
    "bigbench_hyperbaton": {"multiple_choice_grade": 0.48618, "multiple_choice_grade_stderr": 0.0022352360227943418},
    "bigbench_logical_deduction_seven_objects": {"multiple_choice_grade": 0.19142857142857142, "multiple_choice_grade_stderr": 0.014880721436998012},
    "bigbench_snarks": {"multiple_choice_grade": 0.4972375690607735, "multiple_choice_grade_stderr": 0.037267230837657574},
    "bigbench_formal_fallacies_syllogisms_negation": {"multiple_choice_grade": 0.5005633802816901, "multiple_choice_grade_stderr": 0.004196051878850066},
    "bigbench_tracking_shuffled_objects_five_objects": {"multiple_choice_grade": 0.184, "multiple_choice_grade_stderr": 0.010964094540602657},
    "bigbench_movie_recommendation": {"multiple_choice_grade": 0.264, "multiple_choice_grade_stderr": 0.019732885585922087}
  },
  "versions": {"bigbench_disambiguation_qa": 0, "bigbench_logical_deduction_three_objects": 0, "bigbench_causal_judgement": 0, "bigbench_date_understanding": 0, "bigbench_navigate": 0, "bigbench_salient_translation_error_detection": 0, "bigbench_temporal_sequences": 0, "bigbench_tracking_shuffled_objects_seven_objects": 0, "bigbench_ruin_names": 0, "bigbench_reasoning_about_colored_objects": 0, "bigbench_dyck_languages": 0, "bigbench_logical_deduction_five_objects": 0, "bigbench_sports_understanding": 0, "bigbench_tracking_shuffled_objects_three_objects": 0, "bigbench_geometric_shapes": 0, "bigbench_hyperbaton": 0, "bigbench_logical_deduction_seven_objects": 0, "bigbench_snarks": 0, "bigbench_formal_fallacies_syllogisms_negation": 0, "bigbench_tracking_shuffled_objects_five_objects": 0, "bigbench_movie_recommendation": 0},
  "config": {"model": "hf-causal-experimental", "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True", "num_fewshot": 3, "batch_size": "auto", "device": "cuda", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {}}
}