lm-evaluation-harness · commit e53eb332 (unverified)
Authored May 19, 2023 by Stella Biderman; committed by GitHub on May 19, 2023
Merge pull request #477 from juletx/results
Add results of various models in json and md format
Parents: d1327193, 92a50856
Changes: 189 files
Showing 20 changed files with 1926 additions and 0 deletions (+1926, −0):
- results/bloom/bloom-560m/bloom-560m_reading_comprehension_0-shot.json (+36, −0)
- results/bloom/bloom-560m/bloom-560m_xcopa_0-shot.json (+72, −0)
- results/bloom/bloom-560m/bloom-560m_xnli_0-shot.json (+92, −0)
- results/bloom/bloom-560m/bloom-560m_xstory_cloze_0-shot.json (+72, −0)
- results/bloom/bloom-560m/bloom-560m_xwinograd_0-shot.json (+47, −0)
- results/bloom/bloom-7b1/README.md (+173, −0)
- results/bloom/bloom-7b1/bloom-7b1_bbh_3-shot.json (+124, −0)
- results/bloom/bloom-7b1/bloom-7b1_common_sense_reasoning_0-shot.json (+91, −0)
- results/bloom/bloom-7b1/bloom-7b1_gsm8k_8-shot.json (+22, −0)
- results/bloom/bloom-7b1/bloom-7b1_mathematical_reasoning_few_shot_5-shot.json (+71, −0)
- results/bloom/bloom-7b1/bloom-7b1_pawsx_0-shot.json (+52, −0)
- results/bloom/bloom-7b1/bloom-7b1_question_answering_0-shot.json (+66, −0)
- results/bloom/bloom-7b1/bloom-7b1_reading_comprehension_0-shot.json (+36, −0)
- results/bloom/bloom-7b1/bloom-7b1_xcopa_0-shot.json (+72, −0)
- results/bloom/bloom-7b1/bloom-7b1_xnli_0-shot.json (+92, −0)
- results/bloom/bloom-7b1/bloom-7b1_xstory_cloze_0-shot.json (+72, −0)
- results/bloom/bloom-7b1/bloom-7b1_xwinograd_0-shot.json (+47, −0)
- results/llama/llama-13B/README.md (+498, −0)
- results/llama/llama-13B/llama-13B_arithmetic_5-shot.json (+67, −0)
- results/llama/llama-13B/llama-13B_bbh_3-shot.json (+124, −0)
results/bloom/bloom-560m/bloom-560m_reading_comprehension_0-shot.json (new file, mode 100644)

{
  "results": {
    "drop": {"em": 0.014995805369127516, "em_stderr": 0.0012446395261788805, "f1": 0.0621245805369127, "f1_stderr": 0.001730980795797461},
    "coqa": {"f1": 0.22712736568843772, "f1_stderr": 0.01673094848597647, "em": 0.174, "em_stderr": 0.016190705499013296},
    "race": {"acc": 0.30239234449760766, "acc_stderr": 0.014214800395178313}
  },
  "versions": {"drop": 1, "race": 1, "coqa": 1},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
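The "config" block in each of these files records how the harness was invoked. As a point of reference only (not part of this commit), the sketch below shows how such a block maps onto a programmatic run, assuming the lm_eval.evaluator.simple_evaluate API of the harness at the time of this commit; the task list is an assumption inferred from the file name.

```python
# Hypothetical reproduction sketch: not the authors' script, just an illustration
# of how the "config" keys above correspond to simple_evaluate keyword arguments.
import json
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    model_args="pretrained=bigscience/bloom-560m,use_accelerate=True",
    tasks=["drop", "coqa", "race"],   # assumed "reading comprehension" grouping
    num_fewshot=0,
    batch_size="auto",
    device="cuda:0",
    no_cache=True,
    limit=None,
    bootstrap_iters=100000,
    description_dict={},
)

# The returned dictionary has the same top-level keys ("results", "versions",
# "config") as the committed JSON files.
with open("bloom-560m_reading_comprehension_0-shot.json", "w") as f:
    json.dump(results, f, indent=2)
```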
results/bloom/bloom-560m/bloom-560m_xcopa_0-shot.json (new file, mode 100644)

{
  "results": {
    "xcopa_id": {"acc": 0.592, "acc_stderr": 0.02200091089387719},
    "xcopa_ht": {"acc": 0.502, "acc_stderr": 0.022382894986483524},
    "xcopa_qu": {"acc": 0.502, "acc_stderr": 0.02238289498648353},
    "xcopa_et": {"acc": 0.49, "acc_stderr": 0.022378596989230785},
    "xcopa_th": {"acc": 0.544, "acc_stderr": 0.022296238348407056},
    "xcopa_tr": {"acc": 0.53, "acc_stderr": 0.02234274819250285},
    "xcopa_it": {"acc": 0.508, "acc_stderr": 0.02238020883492804},
    "xcopa_ta": {"acc": 0.558, "acc_stderr": 0.02223197069632112},
    "xcopa_sw": {"acc": 0.516, "acc_stderr": 0.022371610982580396},
    "xcopa_vi": {"acc": 0.61, "acc_stderr": 0.021834685869369208},
    "xcopa_zh": {"acc": 0.586, "acc_stderr": 0.022049497969827865}
  },
  "versions": {"xcopa_id": 0, "xcopa_ht": 0, "xcopa_qu": 0, "xcopa_et": 0, "xcopa_th": 0, "xcopa_tr": 0, "xcopa_it": 0, "xcopa_ta": 0, "xcopa_sw": 0, "xcopa_vi": 0, "xcopa_zh": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-560m",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-560m/bloom-560m_xnli_0-shot.json (new file, mode 100644)

{
  "results": {
    "xnli_sw": {"acc": 0.3317365269461078, "acc_stderr": 0.006652654857813421},
    "xnli_th": {"acc": 0.33572854291417165, "acc_stderr": 0.0066725434859242665},
    "xnli_ur": {"acc": 0.3712574850299401, "acc_stderr": 0.0068265064875345964},
    "xnli_bg": {"acc": 0.3339321357285429, "acc_stderr": 0.00666366003290998},
    "xnli_tr": {"acc": 0.3343313373253493, "acc_stderr": 0.006665643509474755},
    "xnli_zh": {"acc": 0.3395209580838323, "acc_stderr": 0.006690942515072474},
    "xnli_ar": {"acc": 0.3335329341317365, "acc_stderr": 0.006661671189931638},
    "xnli_el": {"acc": 0.3333333333333333, "acc_stderr": 0.006660674754535592},
    "xnli_hi": {"acc": 0.40838323353293415, "acc_stderr": 0.006945102706766183},
    "xnli_fr": {"acc": 0.4528942115768463, "acc_stderr": 0.007033289986695001},
    "xnli_es": {"acc": 0.45229540918163674, "acc_stderr": 0.007032484191375647},
    "xnli_vi": {"acc": 0.405189620758483, "acc_stderr": 0.006936540228025353},
    "xnli_en": {"acc": 0.49500998003992014, "acc_stderr": 0.007064360593648105},
    "xnli_de": {"acc": 0.34790419161676644, "acc_stderr": 0.006729921818907755},
    "xnli_ru": {"acc": 0.3401197604790419, "acc_stderr": 0.006693803790492355}
  },
  "versions": {"xnli_sw": 0, "xnli_th": 0, "xnli_ur": 0, "xnli_bg": 0, "xnli_tr": 0, "xnli_zh": 0, "xnli_ar": 0, "xnli_el": 0, "xnli_hi": 0, "xnli_fr": 0, "xnli_es": 0, "xnli_vi": 0, "xnli_en": 0, "xnli_de": 0, "xnli_ru": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-560m",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-560m/bloom-560m_xstory_cloze_0-shot.json (new file, mode 100644)

{
  "results": {
    "xstory_cloze_es": {"acc": 0.5585704831237591, "acc_stderr": 0.012778538985880637},
    "xstory_cloze_hi": {"acc": 0.5499669093315684, "acc_stderr": 0.01280271359821983},
    "xstory_cloze_eu": {"acc": 0.5360688285903376, "acc_stderr": 0.012833602406620015},
    "xstory_cloze_ar": {"acc": 0.5208471211118465, "acc_stderr": 0.012855936282881267},
    "xstory_cloze_zh": {"acc": 0.5453342157511581, "acc_stderr": 0.012814127367359424},
    "xstory_cloze_te": {"acc": 0.557246856386499, "acc_stderr": 0.012782510750319236},
    "xstory_cloze_sw": {"acc": 0.4983454665784249, "acc_stderr": 0.012867054869163334},
    "xstory_cloze_ru": {"acc": 0.49172733289212445, "acc_stderr": 0.012865364020375405},
    "xstory_cloze_my": {"acc": 0.47187293183322304, "acc_stderr": 0.012846749995797694},
    "xstory_cloze_en": {"acc": 0.6121773659827928, "acc_stderr": 0.012539110696551456},
    "xstory_cloze_id": {"acc": 0.5552614162806089, "acc_stderr": 0.01278829597020778}
  },
  "versions": {"xstory_cloze_es": 0, "xstory_cloze_hi": 0, "xstory_cloze_eu": 0, "xstory_cloze_ar": 0, "xstory_cloze_zh": 0, "xstory_cloze_te": 0, "xstory_cloze_sw": 0, "xstory_cloze_ru": 0, "xstory_cloze_my": 0, "xstory_cloze_en": 0, "xstory_cloze_id": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-560m",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-560m/bloom-560m_xwinograd_0-shot.json (new file, mode 100644)

{
  "results": {
    "xwinograd_en": {"acc": 0.6589247311827957, "acc_stderr": 0.009833881195698878},
    "xwinograd_pt": {"acc": 0.6007604562737643, "acc_stderr": 0.03025636835693898},
    "xwinograd_ru": {"acc": 0.49206349206349204, "acc_stderr": 0.028213077547815057},
    "xwinograd_fr": {"acc": 0.6024096385542169, "acc_stderr": 0.054045178247868114},
    "xwinograd_jp": {"acc": 0.529718456725756, "acc_stderr": 0.01612570703179889},
    "xwinograd_zh": {"acc": 0.6765873015873016, "acc_stderr": 0.020857221952855685}
  },
  "versions": {"xwinograd_en": 0, "xwinograd_pt": 0, "xwinograd_ru": 0, "xwinograd_fr": 0, "xwinograd_jp": 0, "xwinograd_zh": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-560m",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/README.md (new file, mode 100644)
# bloom-7b1
## bloom-7b1_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|52.11|± | 3.63|
|bigbench_date_understanding | 0|multiple_choice_grade|36.59|± | 2.51|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|26.36|± | 2.75|
|bigbench_dyck_languages | 0|multiple_choice_grade|14.40|± | 1.11|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.06|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|20.06|± | 2.12|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|48.62|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|26.00|± | 1.96|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|19.14|± | 1.49|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|37.00|± | 2.79|
|bigbench_movie_recommendation | 0|multiple_choice_grade|26.40|± | 1.97|
|bigbench_navigate | 0|multiple_choice_grade|49.90|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|24.85|± | 0.97|
|bigbench_ruin_names | 0|multiple_choice_grade|34.38|± | 2.25|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.14|± | 1.25|
|bigbench_snarks | 0|multiple_choice_grade|49.72|± | 3.73|
|bigbench_sports_understanding | 0|multiple_choice_grade|50.30|± | 1.59|
|bigbench_temporal_sequences | 0|multiple_choice_grade|24.80|± | 1.37|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|18.40|± | 1.10|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.00|± | 0.83|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|37.00|± | 2.79|
## bloom-7b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |30.38|± | 1.34|
| | |acc_norm|33.53|± | 1.38|
|arc_easy | 0|acc |64.94|± | 0.98|
| | |acc_norm|57.32|± | 1.01|
|boolq | 1|acc |62.87|± | 0.85|
|copa | 0|acc |72.00|± | 4.51|
|hellaswag | 0|acc |46.24|± | 0.50|
| | |acc_norm|59.68|± | 0.49|
|mc_taco | 0|em |13.59| | |
| | |f1 |50.53| | |
|openbookqa | 0|acc |25.20|± | 1.94|
| | |acc_norm|35.80|± | 2.15|
|piqa | 0|acc |72.74|± | 1.04|
| | |acc_norm|73.67|± | 1.03|
|prost | 0|acc |26.18|± | 0.32|
| | |acc_norm|30.57|± | 0.34|
|swag | 0|acc |50.25|± | 0.35|
| | |acc_norm|68.26|± | 0.33|
|winogrande | 0|acc |64.33|± | 1.35|
|wsc273 | 0|acc |81.32|± | 2.36|
## bloom-7b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.9|± | 0.38|
## bloom-7b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 2.51|± | 0.16|
| | |f1 | 5.09|± | 0.18|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |26.57|± | 0.81|
| | |acc_norm|26.53|± | 0.81|
## bloom-7b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.85|± | 1.12|
|pawsx_en| 0|acc |61.30|± | 1.09|
|pawsx_es| 0|acc |59.35|± | 1.10|
|pawsx_fr| 0|acc |50.90|± | 1.12|
|pawsx_ja| 0|acc |45.45|± | 1.11|
|pawsx_ko| 0|acc |45.10|± | 1.11|
|pawsx_zh| 0|acc |47.35|± | 1.12|
## bloom-7b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |31.18|± | 0.88|
| | |acc_norm |35.56|± | 0.91|
|headqa_es | 0|acc |29.54|± | 0.87|
| | |acc_norm |34.32|± | 0.91|
|logiqa | 0|acc |20.28|± | 1.58|
| | |acc_norm |28.11|± | 1.76|
|squad2 | 1|exact | 7.82| | |
| | |f1 |12.64| | |
| | |HasAns_exact|14.84| | |
| | |HasAns_f1 |24.51| | |
| | |NoAns_exact | 0.81| | |
| | |NoAns_f1 | 0.81| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 5.52|± | 0.21|
|truthfulqa_mc| 1|mc1 |22.40|± | 1.46|
| | |mc2 |38.90|± | 1.40|
|webqs | 0|acc | 2.26|± | 0.33|
## bloom-7b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |68.83|± | 1.63|
| | |em |53.87|± | 2.00|
|drop| 1|em | 2.57|± | 0.16|
| | |f1 | 9.85|± | 0.21|
|race| 1|acc |36.56|± | 1.49|
## bloom-7b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.2|± | 2.24|
|xcopa_ht| 0|acc | 50.8|± | 2.24|
|xcopa_id| 0|acc | 69.8|± | 2.06|
|xcopa_it| 0|acc | 52.8|± | 2.23|
|xcopa_qu| 0|acc | 50.8|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 59.2|± | 2.20|
|xcopa_th| 0|acc | 55.4|± | 2.23|
|xcopa_tr| 0|acc | 51.2|± | 2.24|
|xcopa_vi| 0|acc | 70.8|± | 2.04|
|xcopa_zh| 0|acc | 65.2|± | 2.13|
## bloom-7b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.83|± | 0.67|
|xnli_bg| 0|acc |39.70|± | 0.69|
|xnli_de| 0|acc |39.86|± | 0.69|
|xnli_el| 0|acc |35.75|± | 0.68|
|xnli_en| 0|acc |53.91|± | 0.70|
|xnli_es| 0|acc |48.70|± | 0.71|
|xnli_fr| 0|acc |49.68|± | 0.71|
|xnli_hi| 0|acc |46.51|± | 0.70|
|xnli_ru| 0|acc |43.05|± | 0.70|
|xnli_sw| 0|acc |37.92|± | 0.69|
|xnli_th| 0|acc |34.99|± | 0.67|
|xnli_tr| 0|acc |35.09|± | 0.67|
|xnli_ur| 0|acc |42.10|± | 0.70|
|xnli_vi| 0|acc |47.05|± | 0.71|
|xnli_zh| 0|acc |35.43|± | 0.68|
## bloom-7b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |58.57|± | 1.27|
|xstory_cloze_en| 0|acc |70.75|± | 1.17|
|xstory_cloze_es| 0|acc |66.12|± | 1.22|
|xstory_cloze_eu| 0|acc |57.18|± | 1.27|
|xstory_cloze_hi| 0|acc |60.56|± | 1.26|
|xstory_cloze_id| 0|acc |64.46|± | 1.23|
|xstory_cloze_my| 0|acc |48.97|± | 1.29|
|xstory_cloze_ru| 0|acc |52.75|± | 1.28|
|xstory_cloze_sw| 0|acc |53.94|± | 1.28|
|xstory_cloze_te| 0|acc |57.45|± | 1.27|
|xstory_cloze_zh| 0|acc |61.88|± | 1.25|
## bloom-7b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |82.15|± | 0.79|
|xwinograd_fr| 0|acc |71.08|± | 5.01|
|xwinograd_jp| 0|acc |58.50|± | 1.59|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |56.83|± | 2.80|
|xwinograd_zh| 0|acc |74.40|± | 1.95|
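The tables in this README restate the per-task numbers from the accompanying *.json files, scaled to percentages with their standard errors alongside. As an illustration only (the PR may well have used the harness's own table writer), the sketch below rebuilds such a table from a results file; note that squad2 metrics are already on a 0–100 scale in the JSON and are an exception to the ×100 scaling.

```python
# Illustrative conversion from a harness results JSON to a Markdown table in the
# style of this README. Not the script used in this commit.
import json

def results_to_markdown(path: str) -> str:
    with open(path) as f:
        data = json.load(f)
    rows = ["| Task |Version|Metric|Value| |Stderr|",
            "|------|------:|------|----:|---|-----:|"]
    for task, metrics in sorted(data["results"].items()):
        version = data["versions"][task]
        first = True
        for name, value in metrics.items():
            if name.endswith("_stderr"):
                continue  # stderr values are paired with their metric below
            stderr = metrics.get(name + "_stderr")
            rows.append("|{}|{}|{}|{:.2f}|{}|{}|".format(
                task if first else " ",
                version if first else " ",
                name,
                100 * value,
                "±" if stderr is not None else " ",
                "{:.2f}".format(100 * stderr) if stderr is not None else " "))
            first = False
    return "\n".join(rows)

print(results_to_markdown("bloom-7b1_xnli_0-shot.json"))
```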
results/bloom/bloom-7b1/bloom-7b1_bbh_3-shot.json (new file, mode 100644)

{
  "results": {
    "bigbench_disambiguation_qa": {"multiple_choice_grade": 0.26356589147286824, "multiple_choice_grade_stderr": 0.027481788262218698},
    "bigbench_logical_deduction_three_objects": {"multiple_choice_grade": 0.37, "multiple_choice_grade_stderr": 0.027921294063982},
    "bigbench_causal_judgement": {"multiple_choice_grade": 0.5210526315789473, "multiple_choice_grade_stderr": 0.03633739504773335},
    "bigbench_date_understanding": {"multiple_choice_grade": 0.36585365853658536, "multiple_choice_grade_stderr": 0.025108717905729792},
    "bigbench_navigate": {"multiple_choice_grade": 0.499, "multiple_choice_grade_stderr": 0.015819268290576817},
    "bigbench_salient_translation_error_detection": {"multiple_choice_grade": 0.19138276553106212, "multiple_choice_grade_stderr": 0.012458774650265594},
    "bigbench_temporal_sequences": {"multiple_choice_grade": 0.248, "multiple_choice_grade_stderr": 0.013663187134877651},
    "bigbench_tracking_shuffled_objects_seven_objects": {"multiple_choice_grade": 0.14, "multiple_choice_grade_stderr": 0.00829694743648913},
    "bigbench_ruin_names": {"multiple_choice_grade": 0.34375, "multiple_choice_grade_stderr": 0.02246478414865448},
    "bigbench_reasoning_about_colored_objects": {"multiple_choice_grade": 0.2485, "multiple_choice_grade_stderr": 0.009665432493822852},
    "bigbench_dyck_languages": {"multiple_choice_grade": 0.144, "multiple_choice_grade_stderr": 0.01110798754893915},
    "bigbench_logical_deduction_five_objects": {"multiple_choice_grade": 0.26, "multiple_choice_grade_stderr": 0.019635965529725512},
    "bigbench_sports_understanding": {"multiple_choice_grade": 0.5030425963488844, "multiple_choice_grade_stderr": 0.015931029729145698},
    "bigbench_tracking_shuffled_objects_three_objects": {"multiple_choice_grade": 0.37, "multiple_choice_grade_stderr": 0.027921294063982},
    "bigbench_geometric_shapes": {"multiple_choice_grade": 0.20055710306406685, "multiple_choice_grade_stderr": 0.021162707757982353, "exact_str_match": 0.0, "exact_str_match_stderr": 0.0},
    "bigbench_hyperbaton": {"multiple_choice_grade": 0.48618, "multiple_choice_grade_stderr": 0.0022352360227943418},
    "bigbench_logical_deduction_seven_objects": {"multiple_choice_grade": 0.19142857142857142, "multiple_choice_grade_stderr": 0.014880721436998012},
    "bigbench_snarks": {"multiple_choice_grade": 0.4972375690607735, "multiple_choice_grade_stderr": 0.037267230837657574},
    "bigbench_formal_fallacies_syllogisms_negation": {"multiple_choice_grade": 0.5005633802816901, "multiple_choice_grade_stderr": 0.004196051878850066},
    "bigbench_tracking_shuffled_objects_five_objects": {"multiple_choice_grade": 0.184, "multiple_choice_grade_stderr": 0.010964094540602657},
    "bigbench_movie_recommendation": {"multiple_choice_grade": 0.264, "multiple_choice_grade_stderr": 0.019732885585922087}
  },
  "versions": {"bigbench_disambiguation_qa": 0, "bigbench_logical_deduction_three_objects": 0, "bigbench_causal_judgement": 0, "bigbench_date_understanding": 0, "bigbench_navigate": 0, "bigbench_salient_translation_error_detection": 0, "bigbench_temporal_sequences": 0, "bigbench_tracking_shuffled_objects_seven_objects": 0, "bigbench_ruin_names": 0, "bigbench_reasoning_about_colored_objects": 0, "bigbench_dyck_languages": 0, "bigbench_logical_deduction_five_objects": 0, "bigbench_sports_understanding": 0, "bigbench_tracking_shuffled_objects_three_objects": 0, "bigbench_geometric_shapes": 0, "bigbench_hyperbaton": 0, "bigbench_logical_deduction_seven_objects": 0, "bigbench_snarks": 0, "bigbench_formal_fallacies_syllogisms_negation": 0, "bigbench_tracking_shuffled_objects_five_objects": 0, "bigbench_movie_recommendation": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
    "num_fewshot": 3,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_common_sense_reasoning_0-shot.json (new file, mode 100644)

{
  "results": {
    "copa": {"acc": 0.72, "acc_stderr": 0.04512608598542127},
    "winogrande": {"acc": 0.6432517758484609, "acc_stderr": 0.013463393958028726},
    "piqa": {"acc": 0.7274211099020674, "acc_stderr": 0.010389256803296021, "acc_norm": 0.7366702937976061, "acc_norm_stderr": 0.010276185322196764},
    "arc_challenge": {"acc": 0.3037542662116041, "acc_stderr": 0.013438909184778757, "acc_norm": 0.33532423208191126, "acc_norm_stderr": 0.013796182947785564},
    "arc_easy": {"acc": 0.6494107744107744, "acc_stderr": 0.009791003829831557, "acc_norm": 0.5732323232323232, "acc_norm_stderr": 0.010149141043955626},
    "boolq": {"acc": 0.6287461773700306, "acc_stderr": 0.008450174658715903},
    "wsc273": {"acc": 0.8131868131868132, "acc_stderr": 0.023632761722644544},
    "openbookqa": {"acc": 0.252, "acc_stderr": 0.019435727282249536, "acc_norm": 0.358, "acc_norm_stderr": 0.021461434862859122},
    "prost": {"acc": 0.26184884713919726, "acc_stderr": 0.003211967450351038, "acc_norm": 0.30572160546541416, "acc_norm_stderr": 0.003365914208405272},
    "mc_taco": {"em": 0.13588588588588588, "f1": 0.5052611696967991},
    "hellaswag": {"acc": 0.4623580959968134, "acc_stderr": 0.0049756211474061025, "acc_norm": 0.5967934674367655, "acc_norm_stderr": 0.0048953903414456264},
    "swag": {"acc": 0.5024992502249325, "acc_stderr": 0.0035350478846161142, "acc_norm": 0.6825952214335699, "acc_norm_stderr": 0.0032909332559412758}
  },
  "versions": {"copa": 0, "winogrande": 0, "piqa": 0, "arc_challenge": 0, "arc_easy": 0, "boolq": 1, "wsc273": 0, "openbookqa": 0, "prost": 0, "mc_taco": 0, "hellaswag": 0, "swag": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_gsm8k_8-shot.json (new file, mode 100644)

{
  "results": {
    "gsm8k": {"acc": 0.018953752843062926, "acc_stderr": 0.0037560783410314704}
  },
  "versions": {"gsm8k": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
    "num_fewshot": 8,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
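As a quick sanity check on how "acc" and "acc_stderr" relate, and assuming the standard GSM8K test split of 1,319 problems (an assumption, not stated in this commit), the reported accuracy corresponds to 25 correctly solved problems and the reported standard error matches the usual sample standard error of a proportion.

```python
# Sanity check of the gsm8k numbers above, assuming n = 1319 test problems
# (the standard GSM8K test split) and stderr = sqrt(p*(1-p)/(n-1)).
from math import sqrt

n = 1319
correct = 25                          # 25/1319 reproduces the reported "acc"
p = correct / n
print(p)                              # ~0.018953752843, matches "acc"
print(sqrt(p * (1 - p) / (n - 1)))    # ~0.003756, matches "acc_stderr"
```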
results/bloom/bloom-7b1/bloom-7b1_mathematical_reasoning_few_shot_5-shot.json (new file, mode 100644)

{
  "results": {
    "mathqa": {"acc": 0.26566164154103855, "acc_stderr": 0.008085616216226046, "acc_norm": 0.26532663316582916, "acc_norm_stderr": 0.008082359462649721},
    "math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0},
    "drop": {"em": 0.02506291946308725, "em_stderr": 0.0016008246934367681, "f1": 0.05092911073825512, "f1_stderr": 0.0017766603696206904},
    "math_precalc": {"acc": 0.0, "acc_stderr": 0.0},
    "math_geometry": {"acc": 0.0, "acc_stderr": 0.0},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0},
    "math_num_theory": {"acc": 0.0, "acc_stderr": 0.0},
    "math_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {"mathqa": 0, "math_prealgebra": 1, "drop": 1, "math_precalc": 1, "math_geometry": 1, "gsm8k": 0, "math_counting_and_prob": 1, "math_num_theory": 1, "math_algebra": 1, "math_intermediate_algebra": 1},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
    "num_fewshot": 5,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_pawsx_0-shot.json (new file, mode 100644)

{
  "results": {
    "pawsx_zh": {"acc": 0.4735, "acc_stderr": 0.011167418260963935},
    "pawsx_de": {"acc": 0.5285, "acc_stderr": 0.011164954236428803},
    "pawsx_en": {"acc": 0.613, "acc_stderr": 0.010893798117218195},
    "pawsx_ko": {"acc": 0.451, "acc_stderr": 0.01112930504188632},
    "pawsx_fr": {"acc": 0.509, "acc_stderr": 0.011181324206260283},
    "pawsx_es": {"acc": 0.5935, "acc_stderr": 0.010985864536294245},
    "pawsx_ja": {"acc": 0.4545, "acc_stderr": 0.01113673598700373}
  },
  "versions": {"pawsx_zh": 0, "pawsx_de": 0, "pawsx_en": 0, "pawsx_ko": 0, "pawsx_fr": 0, "pawsx_es": 0, "pawsx_ja": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_question_answering_0-shot.json (new file, mode 100644)

{
  "results": {
    "webqs": {"acc": 0.022637795275590553, "acc_stderr": 0.0033005770276179373},
    "headqa_en": {"acc": 0.31181619256017507, "acc_stderr": 0.008848039223989218, "acc_norm": 0.35557986870897157, "acc_norm_stderr": 0.009143208309033068},
    "squad2": {"exact": 7.816053230017687, "f1": 12.640343596838946, "HasAns_exact": 14.84480431848853, "HasAns_f1": 24.507219892926596, "NoAns_exact": 0.8074011774600505, "NoAns_f1": 0.8074011774600505, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081},
    "truthfulqa_mc": {"mc1": 0.22399020807833536, "mc1_stderr": 0.014594964329474202, "mc2": 0.38898018897492265, "mc2_stderr": 0.014014176010735629},
    "triviaqa": {"acc": 0.055246176964554056, "acc_stderr": 0.0021480319949071717},
    "headqa_es": {"acc": 0.29540481400437635, "acc_stderr": 0.008714131357853837, "acc_norm": 0.34318016046681254, "acc_norm_stderr": 0.009068379779817705},
    "logiqa": {"acc": 0.20276497695852536, "acc_stderr": 0.015770046635584564, "acc_norm": 0.28110599078341014, "acc_norm_stderr": 0.017632374626460005}
  },
  "versions": {"webqs": 0, "headqa_en": 0, "squad2": 1, "truthfulqa_mc": 1, "triviaqa": 1, "headqa_es": 0, "logiqa": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_reading_comprehension_0-shot.json (new file, mode 100644)

{
  "results": {
    "coqa": {"f1": 0.6882976860781418, "f1_stderr": 0.016322647326969194, "em": 0.5386666666666665, "em_stderr": 0.01995482540089559},
    "drop": {"em": 0.02569211409395973, "em_stderr": 0.0016202710827118362, "f1": 0.09853712248322138, "f1_stderr": 0.0021424507419289577},
    "race": {"acc": 0.36555023923444974, "acc_stderr": 0.014904654247182307}
  },
  "versions": {"coqa": 1, "race": 1, "drop": 1},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_xcopa_0-shot.json (new file, mode 100644)

{
  "results": {
    "xcopa_ta": {"acc": 0.592, "acc_stderr": 0.02200091089387719},
    "xcopa_id": {"acc": 0.698, "acc_stderr": 0.02055326917420918},
    "xcopa_tr": {"acc": 0.512, "acc_stderr": 0.02237662679792717},
    "xcopa_th": {"acc": 0.554, "acc_stderr": 0.022252153078595897},
    "xcopa_ht": {"acc": 0.508, "acc_stderr": 0.022380208834928035},
    "xcopa_qu": {"acc": 0.508, "acc_stderr": 0.022380208834928035},
    "xcopa_sw": {"acc": 0.516, "acc_stderr": 0.0223716109825804},
    "xcopa_it": {"acc": 0.528, "acc_stderr": 0.022347949832668086},
    "xcopa_zh": {"acc": 0.652, "acc_stderr": 0.021323728632807498},
    "xcopa_et": {"acc": 0.482, "acc_stderr": 0.02236856511738799},
    "xcopa_vi": {"acc": 0.708, "acc_stderr": 0.02035437548053008}
  },
  "versions": {"xcopa_ta": 0, "xcopa_id": 0, "xcopa_tr": 0, "xcopa_th": 0, "xcopa_ht": 0, "xcopa_qu": 0, "xcopa_sw": 0, "xcopa_it": 0, "xcopa_zh": 0, "xcopa_et": 0, "xcopa_vi": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_xnli_0-shot.json (new file, mode 100644)

{
  "results": {
    "xnli_ar": {"acc": 0.3383233532934132, "acc_stderr": 0.006685184166851475},
    "xnli_bg": {"acc": 0.3970059880239521, "acc_stderr": 0.006913206227417221},
    "xnli_de": {"acc": 0.39860279441117763, "acc_stderr": 0.0069179171504068675},
    "xnli_el": {"acc": 0.35748502994011977, "acc_stderr": 0.006771658365506411},
    "xnli_en": {"acc": 0.539121756487026, "acc_stderr": 0.007043053978003474},
    "xnli_es": {"acc": 0.4870259481037924, "acc_stderr": 0.007062333678954121},
    "xnli_fr": {"acc": 0.49680638722554893, "acc_stderr": 0.00706456831954508},
    "xnli_hi": {"acc": 0.46506986027944114, "acc_stderr": 0.007047451825220883},
    "xnli_ru": {"acc": 0.4305389221556886, "acc_stderr": 0.006996208063220089},
    "xnli_sw": {"acc": 0.37924151696606784, "acc_stderr": 0.006855572898852684},
    "xnli_th": {"acc": 0.3499001996007984, "acc_stderr": 0.00673886250800537},
    "xnli_tr": {"acc": 0.3508982035928144, "acc_stderr": 0.00674328417575373},
    "xnli_ur": {"acc": 0.42095808383233535, "acc_stderr": 0.006975878576227378},
    "xnli_vi": {"acc": 0.47045908183632734, "acc_stderr": 0.007052371383794704},
    "xnli_zh": {"acc": 0.35429141716566864, "acc_stderr": 0.006758076124936785}
  },
  "versions": {"xnli_ar": 0, "xnli_bg": 0, "xnli_de": 0, "xnli_el": 0, "xnli_en": 0, "xnli_es": 0, "xnli_fr": 0, "xnli_hi": 0, "xnli_ru": 0, "xnli_sw": 0, "xnli_th": 0, "xnli_tr": 0, "xnli_ur": 0, "xnli_vi": 0, "xnli_zh": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_xstory_cloze_0-shot.json (new file, mode 100644)

{
  "results": {
    "xstory_cloze_ru": {"acc": 0.5274652547981469, "acc_stderr": 0.012847698270388222},
    "xstory_cloze_eu": {"acc": 0.57180675049636, "acc_stderr": 0.012733742799515155},
    "xstory_cloze_en": {"acc": 0.7074784910655195, "acc_stderr": 0.011707038572975033},
    "xstory_cloze_ar": {"acc": 0.585704831237591, "acc_stderr": 0.012676689821720669},
    "xstory_cloze_es": {"acc": 0.6611515552614163, "acc_stderr": 0.012180490758739058},
    "xstory_cloze_hi": {"acc": 0.6055592322964924, "acc_stderr": 0.01257710651393614},
    "xstory_cloze_my": {"acc": 0.48974189278623426, "acc_stderr": 0.012864417047980468},
    "xstory_cloze_sw": {"acc": 0.5393778954334878, "acc_stderr": 0.012827159238891916},
    "xstory_cloze_zh": {"acc": 0.6187954996690933, "acc_stderr": 0.01249867885093408},
    "xstory_cloze_id": {"acc": 0.6446062210456651, "acc_stderr": 0.01231724793041837},
    "xstory_cloze_te": {"acc": 0.5744540039708802, "acc_stderr": 0.012723670419166324}
  },
  "versions": {"xstory_cloze_ru": 0, "xstory_cloze_eu": 0, "xstory_cloze_en": 0, "xstory_cloze_ar": 0, "xstory_cloze_es": 0, "xstory_cloze_hi": 0, "xstory_cloze_my": 0, "xstory_cloze_sw": 0, "xstory_cloze_zh": 0, "xstory_cloze_id": 0, "xstory_cloze_te": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-7b1/bloom-7b1_xwinograd_0-shot.json (new file, mode 100644)

{
  "results": {
    "xwinograd_fr": {"acc": 0.7108433734939759, "acc_stderr": 0.050066428050419186},
    "xwinograd_ru": {"acc": 0.5682539682539682, "acc_stderr": 0.027952495861671634},
    "xwinograd_en": {"acc": 0.821505376344086, "acc_stderr": 0.00794327709606643},
    "xwinograd_pt": {"acc": 0.7680608365019012, "acc_stderr": 0.026075593860304693},
    "xwinograd_jp": {"acc": 0.5849843587069864, "acc_stderr": 0.015919213413834392},
    "xwinograd_zh": {"acc": 0.7440476190476191, "acc_stderr": 0.019457899684028012}
  },
  "versions": {"xwinograd_fr": 0, "xwinograd_ru": 0, "xwinograd_en": 0, "xwinograd_pt": 0, "xwinograd_jp": 0, "xwinograd_zh": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-7b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/llama/llama-13B/README.md (new file, mode 100644)
# llama-13B
## llama-13B_arithmetic_5-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------------|------:|------|----:|---|-----:|
|arithmetic_1dc| 0|acc | 0|± | 0|
|arithmetic_2da| 0|acc | 0|± | 0|
|arithmetic_2dm| 0|acc | 0|± | 0|
|arithmetic_2ds| 0|acc | 0|± | 0|
|arithmetic_3da| 0|acc | 0|± | 0|
|arithmetic_3ds| 0|acc | 0|± | 0|
|arithmetic_4da| 0|acc | 0|± | 0|
|arithmetic_4ds| 0|acc | 0|± | 0|
|arithmetic_5da| 0|acc | 0|± | 0|
|arithmetic_5ds| 0|acc | 0|± | 0|
## llama-13B_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|49.47|± | 3.64|
|bigbench_date_understanding | 0|multiple_choice_grade|63.96|± | 2.50|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|45.74|± | 3.11|
|bigbench_dyck_languages | 0|multiple_choice_grade|20.10|± | 1.27|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|51.13|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|23.12|± | 2.23|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|50.38|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|30.00|± | 2.05|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|22.29|± | 1.57|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|41.67|± | 2.85|
|bigbench_movie_recommendation | 0|multiple_choice_grade|43.60|± | 2.22|
|bigbench_navigate | 0|multiple_choice_grade|51.70|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|37.05|± | 1.08|
|bigbench_ruin_names | 0|multiple_choice_grade|34.60|± | 2.25|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.34|± | 1.25|
|bigbench_snarks | 0|multiple_choice_grade|46.96|± | 3.72|
|bigbench_sports_understanding | 0|multiple_choice_grade|58.11|± | 1.57|
|bigbench_temporal_sequences | 0|multiple_choice_grade|28.00|± | 1.42|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|21.44|± | 1.16|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.46|± | 0.84|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|41.67|± | 2.85|
## llama-13B_blimp_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------------------------------------------------|------:|------|----:|---|-----:|
|blimp_adjunct_island | 0|acc | 33.8|± | 1.50|
|blimp_anaphor_gender_agreement | 0|acc | 57.6|± | 1.56|
|blimp_anaphor_number_agreement | 0|acc | 56.5|± | 1.57|
|blimp_animate_subject_passive | 0|acc | 65.1|± | 1.51|
|blimp_animate_subject_trans | 0|acc | 61.6|± | 1.54|
|blimp_causative | 0|acc | 35.9|± | 1.52|
|blimp_complex_NP_island | 0|acc | 30.3|± | 1.45|
|blimp_coordinate_structure_constraint_complex_left_branch| 0|acc | 34.5|± | 1.50|
|blimp_coordinate_structure_constraint_object_extraction | 0|acc | 27.9|± | 1.42|
|blimp_determiner_noun_agreement_1 | 0|acc | 34.1|± | 1.50|
|blimp_determiner_noun_agreement_2 | 0|acc | 36.1|± | 1.52|
|blimp_determiner_noun_agreement_irregular_1 | 0|acc | 35.6|± | 1.51|
|blimp_determiner_noun_agreement_irregular_2 | 0|acc | 36.9|± | 1.53|
|blimp_determiner_noun_agreement_with_adj_2 | 0|acc | 39.2|± | 1.54|
|blimp_determiner_noun_agreement_with_adj_irregular_1 | 0|acc | 34.2|± | 1.50|
|blimp_determiner_noun_agreement_with_adj_irregular_2 | 0|acc | 39.3|± | 1.55|
|blimp_determiner_noun_agreement_with_adjective_1 | 0|acc | 39.1|± | 1.54|
|blimp_distractor_agreement_relational_noun | 0|acc | 51.4|± | 1.58|
|blimp_distractor_agreement_relative_clause | 0|acc | 42.3|± | 1.56|
|blimp_drop_argument | 0|acc | 70.5|± | 1.44|
|blimp_ellipsis_n_bar_1 | 0|acc | 62.4|± | 1.53|
|blimp_ellipsis_n_bar_2 | 0|acc | 26.4|± | 1.39|
|blimp_existential_there_object_raising | 0|acc | 69.0|± | 1.46|
|blimp_existential_there_quantifiers_1 | 0|acc | 30.8|± | 1.46|
|blimp_existential_there_quantifiers_2 | 0|acc | 78.8|± | 1.29|
|blimp_existential_there_subject_raising | 0|acc | 70.1|± | 1.45|
|blimp_expletive_it_object_raising | 0|acc | 61.9|± | 1.54|
|blimp_inchoative | 0|acc | 47.4|± | 1.58|
|blimp_intransitive | 0|acc | 64.3|± | 1.52|
|blimp_irregular_past_participle_adjectives | 0|acc | 63.6|± | 1.52|
|blimp_irregular_past_participle_verbs | 0|acc | 31.4|± | 1.47|
|blimp_irregular_plural_subject_verb_agreement_1 | 0|acc | 51.8|± | 1.58|
|blimp_irregular_plural_subject_verb_agreement_2 | 0|acc | 50.4|± | 1.58|
|blimp_left_branch_island_echo_question | 0|acc | 49.0|± | 1.58|
|blimp_left_branch_island_simple_question | 0|acc | 41.1|± | 1.56|
|blimp_matrix_question_npi_licensor_present | 0|acc | 54.8|± | 1.57|
|blimp_npi_present_1 | 0|acc | 30.4|± | 1.46|
|blimp_npi_present_2 | 0|acc | 39.0|± | 1.54|
|blimp_only_npi_licensor_present | 0|acc | 73.1|± | 1.40|
|blimp_only_npi_scope | 0|acc | 27.8|± | 1.42|
|blimp_passive_1 | 0|acc | 52.9|± | 1.58|
|blimp_passive_2 | 0|acc | 52.6|± | 1.58|
|blimp_principle_A_c_command | 0|acc | 32.6|± | 1.48|
|blimp_principle_A_case_1 | 0|acc | 2.8|± | 0.52|
|blimp_principle_A_case_2 | 0|acc | 44.3|± | 1.57|
|blimp_principle_A_domain_1 | 0|acc | 32.4|± | 1.48|
|blimp_principle_A_domain_2 | 0|acc | 74.0|± | 1.39|
|blimp_principle_A_domain_3 | 0|acc | 56.3|± | 1.57|
|blimp_principle_A_reconstruction | 0|acc | 79.2|± | 1.28|
|blimp_regular_plural_subject_verb_agreement_1 | 0|acc | 56.0|± | 1.57|
|blimp_regular_plural_subject_verb_agreement_2 | 0|acc | 45.6|± | 1.58|
|blimp_sentential_negation_npi_licensor_present | 0|acc | 39.2|± | 1.54|
|blimp_sentential_negation_npi_scope | 0|acc | 63.8|± | 1.52|
|blimp_sentential_subject_island | 0|acc | 62.1|± | 1.53|
|blimp_superlative_quantifiers_1 | 0|acc | 52.2|± | 1.58|
|blimp_superlative_quantifiers_2 | 0|acc | 71.4|± | 1.43|
|blimp_tough_vs_raising_1 | 0|acc | 36.1|± | 1.52|
|blimp_tough_vs_raising_2 | 0|acc | 64.2|± | 1.52|
|blimp_transitive | 0|acc | 47.3|± | 1.58|
|blimp_wh_island | 0|acc | 50.6|± | 1.58|
|blimp_wh_questions_object_gap | 0|acc | 45.5|± | 1.58|
|blimp_wh_questions_subject_gap | 0|acc | 36.9|± | 1.53|
|blimp_wh_questions_subject_gap_long_distance | 0|acc | 40.8|± | 1.55|
|blimp_wh_vs_that_no_gap | 0|acc | 19.6|± | 1.26|
|blimp_wh_vs_that_no_gap_long_distance | 0|acc | 30.1|± | 1.45|
|blimp_wh_vs_that_with_gap | 0|acc | 84.7|± | 1.14|
|blimp_wh_vs_that_with_gap_long_distance | 0|acc | 69.2|± | 1.46|
## llama-13B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |43.94|± | 1.45|
| | |acc_norm|44.62|± | 1.45|
|arc_easy | 0|acc |74.58|± | 0.89|
| | |acc_norm|59.89|± | 1.01|
|boolq | 1|acc |68.50|± | 0.81|
|copa | 0|acc |90.00|± | 3.02|
|hellaswag | 0|acc |59.10|± | 0.49|
| | |acc_norm|76.24|± | 0.42|
|mc_taco | 0|em |10.96| | |
| | |f1 |47.53| | |
|openbookqa | 0|acc |30.60|± | 2.06|
| | |acc_norm|42.20|± | 2.21|
|piqa | 0|acc |78.84|± | 0.95|
| | |acc_norm|79.11|± | 0.95|
|prost | 0|acc |26.89|± | 0.32|
| | |acc_norm|30.52|± | 0.34|
|swag | 0|acc |56.73|± | 0.35|
| | |acc_norm|69.35|± | 0.33|
|winogrande | 0|acc |70.17|± | 1.29|
|wsc273 | 0|acc |86.08|± | 2.10|
## llama-13B_glue_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|cola | 0|mcc | 0.00|± | 0.00|
|mnli | 0|acc |43.56|± | 0.50|
|mnli_mismatched| 0|acc |45.35|± | 0.50|
|mrpc | 0|acc |68.63|± | 2.30|
| | |f1 |81.34|± | 1.62|
|qnli | 0|acc |49.95|± | 0.68|
|qqp | 0|acc |36.79|± | 0.24|
| | |f1 |53.66|± | 0.26|
|rte | 0|acc |65.34|± | 2.86|
|sst | 0|acc |65.37|± | 1.61|
|wnli | 1|acc |46.48|± | 5.96|
## llama-13B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc |13.57|± | 0.94|
## llama-13B_human_alignment_0-shot.json
| Task |Version| Metric | Value | |Stderr|
|---------------------------------------|------:|---------------------|------:|---|-----:|
|crows_pairs_english_age | 0|likelihood_difference| 771.02|± | 93.66|
| | |pct_stereotype | 56.04|± | 5.23|
|crows_pairs_english_autre | 0|likelihood_difference|1142.61|± |435.33|
| | |pct_stereotype | 36.36|± | 15.21|
|crows_pairs_english_disability | 0|likelihood_difference|1297.88|± |182.88|
| | |pct_stereotype | 35.38|± | 5.98|
|crows_pairs_english_gender | 0|likelihood_difference| 867.58|± | 65.49|
| | |pct_stereotype | 58.44|± | 2.76|
|crows_pairs_english_nationality | 0|likelihood_difference|1184.87|± | 83.43|
| | |pct_stereotype | 38.43|± | 3.32|
|crows_pairs_english_physical_appearance| 0|likelihood_difference| 752.95|± | 87.93|
| | |pct_stereotype | 47.22|± | 5.92|
|crows_pairs_english_race_color | 0|likelihood_difference| 985.84|± | 50.57|
| | |pct_stereotype | 50.20|± | 2.22|
|crows_pairs_english_religion | 0|likelihood_difference|1181.25|± |117.52|
| | |pct_stereotype | 49.55|± | 4.77|
|crows_pairs_english_sexual_orientation | 0|likelihood_difference|1072.24|± |115.61|
| | |pct_stereotype | 54.84|± | 5.19|
|crows_pairs_english_socioeconomic | 0|likelihood_difference|1122.24|± | 78.07|
| | |pct_stereotype | 50.53|± | 3.64|
|crows_pairs_french_age | 0|likelihood_difference|1310.14|± |112.01|
| | |pct_stereotype | 38.89|± | 5.17|
|crows_pairs_french_autre | 0|likelihood_difference| 994.23|± |314.84|
| | |pct_stereotype | 53.85|± | 14.39|
|crows_pairs_french_disability | 0|likelihood_difference|1732.39|± |182.40|
| | |pct_stereotype | 40.91|± | 6.10|
|crows_pairs_french_gender | 0|likelihood_difference|1079.15|± | 67.67|
| | |pct_stereotype | 51.40|± | 2.79|
|crows_pairs_french_nationality | 0|likelihood_difference|1633.10|± | 92.24|
| | |pct_stereotype | 31.62|± | 2.93|
|crows_pairs_french_physical_appearance | 0|likelihood_difference|1257.99|± |157.39|
| | |pct_stereotype | 52.78|± | 5.92|
|crows_pairs_french_race_color | 0|likelihood_difference|1192.74|± | 50.28|
| | |pct_stereotype | 35.00|± | 2.23|
|crows_pairs_french_religion | 0|likelihood_difference|1119.24|± |108.66|
| | |pct_stereotype | 59.13|± | 4.60|
|crows_pairs_french_sexual_orientation | 0|likelihood_difference|1755.49|± |118.03|
| | |pct_stereotype | 78.02|± | 4.36|
|crows_pairs_french_socioeconomic | 0|likelihood_difference|1279.15|± | 93.70|
| | |pct_stereotype | 35.71|± | 3.43|
|ethics_cm | 0|acc | 51.74|± | 0.80|
|ethics_deontology | 0|acc | 50.33|± | 0.83|
| | |em | 0.11| | |
|ethics_justice | 0|acc | 49.93|± | 0.96|
| | |em | 0.15| | |
|ethics_utilitarianism | 0|acc | 52.45|± | 0.72|
|ethics_utilitarianism_original | 0|acc | 98.07|± | 0.20|
|ethics_virtue | 0|acc | 20.32|± | 0.57|
| | |em | 0.00| | |
|toxigen | 0|acc | 42.66|± | 1.61|
| | |acc_norm | 43.19|± | 1.62|
## llama-13B_lambada_0-shot.json
| Task |Version|Metric| Value | | Stderr |
|----------------------|------:|------|---------:|---|--------:|
|lambada_openai | 0|ppl |1279051.05|± | 60995.63|
| | |acc | 0.00|± | 0.00|
|lambada_openai_cloze | 0|ppl | 204515.39|± | 9705.34|
| | |acc | 0.02|± | 0.02|
|lambada_openai_mt_de | 0|ppl |1310285.44|± | 71395.91|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_en | 0|ppl |1279051.05|± | 60995.63|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_es | 0|ppl |1980241.77|± |101614.20|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_fr | 0|ppl |2461448.49|± |128013.99|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_it | 0|ppl |4091504.35|± |218020.97|
| | |acc | 0.00|± | 0.00|
|lambada_standard | 0|ppl |1409048.00|± | 47832.88|
| | |acc | 0.00|± | 0.00|
|lambada_standard_cloze| 0|ppl |4235345.03|± |132892.57|
| | |acc | 0.00|± | 0.00|
## llama-13B_mathematical_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 3.88|± | 0.20|
| | |f1 |13.99|± | 0.25|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 1.85|± | 0.39|
|math_asdiv | 0|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 1.48|± | 0.55|
|math_geometry | 1|acc | 1.25|± | 0.51|
|math_intermediate_algebra| 1|acc | 1.22|± | 0.37|
|math_num_theory | 1|acc | 1.48|± | 0.52|
|math_prealgebra | 1|acc | 2.87|± | 0.57|
|math_precalc | 1|acc | 1.10|± | 0.45|
|mathqa | 0|acc |28.44|± | 0.83|
| | |acc_norm|28.68|± | 0.83|
## llama-13B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.71|± | 0.13|
| | |f1 | 2.45|± | 0.14|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |29.98|± | 0.84|
| | |acc_norm|30.35|± | 0.84|
## llama-13B_mmlu_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|hendrycksTest-abstract_algebra | 0|acc |32.00|± | 4.69|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-anatomy | 0|acc |42.96|± | 4.28|
| | |acc_norm|29.63|± | 3.94|
|hendrycksTest-astronomy | 0|acc |48.03|± | 4.07|
| | |acc_norm|48.03|± | 4.07|
|hendrycksTest-business_ethics | 0|acc |53.00|± | 5.02|
| | |acc_norm|44.00|± | 4.99|
|hendrycksTest-clinical_knowledge | 0|acc |46.04|± | 3.07|
| | |acc_norm|38.49|± | 2.99|
|hendrycksTest-college_biology | 0|acc |45.83|± | 4.17|
| | |acc_norm|32.64|± | 3.92|
|hendrycksTest-college_chemistry | 0|acc |31.00|± | 4.65|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-college_computer_science | 0|acc |33.00|± | 4.73|
| | |acc_norm|28.00|± | 4.51|
|hendrycksTest-college_mathematics | 0|acc |29.00|± | 4.56|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-college_medicine | 0|acc |42.77|± | 3.77|
| | |acc_norm|30.06|± | 3.50|
|hendrycksTest-college_physics | 0|acc |28.43|± | 4.49|
| | |acc_norm|35.29|± | 4.76|
|hendrycksTest-computer_security | 0|acc |57.00|± | 4.98|
| | |acc_norm|44.00|± | 4.99|
|hendrycksTest-conceptual_physics | 0|acc |42.13|± | 3.23|
| | |acc_norm|24.26|± | 2.80|
|hendrycksTest-econometrics | 0|acc |27.19|± | 4.19|
| | |acc_norm|26.32|± | 4.14|
|hendrycksTest-electrical_engineering | 0|acc |41.38|± | 4.10|
| | |acc_norm|34.48|± | 3.96|
|hendrycksTest-elementary_mathematics | 0|acc |36.77|± | 2.48|
| | |acc_norm|32.80|± | 2.42|
|hendrycksTest-formal_logic | 0|acc |32.54|± | 4.19|
| | |acc_norm|34.13|± | 4.24|
|hendrycksTest-global_facts | 0|acc |34.00|± | 4.76|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-high_school_biology | 0|acc |49.68|± | 2.84|
| | |acc_norm|36.13|± | 2.73|
|hendrycksTest-high_school_chemistry | 0|acc |31.03|± | 3.26|
| | |acc_norm|32.02|± | 3.28|
|hendrycksTest-high_school_computer_science | 0|acc |49.00|± | 5.02|
| | |acc_norm|41.00|± | 4.94|
|hendrycksTest-high_school_european_history | 0|acc |52.73|± | 3.90|
| | |acc_norm|49.70|± | 3.90|
|hendrycksTest-high_school_geography | 0|acc |57.58|± | 3.52|
| | |acc_norm|42.42|± | 3.52|
|hendrycksTest-high_school_government_and_politics| 0|acc |58.55|± | 3.56|
| | |acc_norm|38.86|± | 3.52|
|hendrycksTest-high_school_macroeconomics | 0|acc |37.69|± | 2.46|
| | |acc_norm|31.79|± | 2.36|
|hendrycksTest-high_school_mathematics | 0|acc |26.67|± | 2.70|
| | |acc_norm|31.85|± | 2.84|
|hendrycksTest-high_school_microeconomics | 0|acc |42.02|± | 3.21|
| | |acc_norm|40.76|± | 3.19|
|hendrycksTest-high_school_physics | 0|acc |27.15|± | 3.63|
| | |acc_norm|25.17|± | 3.54|
|hendrycksTest-high_school_psychology | 0|acc |60.73|± | 2.09|
| | |acc_norm|36.88|± | 2.07|
|hendrycksTest-high_school_statistics | 0|acc |38.43|± | 3.32|
| | |acc_norm|37.50|± | 3.30|
|hendrycksTest-high_school_us_history | 0|acc |52.45|± | 3.51|
| | |acc_norm|37.25|± | 3.39|
|hendrycksTest-high_school_world_history | 0|acc |49.79|± | 3.25|
| | |acc_norm|42.62|± | 3.22|
|hendrycksTest-human_aging | 0|acc |57.40|± | 3.32|
| | |acc_norm|33.63|± | 3.17|
|hendrycksTest-human_sexuality | 0|acc |54.96|± | 4.36|
| | |acc_norm|39.69|± | 4.29|
|hendrycksTest-international_law | 0|acc |56.20|± | 4.53|
| | |acc_norm|60.33|± | 4.47|
|hendrycksTest-jurisprudence | 0|acc |48.15|± | 4.83|
| | |acc_norm|50.00|± | 4.83|
|hendrycksTest-logical_fallacies | 0|acc |45.40|± | 3.91|
| | |acc_norm|36.81|± | 3.79|
|hendrycksTest-machine_learning | 0|acc |28.57|± | 4.29|
| | |acc_norm|29.46|± | 4.33|
|hendrycksTest-management | 0|acc |64.08|± | 4.75|
| | |acc_norm|41.75|± | 4.88|
|hendrycksTest-marketing | 0|acc |72.65|± | 2.92|
| | |acc_norm|61.54|± | 3.19|
|hendrycksTest-medical_genetics | 0|acc |49.00|± | 5.02|
| | |acc_norm|48.00|± | 5.02|
|hendrycksTest-miscellaneous | 0|acc |69.60|± | 1.64|
| | |acc_norm|48.53|± | 1.79|
|hendrycksTest-moral_disputes | 0|acc |44.80|± | 2.68|
| | |acc_norm|38.15|± | 2.62|
|hendrycksTest-moral_scenarios | 0|acc |28.27|± | 1.51|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |45.10|± | 2.85|
| | |acc_norm|46.73|± | 2.86|
|hendrycksTest-philosophy | 0|acc |45.98|± | 2.83|
| | |acc_norm|38.59|± | 2.76|
|hendrycksTest-prehistory | 0|acc |49.69|± | 2.78|
| | |acc_norm|34.57|± | 2.65|
|hendrycksTest-professional_accounting | 0|acc |29.79|± | 2.73|
| | |acc_norm|28.01|± | 2.68|
|hendrycksTest-professional_law | 0|acc |30.38|± | 1.17|
| | |acc_norm|30.90|± | 1.18|
|hendrycksTest-professional_medicine | 0|acc |39.34|± | 2.97|
| | |acc_norm|33.09|± | 2.86|
|hendrycksTest-professional_psychology | 0|acc |42.32|± | 2.00|
| | |acc_norm|33.01|± | 1.90|
|hendrycksTest-public_relations | 0|acc |54.55|± | 4.77|
| | |acc_norm|29.09|± | 4.35|
|hendrycksTest-security_studies | 0|acc |45.71|± | 3.19|
| | |acc_norm|37.55|± | 3.10|
|hendrycksTest-sociology | 0|acc |58.21|± | 3.49|
| | |acc_norm|45.77|± | 3.52|
|hendrycksTest-us_foreign_policy | 0|acc |68.00|± | 4.69|
| | |acc_norm|52.00|± | 5.02|
|hendrycksTest-virology | 0|acc |40.96|± | 3.83|
| | |acc_norm|30.12|± | 3.57|
|hendrycksTest-world_religions | 0|acc |74.27|± | 3.35|
| | |acc_norm|64.91|± | 3.66|
## llama-13B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.95|± | 1.12|
|pawsx_en| 0|acc |53.70|± | 1.12|
|pawsx_es| 0|acc |52.10|± | 1.12|
|pawsx_fr| 0|acc |54.50|± | 1.11|
|pawsx_ja| 0|acc |45.00|± | 1.11|
|pawsx_ko| 0|acc |47.05|± | 1.12|
|pawsx_zh| 0|acc |45.20|± | 1.11|
## llama-13B_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |34.43|± | 0.91|
| | |acc_norm |38.58|± | 0.93|
|headqa_es | 0|acc |30.56|± | 0.88|
| | |acc_norm |35.16|± | 0.91|
|logiqa | 0|acc |26.42|± | 1.73|
| | |acc_norm |32.10|± | 1.83|
|squad2 | 1|exact |16.44| | |
| | |f1 |24.06| | |
| | |HasAns_exact|21.09| | |
| | |HasAns_f1 |36.35| | |
| | |NoAns_exact |11.81| | |
| | |NoAns_f1 |11.81| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 0.00|± | 0.00|
|truthfulqa_mc| 1|mc1 |25.83|± | 1.53|
| | |mc2 |39.88|± | 1.37|
|webqs | 0|acc | 0.00|± | 0.00|
## llama-13B_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |77.04|± | 1.42|
| | |em |63.70|± | 1.85|
|drop| 1|em | 3.59|± | 0.19|
| | |f1 |13.38|± | 0.24|
|race| 1|acc |39.33|± | 1.51|
## llama-13B_superglue_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|boolq | 1|acc |68.44|± | 0.81|
|cb | 1|acc |48.21|± | 6.74|
| | |f1 |38.82| | |
|copa | 0|acc |90.00|± | 3.02|
|multirc| 1|acc | 1.57|± | 0.40|
|record | 0|f1 |92.32|± | 0.26|
| | |em |91.54|± | 0.28|
|wic | 0|acc |49.84|± | 1.98|
|wsc | 0|acc |35.58|± | 4.72|
## llama-13B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.2|± | 2.24|
|xcopa_ht| 0|acc | 52.8|± | 2.23|
|xcopa_id| 0|acc | 57.8|± | 2.21|
|xcopa_it| 0|acc | 67.2|± | 2.10|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 51.2|± | 2.24|
|xcopa_ta| 0|acc | 54.4|± | 2.23|
|xcopa_th| 0|acc | 54.6|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 53.8|± | 2.23|
|xcopa_zh| 0|acc | 58.4|± | 2.21|
## llama-13B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |34.07|± | 0.67|
|xnli_bg| 0|acc |34.21|± | 0.67|
|xnli_de| 0|acc |35.25|± | 0.68|
|xnli_el| 0|acc |34.69|± | 0.67|
|xnli_en| 0|acc |35.63|± | 0.68|
|xnli_es| 0|acc |33.49|± | 0.67|
|xnli_fr| 0|acc |33.49|± | 0.67|
|xnli_hi| 0|acc |35.59|± | 0.68|
|xnli_ru| 0|acc |33.79|± | 0.67|
|xnli_sw| 0|acc |33.15|± | 0.67|
|xnli_th| 0|acc |34.83|± | 0.67|
|xnli_tr| 0|acc |33.99|± | 0.67|
|xnli_ur| 0|acc |34.21|± | 0.67|
|xnli_vi| 0|acc |34.21|± | 0.67|
|xnli_zh| 0|acc |34.47|± | 0.67|
## llama-13B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |49.70|± | 1.29|
|xstory_cloze_en| 0|acc |77.30|± | 1.08|
|xstory_cloze_es| 0|acc |69.42|± | 1.19|
|xstory_cloze_eu| 0|acc |50.69|± | 1.29|
|xstory_cloze_hi| 0|acc |52.35|± | 1.29|
|xstory_cloze_id| 0|acc |55.26|± | 1.28|
|xstory_cloze_my| 0|acc |47.78|± | 1.29|
|xstory_cloze_ru| 0|acc |63.40|± | 1.24|
|xstory_cloze_sw| 0|acc |49.90|± | 1.29|
|xstory_cloze_te| 0|acc |53.34|± | 1.28|
|xstory_cloze_zh| 0|acc |56.45|± | 1.28|
## llama-13B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |86.75|± | 0.70|
|xwinograd_fr| 0|acc |68.67|± | 5.12|
|xwinograd_jp| 0|acc |59.85|± | 1.58|
|xwinograd_pt| 0|acc |71.48|± | 2.79|
|xwinograd_ru| 0|acc |70.79|± | 2.57|
|xwinograd_zh| 0|acc |70.04|± | 2.04|
results/llama/llama-13B/llama-13B_arithmetic_5-shot.json (new file, mode 100644)

{
  "results": {
    "arithmetic_2dm": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_5ds": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_4da": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_3da": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_4ds": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_5da": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_2da": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_1dc": {"acc": 0.0, "acc_stderr": 0.0},
    "arithmetic_2ds": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {"arithmetic_2dm": 0, "arithmetic_5ds": 0, "arithmetic_4da": 0, "arithmetic_3da": 0, "arithmetic_4ds": 0, "arithmetic_5da": 0, "arithmetic_2da": 0, "arithmetic_3ds": 0, "arithmetic_1dc": 0, "arithmetic_2ds": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 5,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/llama/llama-13B/llama-13B_bbh_3-shot.json (new file, mode 100644)

{
  "results": {
    "bigbench_sports_understanding": {"multiple_choice_grade": 0.5811359026369168, "multiple_choice_grade_stderr": 0.015720172474974117},
    "bigbench_salient_translation_error_detection": {"multiple_choice_grade": 0.1933867735470942, "multiple_choice_grade_stderr": 0.012508305339715512},
    "bigbench_date_understanding": {"multiple_choice_grade": 0.6395663956639567, "multiple_choice_grade_stderr": 0.025028311208714224},
    "bigbench_navigate": {"multiple_choice_grade": 0.517, "multiple_choice_grade_stderr": 0.015810153729833434},
    "bigbench_dyck_languages": {"multiple_choice_grade": 0.201, "multiple_choice_grade_stderr": 0.012679107214617324},
    "bigbench_movie_recommendation": {"multiple_choice_grade": 0.436, "multiple_choice_grade_stderr": 0.022198954641476802},
    "bigbench_snarks": {"multiple_choice_grade": 0.4696132596685083, "multiple_choice_grade_stderr": 0.03719891321680327},
    "bigbench_disambiguation_qa": {"multiple_choice_grade": 0.4573643410852713, "multiple_choice_grade_stderr": 0.03107554499047266},
    "bigbench_reasoning_about_colored_objects": {"multiple_choice_grade": 0.3705, "multiple_choice_grade_stderr": 0.010801537464907349},
    "bigbench_geometric_shapes": {"multiple_choice_grade": 0.23119777158774374, "multiple_choice_grade_stderr": 0.02228217728550543, "exact_str_match": 0.0, "exact_str_match_stderr": 0.0},
    "bigbench_tracking_shuffled_objects_five_objects": {"multiple_choice_grade": 0.2144, "multiple_choice_grade_stderr": 0.011612665292522431},
    "bigbench_formal_fallacies_syllogisms_negation": {"multiple_choice_grade": 0.5113380281690141, "multiple_choice_grade_stderr": 0.004194975590734721},
    "bigbench_tracking_shuffled_objects_three_objects": {"multiple_choice_grade": 0.4166666666666667, "multiple_choice_grade_stderr": 0.028511310643917567},
    "bigbench_hyperbaton": {"multiple_choice_grade": 0.5038, "multiple_choice_grade_stderr": 0.0022360257592931206},
    "bigbench_temporal_sequences": {"multiple_choice_grade": 0.28, "multiple_choice_grade_stderr": 0.014205696104091493},
    "bigbench_logical_deduction_three_objects": {"multiple_choice_grade": 0.4166666666666667, "multiple_choice_grade_stderr": 0.028511310643917567},
    "bigbench_causal_judgement": {"multiple_choice_grade": 0.49473684210526314, "multiple_choice_grade_stderr": 0.036367633377878836},
    "bigbench_tracking_shuffled_objects_seven_objects": {"multiple_choice_grade": 0.14457142857142857, "multiple_choice_grade_stderr": 0.008408881015830339},
    "bigbench_logical_deduction_seven_objects": {"multiple_choice_grade": 0.22285714285714286, "multiple_choice_grade_stderr": 0.015740739118727993},
    "bigbench_logical_deduction_five_objects": {"multiple_choice_grade": 0.3, "multiple_choice_grade_stderr": 0.020514426225628046},
    "bigbench_ruin_names": {"multiple_choice_grade": 0.34598214285714285, "multiple_choice_grade_stderr": 0.02249924183068251}
  },
  "versions": {"bigbench_sports_understanding": 0, "bigbench_salient_translation_error_detection": 0, "bigbench_date_understanding": 0, "bigbench_navigate": 0, "bigbench_dyck_languages": 0, "bigbench_movie_recommendation": 0, "bigbench_snarks": 0, "bigbench_disambiguation_qa": 0, "bigbench_reasoning_about_colored_objects": 0, "bigbench_geometric_shapes": 0, "bigbench_tracking_shuffled_objects_five_objects": 0, "bigbench_formal_fallacies_syllogisms_negation": 0, "bigbench_tracking_shuffled_objects_three_objects": 0, "bigbench_hyperbaton": 0, "bigbench_temporal_sequences": 0, "bigbench_logical_deduction_three_objects": 0, "bigbench_causal_judgement": 0, "bigbench_tracking_shuffled_objects_seven_objects": 0, "bigbench_logical_deduction_seven_objects": 0, "bigbench_logical_deduction_five_objects": 0, "bigbench_ruin_names": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
    "num_fewshot": 3,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}