Commit 5ba0c5f9 authored by Oleh Shliazhko

Merge remote-tracking branch 'upstream/master' into mmlu_fix

parents c117e787 9d06c953
# bloom-7b1
## bloom-7b1_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|52.11|± | 3.63|
|bigbench_date_understanding | 0|multiple_choice_grade|36.59|± | 2.51|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|26.36|± | 2.75|
|bigbench_dyck_languages | 0|multiple_choice_grade|14.40|± | 1.11|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.06|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|20.06|± | 2.12|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|48.62|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|26.00|± | 1.96|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|19.14|± | 1.49|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|37.00|± | 2.79|
|bigbench_movie_recommendation | 0|multiple_choice_grade|26.40|± | 1.97|
|bigbench_navigate | 0|multiple_choice_grade|49.90|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|24.85|± | 0.97|
|bigbench_ruin_names | 0|multiple_choice_grade|34.38|± | 2.25|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.14|± | 1.25|
|bigbench_snarks | 0|multiple_choice_grade|49.72|± | 3.73|
|bigbench_sports_understanding | 0|multiple_choice_grade|50.30|± | 1.59|
|bigbench_temporal_sequences | 0|multiple_choice_grade|24.80|± | 1.37|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|18.40|± | 1.10|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.00|± | 0.83|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|37.00|± | 2.79|
## bloom-7b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |30.38|± | 1.34|
| | |acc_norm|33.53|± | 1.38|
|arc_easy | 0|acc |64.94|± | 0.98|
| | |acc_norm|57.32|± | 1.01|
|boolq | 1|acc |62.87|± | 0.85|
|copa | 0|acc |72.00|± | 4.51|
|hellaswag | 0|acc |46.24|± | 0.50|
| | |acc_norm|59.68|± | 0.49|
|mc_taco | 0|em |13.59| | |
| | |f1 |50.53| | |
|openbookqa | 0|acc |25.20|± | 1.94|
| | |acc_norm|35.80|± | 2.15|
|piqa | 0|acc |72.74|± | 1.04|
| | |acc_norm|73.67|± | 1.03|
|prost | 0|acc |26.18|± | 0.32|
| | |acc_norm|30.57|± | 0.34|
|swag | 0|acc |50.25|± | 0.35|
| | |acc_norm|68.26|± | 0.33|
|winogrande | 0|acc |64.33|± | 1.35|
|wsc273 | 0|acc |81.32|± | 2.36|
## bloom-7b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.9|± | 0.38|
## bloom-7b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 2.51|± | 0.16|
| | |f1 | 5.09|± | 0.18|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |26.57|± | 0.81|
| | |acc_norm|26.53|± | 0.81|
## bloom-7b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.85|± | 1.12|
|pawsx_en| 0|acc |61.30|± | 1.09|
|pawsx_es| 0|acc |59.35|± | 1.10|
|pawsx_fr| 0|acc |50.90|± | 1.12|
|pawsx_ja| 0|acc |45.45|± | 1.11|
|pawsx_ko| 0|acc |45.10|± | 1.11|
|pawsx_zh| 0|acc |47.35|± | 1.12|
## bloom-7b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |31.18|± | 0.88|
| | |acc_norm |35.56|± | 0.91|
|headqa_es | 0|acc |29.54|± | 0.87|
| | |acc_norm |34.32|± | 0.91|
|logiqa | 0|acc |20.28|± | 1.58|
| | |acc_norm |28.11|± | 1.76|
|squad2 | 1|exact | 7.82| | |
| | |f1 |12.64| | |
| | |HasAns_exact|14.84| | |
| | |HasAns_f1 |24.51| | |
| | |NoAns_exact | 0.81| | |
| | |NoAns_f1 | 0.81| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 5.52|± | 0.21|
|truthfulqa_mc| 1|mc1 |22.40|± | 1.46|
| | |mc2 |38.90|± | 1.40|
|webqs | 0|acc | 2.26|± | 0.33|
## bloom-7b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |68.83|± | 1.63|
| | |em |53.87|± | 2.00|
|drop| 1|em | 2.57|± | 0.16|
| | |f1 | 9.85|± | 0.21|
|race| 1|acc |36.56|± | 1.49|
## bloom-7b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.2|± | 2.24|
|xcopa_ht| 0|acc | 50.8|± | 2.24|
|xcopa_id| 0|acc | 69.8|± | 2.06|
|xcopa_it| 0|acc | 52.8|± | 2.23|
|xcopa_qu| 0|acc | 50.8|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 59.2|± | 2.20|
|xcopa_th| 0|acc | 55.4|± | 2.23|
|xcopa_tr| 0|acc | 51.2|± | 2.24|
|xcopa_vi| 0|acc | 70.8|± | 2.04|
|xcopa_zh| 0|acc | 65.2|± | 2.13|
## bloom-7b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.83|± | 0.67|
|xnli_bg| 0|acc |39.70|± | 0.69|
|xnli_de| 0|acc |39.86|± | 0.69|
|xnli_el| 0|acc |35.75|± | 0.68|
|xnli_en| 0|acc |53.91|± | 0.70|
|xnli_es| 0|acc |48.70|± | 0.71|
|xnli_fr| 0|acc |49.68|± | 0.71|
|xnli_hi| 0|acc |46.51|± | 0.70|
|xnli_ru| 0|acc |43.05|± | 0.70|
|xnli_sw| 0|acc |37.92|± | 0.69|
|xnli_th| 0|acc |34.99|± | 0.67|
|xnli_tr| 0|acc |35.09|± | 0.67|
|xnli_ur| 0|acc |42.10|± | 0.70|
|xnli_vi| 0|acc |47.05|± | 0.71|
|xnli_zh| 0|acc |35.43|± | 0.68|
## bloom-7b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |58.57|± | 1.27|
|xstory_cloze_en| 0|acc |70.75|± | 1.17|
|xstory_cloze_es| 0|acc |66.12|± | 1.22|
|xstory_cloze_eu| 0|acc |57.18|± | 1.27|
|xstory_cloze_hi| 0|acc |60.56|± | 1.26|
|xstory_cloze_id| 0|acc |64.46|± | 1.23|
|xstory_cloze_my| 0|acc |48.97|± | 1.29|
|xstory_cloze_ru| 0|acc |52.75|± | 1.28|
|xstory_cloze_sw| 0|acc |53.94|± | 1.28|
|xstory_cloze_te| 0|acc |57.45|± | 1.27|
|xstory_cloze_zh| 0|acc |61.88|± | 1.25|
## bloom-7b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |82.15|± | 0.79|
|xwinograd_fr| 0|acc |71.08|± | 5.01|
|xwinograd_jp| 0|acc |58.50|± | 1.59|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |56.83|± | 2.80|
|xwinograd_zh| 0|acc |74.40|± | 1.95|
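
The tables above are rendered in the standard lm-evaluation-harness format from the per-run JSON result files, four of which are reproduced below (the bbh 3-shot, common_sense_reasoning 0-shot, gsm8k 8-shot, and pawsx 0-shot runs, in that order). As a rough sketch, assuming the EleutherAI lm-evaluation-harness is installed and one of the result files sits in the working directory, a table like the ones above can be regenerated with the harness's `make_table` helper:

```python
# Hedged sketch: rebuild one of the markdown tables above from a result file.
# Assumes the JSON layout shown below ("results", "versions", "config" keys)
# and that lm-evaluation-harness is importable as `lm_eval`.
import json

from lm_eval.evaluator import make_table

with open("bloom-7b1_bbh_3-shot.json") as f:  # any of the result files listed above
    result_dict = json.load(f)

# Produces the | Task |Version| Metric |Value| |Stderr| markdown table.
print(make_table(result_dict))
```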
```json
{
"results": {
"bigbench_disambiguation_qa": {
"multiple_choice_grade": 0.26356589147286824,
"multiple_choice_grade_stderr": 0.027481788262218698
},
"bigbench_logical_deduction_three_objects": {
"multiple_choice_grade": 0.37,
"multiple_choice_grade_stderr": 0.027921294063982
},
"bigbench_causal_judgement": {
"multiple_choice_grade": 0.5210526315789473,
"multiple_choice_grade_stderr": 0.03633739504773335
},
"bigbench_date_understanding": {
"multiple_choice_grade": 0.36585365853658536,
"multiple_choice_grade_stderr": 0.025108717905729792
},
"bigbench_navigate": {
"multiple_choice_grade": 0.499,
"multiple_choice_grade_stderr": 0.015819268290576817
},
"bigbench_salient_translation_error_detection": {
"multiple_choice_grade": 0.19138276553106212,
"multiple_choice_grade_stderr": 0.012458774650265594
},
"bigbench_temporal_sequences": {
"multiple_choice_grade": 0.248,
"multiple_choice_grade_stderr": 0.013663187134877651
},
"bigbench_tracking_shuffled_objects_seven_objects": {
"multiple_choice_grade": 0.14,
"multiple_choice_grade_stderr": 0.00829694743648913
},
"bigbench_ruin_names": {
"multiple_choice_grade": 0.34375,
"multiple_choice_grade_stderr": 0.02246478414865448
},
"bigbench_reasoning_about_colored_objects": {
"multiple_choice_grade": 0.2485,
"multiple_choice_grade_stderr": 0.009665432493822852
},
"bigbench_dyck_languages": {
"multiple_choice_grade": 0.144,
"multiple_choice_grade_stderr": 0.01110798754893915
},
"bigbench_logical_deduction_five_objects": {
"multiple_choice_grade": 0.26,
"multiple_choice_grade_stderr": 0.019635965529725512
},
"bigbench_sports_understanding": {
"multiple_choice_grade": 0.5030425963488844,
"multiple_choice_grade_stderr": 0.015931029729145698
},
"bigbench_tracking_shuffled_objects_three_objects": {
"multiple_choice_grade": 0.37,
"multiple_choice_grade_stderr": 0.027921294063982
},
"bigbench_geometric_shapes": {
"multiple_choice_grade": 0.20055710306406685,
"multiple_choice_grade_stderr": 0.021162707757982353,
"exact_str_match": 0.0,
"exact_str_match_stderr": 0.0
},
"bigbench_hyperbaton": {
"multiple_choice_grade": 0.48618,
"multiple_choice_grade_stderr": 0.0022352360227943418
},
"bigbench_logical_deduction_seven_objects": {
"multiple_choice_grade": 0.19142857142857142,
"multiple_choice_grade_stderr": 0.014880721436998012
},
"bigbench_snarks": {
"multiple_choice_grade": 0.4972375690607735,
"multiple_choice_grade_stderr": 0.037267230837657574
},
"bigbench_formal_fallacies_syllogisms_negation": {
"multiple_choice_grade": 0.5005633802816901,
"multiple_choice_grade_stderr": 0.004196051878850066
},
"bigbench_tracking_shuffled_objects_five_objects": {
"multiple_choice_grade": 0.184,
"multiple_choice_grade_stderr": 0.010964094540602657
},
"bigbench_movie_recommendation": {
"multiple_choice_grade": 0.264,
"multiple_choice_grade_stderr": 0.019732885585922087
}
},
"versions": {
"bigbench_disambiguation_qa": 0,
"bigbench_logical_deduction_three_objects": 0,
"bigbench_causal_judgement": 0,
"bigbench_date_understanding": 0,
"bigbench_navigate": 0,
"bigbench_salient_translation_error_detection": 0,
"bigbench_temporal_sequences": 0,
"bigbench_tracking_shuffled_objects_seven_objects": 0,
"bigbench_ruin_names": 0,
"bigbench_reasoning_about_colored_objects": 0,
"bigbench_dyck_languages": 0,
"bigbench_logical_deduction_five_objects": 0,
"bigbench_sports_understanding": 0,
"bigbench_tracking_shuffled_objects_three_objects": 0,
"bigbench_geometric_shapes": 0,
"bigbench_hyperbaton": 0,
"bigbench_logical_deduction_seven_objects": 0,
"bigbench_snarks": 0,
"bigbench_formal_fallacies_syllogisms_negation": 0,
"bigbench_tracking_shuffled_objects_five_objects": 0,
"bigbench_movie_recommendation": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 3,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
```
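
For reference, the "config" block above maps onto a harness invocation. A minimal sketch, assuming the hf-causal-experimental-era `simple_evaluate` API and using only a subset of the BIG-bench tasks from the table (extend `tasks` to the full list to reproduce the 3-shot run):

```python
# Hedged sketch of the 3-shot BIG-bench run whose config is shown above.
# The task list is truncated for illustration; the real run covered every
# bigbench_* task in the bbh table.
import json

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    model_args="pretrained=bigscience/bloom-7b1,use_accelerate=True",
    tasks=["bigbench_causal_judgement", "bigbench_date_understanding"],
    num_fewshot=3,
    batch_size="auto",
    device="cuda",
    no_cache=True,
    limit=None,
    bootstrap_iters=100000,
    description_dict={},
)

# Write the results in the same layout as the JSON shown in this commit.
with open("bloom-7b1_bbh_3-shot.json", "w") as f:
    json.dump(results, f, indent=2)
```

The harness CLI (`python main.py --model hf-causal-experimental ...`) produces the same kind of result file when given an `--output_path`.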
```json
{
"results": {
"copa": {
"acc": 0.72,
"acc_stderr": 0.04512608598542127
},
"winogrande": {
"acc": 0.6432517758484609,
"acc_stderr": 0.013463393958028726
},
"piqa": {
"acc": 0.7274211099020674,
"acc_stderr": 0.010389256803296021,
"acc_norm": 0.7366702937976061,
"acc_norm_stderr": 0.010276185322196764
},
"arc_challenge": {
"acc": 0.3037542662116041,
"acc_stderr": 0.013438909184778757,
"acc_norm": 0.33532423208191126,
"acc_norm_stderr": 0.013796182947785564
},
"arc_easy": {
"acc": 0.6494107744107744,
"acc_stderr": 0.009791003829831557,
"acc_norm": 0.5732323232323232,
"acc_norm_stderr": 0.010149141043955626
},
"boolq": {
"acc": 0.6287461773700306,
"acc_stderr": 0.008450174658715903
},
"wsc273": {
"acc": 0.8131868131868132,
"acc_stderr": 0.023632761722644544
},
"openbookqa": {
"acc": 0.252,
"acc_stderr": 0.019435727282249536,
"acc_norm": 0.358,
"acc_norm_stderr": 0.021461434862859122
},
"prost": {
"acc": 0.26184884713919726,
"acc_stderr": 0.003211967450351038,
"acc_norm": 0.30572160546541416,
"acc_norm_stderr": 0.003365914208405272
},
"mc_taco": {
"em": 0.13588588588588588,
"f1": 0.5052611696967991
},
"hellaswag": {
"acc": 0.4623580959968134,
"acc_stderr": 0.0049756211474061025,
"acc_norm": 0.5967934674367655,
"acc_norm_stderr": 0.0048953903414456264
},
"swag": {
"acc": 0.5024992502249325,
"acc_stderr": 0.0035350478846161142,
"acc_norm": 0.6825952214335699,
"acc_norm_stderr": 0.0032909332559412758
}
},
"versions": {
"copa": 0,
"winogrande": 0,
"piqa": 0,
"arc_challenge": 0,
"arc_easy": 0,
"boolq": 1,
"wsc273": 0,
"openbookqa": 0,
"prost": 0,
"mc_taco": 0,
"hellaswag": 0,
"swag": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
```
```json
{
"results": {
"gsm8k": {
"acc": 0.018953752843062926,
"acc_stderr": 0.0037560783410314704
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
```
```json
{
"results": {
"pawsx_zh": {
"acc": 0.4735,
"acc_stderr": 0.011167418260963935
},
"pawsx_de": {
"acc": 0.5285,
"acc_stderr": 0.011164954236428803
},
"pawsx_en": {
"acc": 0.613,
"acc_stderr": 0.010893798117218195
},
"pawsx_ko": {
"acc": 0.451,
"acc_stderr": 0.01112930504188632
},
"pawsx_fr": {
"acc": 0.509,
"acc_stderr": 0.011181324206260283
},
"pawsx_es": {
"acc": 0.5935,
"acc_stderr": 0.010985864536294245
},
"pawsx_ja": {
"acc": 0.4545,
"acc_stderr": 0.01113673598700373
}
},
"versions": {
"pawsx_zh": 0,
"pawsx_de": 0,
"pawsx_en": 0,
"pawsx_ko": 0,
"pawsx_fr": 0,
"pawsx_es": 0,
"pawsx_ja": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
```