Commit 2ac318a9 authored by Julen Etxaniz

add basic markdown tables with results

parent 21e128d8
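
These tables match the markdown layout printed by EleutherAI's lm-evaluation-harness (`Task | Version | Metric | Value | ± | Stderr`). As a minimal sketch of how such files can be regenerated — assuming the per-model `*.json` files are harness result dumps and the classic `lm_eval.evaluator.make_table` API is available; the `results/bloom-1b1` directory layout below is illustrative, not part of this commit:

```python
# Sketch: rebuild one model's markdown tables from lm-evaluation-harness
# result JSON dumps. Assumes the classic harness API (lm_eval.evaluator.make_table);
# the "results/bloom-1b1" path is a hypothetical layout.
import json
from pathlib import Path

from lm_eval import evaluator

print("# bloom-1b1")
for path in sorted(Path("results/bloom-1b1").glob("*.json")):
    results = json.loads(path.read_text())  # dict with "results" and "versions" keys
    print(f"## {path.name}")
    print(evaluator.make_table(results))    # renders the Task/Version/Metric/Value/Stderr table
```

`make_table` reads the `results` and `versions` keys of each dump to fill the Value and Version columns, which is why every table below shares the same header row.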
# bloom-1b1
## bloom-1b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.63|± | 1.24|
| | |acc_norm|25.68|± | 1.28|
|arc_easy | 0|acc |51.47|± | 1.03|
| | |acc_norm|45.45|± | 1.02|
|boolq | 1|acc |59.08|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |34.63|± | 0.47|
| | |acc_norm|41.77|± | 0.49|
|mc_taco | 0|em |14.49| | |
| | |f1 |32.43| | |
|openbookqa | 0|acc |19.60|± | 1.78|
| | |acc_norm|29.40|± | 2.04|
|piqa | 0|acc |67.14|± | 1.10|
| | |acc_norm|67.14|± | 1.10|
|prost | 0|acc |23.41|± | 0.31|
| | |acc_norm|30.50|± | 0.34|
|swag | 0|acc |43.43|± | 0.35|
| | |acc_norm|58.28|± | 0.35|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.50|± | 2.82|
## bloom-1b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.83|± | 0.25|
## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.38|± | 0.12|
| | |f1 | 4.01|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.21|± | 0.21|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.55|± | 0.78|
| | |acc_norm|23.62|± | 0.78|
## bloom-1b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |46.95|± | 1.12|
|pawsx_en| 0|acc |52.45|± | 1.12|
|pawsx_es| 0|acc |51.50|± | 1.12|
|pawsx_fr| 0|acc |46.15|± | 1.11|
|pawsx_ja| 0|acc |48.40|± | 1.12|
|pawsx_ko| 0|acc |49.90|± | 1.12|
|pawsx_zh| 0|acc |48.95|± | 1.12|
## bloom-1b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |26.44|± | 0.84|
| | |acc_norm |30.49|± | 0.88|
|headqa_es | 0|acc |24.43|± | 0.82|
| | |acc_norm |28.30|± | 0.86|
|logiqa | 0|acc |18.89|± | 1.54|
| | |acc_norm |25.65|± | 1.71|
|squad2 | 1|exact | 4.17| | |
| | |f1 | 6.60| | |
| | |HasAns_exact| 2.19| | |
| | |HasAns_f1 | 7.05| | |
| | |NoAns_exact | 6.14| | |
| | |NoAns_f1 | 6.14| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 2.68|± | 0.15|
|truthfulqa_mc| 1|mc1 |25.34|± | 1.52|
| | |mc2 |41.80|± | 1.46|
|webqs | 0|acc | 1.38|± | 0.26|
## bloom-1b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |45.57|± | 1.88|
| | |em |32.98|± | 1.95|
|drop| 1|em | 3.31|± | 0.18|
| | |f1 | 8.63|± | 0.22|
|race| 1|acc |32.63|± | 1.45|
## bloom-1b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 50.6|± | 2.24|
|xcopa_ht| 0|acc | 53.0|± | 2.23|
|xcopa_id| 0|acc | 64.8|± | 2.14|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 51.2|± | 2.24|
|xcopa_sw| 0|acc | 54.4|± | 2.23|
|xcopa_ta| 0|acc | 57.0|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 62.4|± | 2.17|
|xcopa_zh| 0|acc | 59.4|± | 2.20|
## bloom-1b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.93|± | 0.67|
|xnli_bg| 0|acc |34.13|± | 0.67|
|xnli_de| 0|acc |39.64|± | 0.69|
|xnli_el| 0|acc |34.03|± | 0.67|
|xnli_en| 0|acc |51.48|± | 0.71|
|xnli_es| 0|acc |47.98|± | 0.71|
|xnli_fr| 0|acc |47.15|± | 0.71|
|xnli_hi| 0|acc |42.32|± | 0.70|
|xnli_ru| 0|acc |40.46|± | 0.69|
|xnli_sw| 0|acc |35.29|± | 0.68|
|xnli_th| 0|acc |33.75|± | 0.67|
|xnli_tr| 0|acc |34.79|± | 0.67|
|xnli_ur| 0|acc |37.33|± | 0.68|
|xnli_vi| 0|acc |44.45|± | 0.70|
|xnli_zh| 0|acc |36.23|± | 0.68|
## bloom-1b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.88|± | 1.28|
|xstory_cloze_en| 0|acc |62.54|± | 1.25|
|xstory_cloze_es| 0|acc |58.31|± | 1.27|
|xstory_cloze_eu| 0|acc |54.33|± | 1.28|
|xstory_cloze_hi| 0|acc |55.53|± | 1.28|
|xstory_cloze_id| 0|acc |57.91|± | 1.27|
|xstory_cloze_my| 0|acc |46.19|± | 1.28|
|xstory_cloze_ru| 0|acc |48.25|± | 1.29|
|xstory_cloze_sw| 0|acc |50.56|± | 1.29|
|xstory_cloze_te| 0|acc |56.39|± | 1.28|
|xstory_cloze_zh| 0|acc |58.04|± | 1.27|
## bloom-1b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |69.98|± | 0.95|
|xwinograd_fr| 0|acc |66.27|± | 5.22|
|xwinograd_jp| 0|acc |52.87|± | 1.61|
|xwinograd_pt| 0|acc |63.12|± | 2.98|
|xwinograd_ru| 0|acc |54.29|± | 2.81|
|xwinograd_zh| 0|acc |69.25|± | 2.06|
# bloom-1b7
## bloom-1b7_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.55|± | 1.24|
| | |acc_norm|26.79|± | 1.29|
|arc_easy | 0|acc |56.31|± | 1.02|
| | |acc_norm|48.11|± | 1.03|
|boolq | 1|acc |61.77|± | 0.85|
|copa | 0|acc |70.00|± | 4.61|
|hellaswag | 0|acc |37.62|± | 0.48|
| | |acc_norm|46.56|± | 0.50|
|mc_taco | 0|em |12.54| | |
| | |f1 |47.46| | |
|openbookqa | 0|acc |21.40|± | 1.84|
| | |acc_norm|30.00|± | 2.05|
|piqa | 0|acc |68.77|± | 1.08|
| | |acc_norm|70.08|± | 1.07|
|prost | 0|acc |23.52|± | 0.31|
| | |acc_norm|26.70|± | 0.32|
|swag | 0|acc |45.32|± | 0.35|
| | |acc_norm|61.15|± | 0.34|
|winogrande | 0|acc |57.14|± | 1.39|
|wsc273 | 0|acc |72.89|± | 2.70|
## bloom-1b7_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.29|± | 0.31|
## bloom-1b7_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.49|± | 0.12|
| | |f1 | 4.31|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.74|± | 0.37|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |24.29|± | 0.79|
| | |acc_norm|24.62|± | 0.79|
## bloom-1b7_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |48.75|± | 1.12|
|pawsx_en| 0|acc |48.90|± | 1.12|
|pawsx_es| 0|acc |51.30|± | 1.12|
|pawsx_fr| 0|acc |46.20|± | 1.12|
|pawsx_ja| 0|acc |44.70|± | 1.11|
|pawsx_ko| 0|acc |45.80|± | 1.11|
|pawsx_zh| 0|acc |45.40|± | 1.11|
## bloom-1b7_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |27.75|± | 0.86|
| | |acc_norm |32.57|± | 0.90|
|headqa_es | 0|acc |25.42|± | 0.83|
| | |acc_norm |29.58|± | 0.87|
|logiqa | 0|acc |21.66|± | 1.62|
| | |acc_norm |28.11|± | 1.76|
|squad2 | 1|exact | 1.80| | |
| | |f1 | 4.38| | |
| | |HasAns_exact| 2.40| | |
| | |HasAns_f1 | 7.56| | |
| | |NoAns_exact | 1.21| | |
| | |NoAns_f1 | 1.21| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 3.14|± | 0.16|
|truthfulqa_mc| 1|mc1 |24.48|± | 1.51|
| | |mc2 |41.32|± | 1.44|
|webqs | 0|acc | 1.28|± | 0.25|
## bloom-1b7_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |53.55|± | 1.89|
| | |em |40.90|± | 2.03|
|drop| 1|em | 0.69|± | 0.08|
| | |f1 | 6.89|± | 0.16|
|race| 1|acc |33.21|± | 1.46|
## bloom-1b7_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 47.4|± | 2.24|
|xcopa_ht| 0|acc | 50.4|± | 2.24|
|xcopa_id| 0|acc | 63.2|± | 2.16|
|xcopa_it| 0|acc | 52.6|± | 2.24|
|xcopa_qu| 0|acc | 50.6|± | 2.24|
|xcopa_sw| 0|acc | 51.8|± | 2.24|
|xcopa_ta| 0|acc | 56.6|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 52.8|± | 2.23|
|xcopa_vi| 0|acc | 65.8|± | 2.12|
|xcopa_zh| 0|acc | 61.4|± | 2.18|
## bloom-1b7_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.57|± | 0.67|
|xnli_bg| 0|acc |35.43|± | 0.68|
|xnli_de| 0|acc |40.58|± | 0.69|
|xnli_el| 0|acc |33.99|± | 0.67|
|xnli_en| 0|acc |50.14|± | 0.71|
|xnli_es| 0|acc |47.82|± | 0.71|
|xnli_fr| 0|acc |48.18|± | 0.71|
|xnli_hi| 0|acc |43.95|± | 0.70|
|xnli_ru| 0|acc |39.32|± | 0.69|
|xnli_sw| 0|acc |34.51|± | 0.67|
|xnli_th| 0|acc |33.37|± | 0.67|
|xnli_tr| 0|acc |34.93|± | 0.67|
|xnli_ur| 0|acc |40.50|± | 0.69|
|xnli_vi| 0|acc |46.23|± | 0.70|
|xnli_zh| 0|acc |36.21|± | 0.68|
## bloom-1b7_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |55.00|± | 1.28|
|xstory_cloze_en| 0|acc |64.66|± | 1.23|
|xstory_cloze_es| 0|acc |60.82|± | 1.26|
|xstory_cloze_eu| 0|acc |54.93|± | 1.28|
|xstory_cloze_hi| 0|acc |56.78|± | 1.27|
|xstory_cloze_id| 0|acc |59.76|± | 1.26|
|xstory_cloze_my| 0|acc |47.25|± | 1.28|
|xstory_cloze_ru| 0|acc |50.36|± | 1.29|
|xstory_cloze_sw| 0|acc |52.28|± | 1.29|
|xstory_cloze_te| 0|acc |56.52|± | 1.28|
|xstory_cloze_zh| 0|acc |58.24|± | 1.27|
## bloom-1b7_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |74.71|± | 0.90|
|xwinograd_fr| 0|acc |68.67|± | 5.12|
|xwinograd_jp| 0|acc |54.12|± | 1.61|
|xwinograd_pt| 0|acc |63.50|± | 2.97|
|xwinograd_ru| 0|acc |52.38|± | 2.82|
|xwinograd_zh| 0|acc |69.64|± | 2.05|
# bloom-3b
## bloom-3b_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |27.99|± | 1.31|
| | |acc_norm|30.55|± | 1.35|
|arc_easy | 0|acc |59.47|± | 1.01|
| | |acc_norm|53.24|± | 1.02|
|boolq | 1|acc |61.62|± | 0.85|
|copa | 0|acc |74.00|± | 4.41|
|hellaswag | 0|acc |41.26|± | 0.49|
| | |acc_norm|52.72|± | 0.50|
|mc_taco | 0|em |11.94| | |
| | |f1 |49.57| | |
|openbookqa | 0|acc |21.60|± | 1.84|
| | |acc_norm|32.20|± | 2.09|
|piqa | 0|acc |70.84|± | 1.06|
| | |acc_norm|70.51|± | 1.06|
|prost | 0|acc |22.69|± | 0.31|
| | |acc_norm|26.36|± | 0.32|
|swag | 0|acc |47.36|± | 0.35|
| | |acc_norm|64.59|± | 0.34|
|winogrande | 0|acc |58.72|± | 1.38|
|wsc273 | 0|acc |76.92|± | 2.55|
## bloom-3b_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.21|± | 0.30|
## bloom-3b_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 2.10|± | 0.15|
| | |f1 | 4.63|± | 0.17|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |25.26|± | 0.80|
| | |acc_norm|25.06|± | 0.79|
## bloom-3b_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc | 54.6|± | 1.11|
|pawsx_en| 0|acc | 56.8|± | 1.11|
|pawsx_es| 0|acc | 56.4|± | 1.11|
|pawsx_fr| 0|acc | 47.6|± | 1.12|
|pawsx_ja| 0|acc | 44.6|± | 1.11|
|pawsx_ko| 0|acc | 46.3|± | 1.12|
|pawsx_zh| 0|acc | 47.1|± | 1.12|
## bloom-3b_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |28.41|± | 0.86|
| | |acc_norm |33.37|± | 0.90|
|headqa_es | 0|acc |26.44|± | 0.84|
| | |acc_norm |31.00|± | 0.88|
|logiqa | 0|acc |20.74|± | 1.59|
| | |acc_norm |29.19|± | 1.78|
|squad2 | 1|exact | 6.91| | |
| | |f1 |11.51| | |
| | |HasAns_exact|11.10| | |
| | |HasAns_f1 |20.31| | |
| | |NoAns_exact | 2.74| | |
| | |NoAns_f1 | 2.74| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.08| | |
|triviaqa | 1|acc | 4.15|± | 0.19|
|truthfulqa_mc| 1|mc1 |23.26|± | 1.48|
| | |mc2 |40.57|± | 1.44|
|webqs | 0|acc | 1.67|± | 0.28|
## bloom-3b_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |61.50|± | 1.77|
| | |em |46.07|± | 2.02|
|drop| 1|em | 1.94|± | 0.14|
| | |f1 | 8.88|± | 0.20|
|race| 1|acc |35.22|± | 1.48|
## bloom-3b_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 49.2|± | 2.24|
|xcopa_ht| 0|acc | 50.2|± | 2.24|
|xcopa_id| 0|acc | 69.2|± | 2.07|
|xcopa_it| 0|acc | 51.6|± | 2.24|
|xcopa_qu| 0|acc | 50.6|± | 2.24|
|xcopa_sw| 0|acc | 51.4|± | 2.24|
|xcopa_ta| 0|acc | 58.0|± | 2.21|
|xcopa_th| 0|acc | 52.6|± | 2.24|
|xcopa_tr| 0|acc | 53.4|± | 2.23|
|xcopa_vi| 0|acc | 68.8|± | 2.07|
|xcopa_zh| 0|acc | 62.0|± | 2.17|
## bloom-3b_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.43|± | 0.67|
|xnli_bg| 0|acc |37.90|± | 0.69|
|xnli_de| 0|acc |40.40|± | 0.69|
|xnli_el| 0|acc |33.21|± | 0.67|
|xnli_en| 0|acc |53.41|± | 0.70|
|xnli_es| 0|acc |49.08|± | 0.71|
|xnli_fr| 0|acc |49.18|± | 0.71|
|xnli_hi| 0|acc |45.55|± | 0.70|
|xnli_ru| 0|acc |41.40|± | 0.70|
|xnli_sw| 0|acc |35.83|± | 0.68|
|xnli_th| 0|acc |33.39|± | 0.67|
|xnli_tr| 0|acc |33.81|± | 0.67|
|xnli_ur| 0|acc |40.00|± | 0.69|
|xnli_vi| 0|acc |46.51|± | 0.70|
|xnli_zh| 0|acc |37.43|± | 0.68|
## bloom-3b_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |56.59|± | 1.28|
|xstory_cloze_en| 0|acc |66.78|± | 1.21|
|xstory_cloze_es| 0|acc |64.13|± | 1.23|
|xstory_cloze_eu| 0|acc |55.66|± | 1.28|
|xstory_cloze_hi| 0|acc |57.58|± | 1.27|
|xstory_cloze_id| 0|acc |60.82|± | 1.26|
|xstory_cloze_my| 0|acc |46.59|± | 1.28|
|xstory_cloze_ru| 0|acc |50.69|± | 1.29|
|xstory_cloze_sw| 0|acc |53.01|± | 1.28|
|xstory_cloze_te| 0|acc |58.17|± | 1.27|
|xstory_cloze_zh| 0|acc |60.89|± | 1.26|
## bloom-3b_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |79.10|± | 0.84|
|xwinograd_fr| 0|acc |71.08|± | 5.01|
|xwinograd_jp| 0|acc |56.62|± | 1.60|
|xwinograd_pt| 0|acc |70.34|± | 2.82|
|xwinograd_ru| 0|acc |53.65|± | 2.81|
|xwinograd_zh| 0|acc |73.61|± | 1.97|
# bloom-560m
## bloom-560m_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |22.44|± | 1.22|
| | |acc_norm|23.98|± | 1.25|
|arc_easy | 0|acc |47.35|± | 1.02|
| | |acc_norm|41.67|± | 1.01|
|boolq | 1|acc |55.14|± | 0.87|
|copa | 0|acc |61.00|± | 4.90|
|hellaswag | 0|acc |31.56|± | 0.46|
| | |acc_norm|36.56|± | 0.48|
|mc_taco | 0|em |17.42| | |
| | |f1 |31.43| | |
|openbookqa | 0|acc |17.20|± | 1.69|
| | |acc_norm|28.20|± | 2.01|
|piqa | 0|acc |64.09|± | 1.12|
| | |acc_norm|65.13|± | 1.11|
|prost | 0|acc |22.08|± | 0.30|
| | |acc_norm|32.08|± | 0.34|
|swag | 0|acc |40.35|± | 0.35|
| | |acc_norm|52.96|± | 0.35|
|winogrande | 0|acc |52.80|± | 1.40|
|wsc273 | 0|acc |66.67|± | 2.86|
## bloom-560m_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.53|± | 0.20|
## bloom-560m_lambada_openai_0-shot.json
| Task |Version|Metric| Value | |Stderr|
|--------------------|------:|------|------:|---|-----:|
|lambada_openai | 0|ppl | 28.68|± | 1.08|
| | |acc | 35.40|± | 0.67|
|lambada_openai_cloze| 0|ppl |6212.81|± |267.17|
| | |acc | 0.45|± | 0.09|
## bloom-560m_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.26|± | 0.11|
| | |f1 | 3.50|± | 0.14|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |22.51|± | 0.76|
| | |acc_norm|22.35|± | 0.76|
## bloom-560m_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.80|± | 1.12|
|pawsx_en| 0|acc |52.00|± | 1.12|
|pawsx_es| 0|acc |53.25|± | 1.12|
|pawsx_fr| 0|acc |47.95|± | 1.12|
|pawsx_ja| 0|acc |44.90|± | 1.11|
|pawsx_ko| 0|acc |51.90|± | 1.12|
|pawsx_zh| 0|acc |45.20|± | 1.11|
## bloom-560m_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |25.67|± | 0.83|
| | |acc_norm |29.58|± | 0.87|
|headqa_es | 0|acc |23.96|± | 0.82|
| | |acc_norm |27.17|± | 0.85|
|logiqa | 0|acc |22.58|± | 1.64|
| | |acc_norm |27.19|± | 1.75|
|squad2 | 1|exact | 0.43| | |
| | |f1 | 1.86| | |
| | |HasAns_exact| 0.76| | |
| | |HasAns_f1 | 3.62| | |
| | |NoAns_exact | 0.10| | |
| | |NoAns_f1 | 0.10| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 1.44|± | 0.11|
|truthfulqa_mc| 1|mc1 |24.48|± | 1.51|
| | |mc2 |42.43|± | 1.51|
|webqs | 0|acc | 0.84|± | 0.20|
## bloom-560m_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |22.71|± | 1.67|
| | |em |17.40|± | 1.62|
|drop| 1|em | 1.50|± | 0.12|
| | |f1 | 6.21|± | 0.17|
|race| 1|acc |30.24|± | 1.42|
## bloom-560m_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 49.0|± | 2.24|
|xcopa_ht| 0|acc | 50.2|± | 2.24|
|xcopa_id| 0|acc | 59.2|± | 2.20|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 55.8|± | 2.22|
|xcopa_th| 0|acc | 54.4|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 61.0|± | 2.18|
|xcopa_zh| 0|acc | 58.6|± | 2.20|
## bloom-560m_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.35|± | 0.67|
|xnli_bg| 0|acc |33.39|± | 0.67|
|xnli_de| 0|acc |34.79|± | 0.67|
|xnli_el| 0|acc |33.33|± | 0.67|
|xnli_en| 0|acc |49.50|± | 0.71|
|xnli_es| 0|acc |45.23|± | 0.70|
|xnli_fr| 0|acc |45.29|± | 0.70|
|xnli_hi| 0|acc |40.84|± | 0.69|
|xnli_ru| 0|acc |34.01|± | 0.67|
|xnli_sw| 0|acc |33.17|± | 0.67|
|xnli_th| 0|acc |33.57|± | 0.67|
|xnli_tr| 0|acc |33.43|± | 0.67|
|xnli_ur| 0|acc |37.13|± | 0.68|
|xnli_vi| 0|acc |40.52|± | 0.69|
|xnli_zh| 0|acc |33.95|± | 0.67|
## bloom-560m_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.08|± | 1.29|
|xstory_cloze_en| 0|acc |61.22|± | 1.25|
|xstory_cloze_es| 0|acc |55.86|± | 1.28|
|xstory_cloze_eu| 0|acc |53.61|± | 1.28|
|xstory_cloze_hi| 0|acc |55.00|± | 1.28|
|xstory_cloze_id| 0|acc |55.53|± | 1.28|
|xstory_cloze_my| 0|acc |47.19|± | 1.28|
|xstory_cloze_ru| 0|acc |49.17|± | 1.29|
|xstory_cloze_sw| 0|acc |49.83|± | 1.29|
|xstory_cloze_te| 0|acc |55.72|± | 1.28|
|xstory_cloze_zh| 0|acc |54.53|± | 1.28|
## bloom-560m_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |65.89|± | 0.98|
|xwinograd_fr| 0|acc |60.24|± | 5.40|
|xwinograd_jp| 0|acc |52.97|± | 1.61|
|xwinograd_pt| 0|acc |60.08|± | 3.03|
|xwinograd_ru| 0|acc |49.21|± | 2.82|
|xwinograd_zh| 0|acc |67.66|± | 2.09|
# bloom-7b1
## bloom-7b1_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|52.11|± | 3.63|
|bigbench_date_understanding | 0|multiple_choice_grade|36.59|± | 2.51|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|26.36|± | 2.75|
|bigbench_dyck_languages | 0|multiple_choice_grade|14.40|± | 1.11|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.06|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|20.06|± | 2.12|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|48.62|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|26.00|± | 1.96|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|19.14|± | 1.49|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|37.00|± | 2.79|
|bigbench_movie_recommendation | 0|multiple_choice_grade|26.40|± | 1.97|
|bigbench_navigate | 0|multiple_choice_grade|49.90|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|24.85|± | 0.97|
|bigbench_ruin_names | 0|multiple_choice_grade|34.38|± | 2.25|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.14|± | 1.25|
|bigbench_snarks | 0|multiple_choice_grade|49.72|± | 3.73|
|bigbench_sports_understanding | 0|multiple_choice_grade|50.30|± | 1.59|
|bigbench_temporal_sequences | 0|multiple_choice_grade|24.80|± | 1.37|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|18.40|± | 1.10|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.00|± | 0.83|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|37.00|± | 2.79|
## bloom-7b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |30.38|± | 1.34|
| | |acc_norm|33.53|± | 1.38|
|arc_easy | 0|acc |64.94|± | 0.98|
| | |acc_norm|57.32|± | 1.01|
|boolq | 1|acc |62.87|± | 0.85|
|copa | 0|acc |72.00|± | 4.51|
|hellaswag | 0|acc |46.24|± | 0.50|
| | |acc_norm|59.68|± | 0.49|
|mc_taco | 0|em |13.59| | |
| | |f1 |50.53| | |
|openbookqa | 0|acc |25.20|± | 1.94|
| | |acc_norm|35.80|± | 2.15|
|piqa | 0|acc |72.74|± | 1.04|
| | |acc_norm|73.67|± | 1.03|
|prost | 0|acc |26.18|± | 0.32|
| | |acc_norm|30.57|± | 0.34|
|swag | 0|acc |50.25|± | 0.35|
| | |acc_norm|68.26|± | 0.33|
|winogrande | 0|acc |64.33|± | 1.35|
|wsc273 | 0|acc |81.32|± | 2.36|
## bloom-7b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.90|± | 0.38|
## bloom-7b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 2.51|± | 0.16|
| | |f1 | 5.09|± | 0.18|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |26.57|± | 0.81|
| | |acc_norm|26.53|± | 0.81|
## bloom-7b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.85|± | 1.12|
|pawsx_en| 0|acc |61.30|± | 1.09|
|pawsx_es| 0|acc |59.35|± | 1.10|
|pawsx_fr| 0|acc |50.90|± | 1.12|
|pawsx_ja| 0|acc |45.45|± | 1.11|
|pawsx_ko| 0|acc |45.10|± | 1.11|
|pawsx_zh| 0|acc |47.35|± | 1.12|
## bloom-7b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |31.18|± | 0.88|
| | |acc_norm |35.56|± | 0.91|
|headqa_es | 0|acc |29.54|± | 0.87|
| | |acc_norm |34.32|± | 0.91|
|logiqa | 0|acc |20.28|± | 1.58|
| | |acc_norm |28.11|± | 1.76|
|squad2 | 1|exact | 7.82| | |
| | |f1 |12.64| | |
| | |HasAns_exact|14.84| | |
| | |HasAns_f1 |24.51| | |
| | |NoAns_exact | 0.81| | |
| | |NoAns_f1 | 0.81| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 5.52|± | 0.21|
|truthfulqa_mc| 1|mc1 |22.40|± | 1.46|
| | |mc2 |38.90|± | 1.40|
|webqs | 0|acc | 2.26|± | 0.33|
## bloom-7b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |68.83|± | 1.63|
| | |em |53.87|± | 2.00|
|drop| 1|em | 2.57|± | 0.16|
| | |f1 | 9.85|± | 0.21|
|race| 1|acc |36.56|± | 1.49|
## bloom-7b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.2|± | 2.24|
|xcopa_ht| 0|acc | 50.8|± | 2.24|
|xcopa_id| 0|acc | 69.8|± | 2.06|
|xcopa_it| 0|acc | 52.8|± | 2.23|
|xcopa_qu| 0|acc | 50.8|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 59.2|± | 2.20|
|xcopa_th| 0|acc | 55.4|± | 2.23|
|xcopa_tr| 0|acc | 51.2|± | 2.24|
|xcopa_vi| 0|acc | 70.8|± | 2.04|
|xcopa_zh| 0|acc | 65.2|± | 2.13|
## bloom-7b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.83|± | 0.67|
|xnli_bg| 0|acc |39.70|± | 0.69|
|xnli_de| 0|acc |39.86|± | 0.69|
|xnli_el| 0|acc |35.75|± | 0.68|
|xnli_en| 0|acc |53.91|± | 0.70|
|xnli_es| 0|acc |48.70|± | 0.71|
|xnli_fr| 0|acc |49.68|± | 0.71|
|xnli_hi| 0|acc |46.51|± | 0.70|
|xnli_ru| 0|acc |43.05|± | 0.70|
|xnli_sw| 0|acc |37.92|± | 0.69|
|xnli_th| 0|acc |34.99|± | 0.67|
|xnli_tr| 0|acc |35.09|± | 0.67|
|xnli_ur| 0|acc |42.10|± | 0.70|
|xnli_vi| 0|acc |47.05|± | 0.71|
|xnli_zh| 0|acc |35.43|± | 0.68|
## bloom-7b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |58.57|± | 1.27|
|xstory_cloze_en| 0|acc |70.75|± | 1.17|
|xstory_cloze_es| 0|acc |66.12|± | 1.22|
|xstory_cloze_eu| 0|acc |57.18|± | 1.27|
|xstory_cloze_hi| 0|acc |60.56|± | 1.26|
|xstory_cloze_id| 0|acc |64.46|± | 1.23|
|xstory_cloze_my| 0|acc |48.97|± | 1.29|
|xstory_cloze_ru| 0|acc |52.75|± | 1.28|
|xstory_cloze_sw| 0|acc |53.94|± | 1.28|
|xstory_cloze_te| 0|acc |57.45|± | 1.27|
|xstory_cloze_zh| 0|acc |61.88|± | 1.25|
## bloom-7b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |82.15|± | 0.79|
|xwinograd_fr| 0|acc |71.08|± | 5.01|
|xwinograd_jp| 0|acc |58.50|± | 1.59|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |56.83|± | 2.80|
|xwinograd_zh| 0|acc |74.40|± | 1.95|
# llama-13B
## llama-13B_arithmetic_5-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------------|------:|------|----:|---|-----:|
|arithmetic_1dc| 0|acc | 0|± | 0|
|arithmetic_2da| 0|acc | 0|± | 0|
|arithmetic_2dm| 0|acc | 0|± | 0|
|arithmetic_2ds| 0|acc | 0|± | 0|
|arithmetic_3da| 0|acc | 0|± | 0|
|arithmetic_3ds| 0|acc | 0|± | 0|
|arithmetic_4da| 0|acc | 0|± | 0|
|arithmetic_4ds| 0|acc | 0|± | 0|
|arithmetic_5da| 0|acc | 0|± | 0|
|arithmetic_5ds| 0|acc | 0|± | 0|
## llama-13B_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|49.47|± | 3.64|
|bigbench_date_understanding | 0|multiple_choice_grade|63.96|± | 2.50|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|45.74|± | 3.11|
|bigbench_dyck_languages | 0|multiple_choice_grade|20.10|± | 1.27|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|51.13|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|23.12|± | 2.23|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|50.38|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|30.00|± | 2.05|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|22.29|± | 1.57|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|41.67|± | 2.85|
|bigbench_movie_recommendation | 0|multiple_choice_grade|43.60|± | 2.22|
|bigbench_navigate | 0|multiple_choice_grade|51.70|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|37.05|± | 1.08|
|bigbench_ruin_names | 0|multiple_choice_grade|34.60|± | 2.25|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.34|± | 1.25|
|bigbench_snarks | 0|multiple_choice_grade|46.96|± | 3.72|
|bigbench_sports_understanding | 0|multiple_choice_grade|58.11|± | 1.57|
|bigbench_temporal_sequences | 0|multiple_choice_grade|28.00|± | 1.42|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|21.44|± | 1.16|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.46|± | 0.84|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|41.67|± | 2.85|
## llama-13B_blimp_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------------------------------------------------|------:|------|----:|---|-----:|
|blimp_adjunct_island | 0|acc | 33.8|± | 1.50|
|blimp_anaphor_gender_agreement | 0|acc | 57.6|± | 1.56|
|blimp_anaphor_number_agreement | 0|acc | 56.5|± | 1.57|
|blimp_animate_subject_passive | 0|acc | 65.1|± | 1.51|
|blimp_animate_subject_trans | 0|acc | 61.6|± | 1.54|
|blimp_causative | 0|acc | 35.9|± | 1.52|
|blimp_complex_NP_island | 0|acc | 30.3|± | 1.45|
|blimp_coordinate_structure_constraint_complex_left_branch| 0|acc | 34.5|± | 1.50|
|blimp_coordinate_structure_constraint_object_extraction | 0|acc | 27.9|± | 1.42|
|blimp_determiner_noun_agreement_1 | 0|acc | 34.1|± | 1.50|
|blimp_determiner_noun_agreement_2 | 0|acc | 36.1|± | 1.52|
|blimp_determiner_noun_agreement_irregular_1 | 0|acc | 35.6|± | 1.51|
|blimp_determiner_noun_agreement_irregular_2 | 0|acc | 36.9|± | 1.53|
|blimp_determiner_noun_agreement_with_adj_2 | 0|acc | 39.2|± | 1.54|
|blimp_determiner_noun_agreement_with_adj_irregular_1 | 0|acc | 34.2|± | 1.50|
|blimp_determiner_noun_agreement_with_adj_irregular_2 | 0|acc | 39.3|± | 1.55|
|blimp_determiner_noun_agreement_with_adjective_1 | 0|acc | 39.1|± | 1.54|
|blimp_distractor_agreement_relational_noun | 0|acc | 51.4|± | 1.58|
|blimp_distractor_agreement_relative_clause | 0|acc | 42.3|± | 1.56|
|blimp_drop_argument | 0|acc | 70.5|± | 1.44|
|blimp_ellipsis_n_bar_1 | 0|acc | 62.4|± | 1.53|
|blimp_ellipsis_n_bar_2 | 0|acc | 26.4|± | 1.39|
|blimp_existential_there_object_raising | 0|acc | 69.0|± | 1.46|
|blimp_existential_there_quantifiers_1 | 0|acc | 30.8|± | 1.46|
|blimp_existential_there_quantifiers_2 | 0|acc | 78.8|± | 1.29|
|blimp_existential_there_subject_raising | 0|acc | 70.1|± | 1.45|
|blimp_expletive_it_object_raising | 0|acc | 61.9|± | 1.54|
|blimp_inchoative | 0|acc | 47.4|± | 1.58|
|blimp_intransitive | 0|acc | 64.3|± | 1.52|
|blimp_irregular_past_participle_adjectives | 0|acc | 63.6|± | 1.52|
|blimp_irregular_past_participle_verbs | 0|acc | 31.4|± | 1.47|
|blimp_irregular_plural_subject_verb_agreement_1 | 0|acc | 51.8|± | 1.58|
|blimp_irregular_plural_subject_verb_agreement_2 | 0|acc | 50.4|± | 1.58|
|blimp_left_branch_island_echo_question | 0|acc | 49.0|± | 1.58|
|blimp_left_branch_island_simple_question | 0|acc | 41.1|± | 1.56|
|blimp_matrix_question_npi_licensor_present | 0|acc | 54.8|± | 1.57|
|blimp_npi_present_1 | 0|acc | 30.4|± | 1.46|
|blimp_npi_present_2 | 0|acc | 39.0|± | 1.54|
|blimp_only_npi_licensor_present | 0|acc | 73.1|± | 1.40|
|blimp_only_npi_scope | 0|acc | 27.8|± | 1.42|
|blimp_passive_1 | 0|acc | 52.9|± | 1.58|
|blimp_passive_2 | 0|acc | 52.6|± | 1.58|
|blimp_principle_A_c_command | 0|acc | 32.6|± | 1.48|
|blimp_principle_A_case_1 | 0|acc | 2.8|± | 0.52|
|blimp_principle_A_case_2 | 0|acc | 44.3|± | 1.57|
|blimp_principle_A_domain_1 | 0|acc | 32.4|± | 1.48|
|blimp_principle_A_domain_2 | 0|acc | 74.0|± | 1.39|
|blimp_principle_A_domain_3 | 0|acc | 56.3|± | 1.57|
|blimp_principle_A_reconstruction | 0|acc | 79.2|± | 1.28|
|blimp_regular_plural_subject_verb_agreement_1 | 0|acc | 56.0|± | 1.57|
|blimp_regular_plural_subject_verb_agreement_2 | 0|acc | 45.6|± | 1.58|
|blimp_sentential_negation_npi_licensor_present | 0|acc | 39.2|± | 1.54|
|blimp_sentential_negation_npi_scope | 0|acc | 63.8|± | 1.52|
|blimp_sentential_subject_island | 0|acc | 62.1|± | 1.53|
|blimp_superlative_quantifiers_1 | 0|acc | 52.2|± | 1.58|
|blimp_superlative_quantifiers_2 | 0|acc | 71.4|± | 1.43|
|blimp_tough_vs_raising_1 | 0|acc | 36.1|± | 1.52|
|blimp_tough_vs_raising_2 | 0|acc | 64.2|± | 1.52|
|blimp_transitive | 0|acc | 47.3|± | 1.58|
|blimp_wh_island | 0|acc | 50.6|± | 1.58|
|blimp_wh_questions_object_gap | 0|acc | 45.5|± | 1.58|
|blimp_wh_questions_subject_gap | 0|acc | 36.9|± | 1.53|
|blimp_wh_questions_subject_gap_long_distance | 0|acc | 40.8|± | 1.55|
|blimp_wh_vs_that_no_gap | 0|acc | 19.6|± | 1.26|
|blimp_wh_vs_that_no_gap_long_distance | 0|acc | 30.1|± | 1.45|
|blimp_wh_vs_that_with_gap | 0|acc | 84.7|± | 1.14|
|blimp_wh_vs_that_with_gap_long_distance | 0|acc | 69.2|± | 1.46|
## llama-13B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |43.94|± | 1.45|
| | |acc_norm|44.62|± | 1.45|
|arc_easy | 0|acc |74.58|± | 0.89|
| | |acc_norm|59.89|± | 1.01|
|boolq | 1|acc |68.50|± | 0.81|
|copa | 0|acc |90.00|± | 3.02|
|hellaswag | 0|acc |59.10|± | 0.49|
| | |acc_norm|76.24|± | 0.42|
|mc_taco | 0|em |10.96| | |
| | |f1 |47.53| | |
|openbookqa | 0|acc |30.60|± | 2.06|
| | |acc_norm|42.20|± | 2.21|
|piqa | 0|acc |78.84|± | 0.95|
| | |acc_norm|79.11|± | 0.95|
|prost | 0|acc |26.89|± | 0.32|
| | |acc_norm|30.52|± | 0.34|
|swag | 0|acc |56.73|± | 0.35|
| | |acc_norm|69.35|± | 0.33|
|winogrande | 0|acc |70.17|± | 1.29|
|wsc273 | 0|acc |86.08|± | 2.10|
## llama-13B_glue_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|cola | 0|mcc | 0.00|± | 0.00|
|mnli | 0|acc |43.56|± | 0.50|
|mnli_mismatched| 0|acc |45.35|± | 0.50|
|mrpc | 0|acc |68.63|± | 2.30|
| | |f1 |81.34|± | 1.62|
|qnli | 0|acc |49.95|± | 0.68|
|qqp | 0|acc |36.79|± | 0.24|
| | |f1 |53.66|± | 0.26|
|rte | 0|acc |65.34|± | 2.86|
|sst | 0|acc |65.37|± | 1.61|
|wnli | 1|acc |46.48|± | 5.96|
## llama-13B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc |13.57|± | 0.94|
## llama-13B_human_alignment_0-shot.json
| Task |Version| Metric | Value | |Stderr|
|---------------------------------------|------:|---------------------|------:|---|-----:|
|crows_pairs_english_age | 0|likelihood_difference| 771.02|± | 93.66|
| | |pct_stereotype | 56.04|± | 5.23|
|crows_pairs_english_autre | 0|likelihood_difference|1142.61|± |435.33|
| | |pct_stereotype | 36.36|± | 15.21|
|crows_pairs_english_disability | 0|likelihood_difference|1297.88|± |182.88|
| | |pct_stereotype | 35.38|± | 5.98|
|crows_pairs_english_gender | 0|likelihood_difference| 867.58|± | 65.49|
| | |pct_stereotype | 58.44|± | 2.76|
|crows_pairs_english_nationality | 0|likelihood_difference|1184.87|± | 83.43|
| | |pct_stereotype | 38.43|± | 3.32|
|crows_pairs_english_physical_appearance| 0|likelihood_difference| 752.95|± | 87.93|
| | |pct_stereotype | 47.22|± | 5.92|
|crows_pairs_english_race_color | 0|likelihood_difference| 985.84|± | 50.57|
| | |pct_stereotype | 50.20|± | 2.22|
|crows_pairs_english_religion | 0|likelihood_difference|1181.25|± |117.52|
| | |pct_stereotype | 49.55|± | 4.77|
|crows_pairs_english_sexual_orientation | 0|likelihood_difference|1072.24|± |115.61|
| | |pct_stereotype | 54.84|± | 5.19|
|crows_pairs_english_socioeconomic | 0|likelihood_difference|1122.24|± | 78.07|
| | |pct_stereotype | 50.53|± | 3.64|
|crows_pairs_french_age | 0|likelihood_difference|1310.14|± |112.01|
| | |pct_stereotype | 38.89|± | 5.17|
|crows_pairs_french_autre | 0|likelihood_difference| 994.23|± |314.84|
| | |pct_stereotype | 53.85|± | 14.39|
|crows_pairs_french_disability | 0|likelihood_difference|1732.39|± |182.40|
| | |pct_stereotype | 40.91|± | 6.10|
|crows_pairs_french_gender | 0|likelihood_difference|1079.15|± | 67.67|
| | |pct_stereotype | 51.40|± | 2.79|
|crows_pairs_french_nationality | 0|likelihood_difference|1633.10|± | 92.24|
| | |pct_stereotype | 31.62|± | 2.93|
|crows_pairs_french_physical_appearance | 0|likelihood_difference|1257.99|± |157.39|
| | |pct_stereotype | 52.78|± | 5.92|
|crows_pairs_french_race_color | 0|likelihood_difference|1192.74|± | 50.28|
| | |pct_stereotype | 35.00|± | 2.23|
|crows_pairs_french_religion | 0|likelihood_difference|1119.24|± |108.66|
| | |pct_stereotype | 59.13|± | 4.60|
|crows_pairs_french_sexual_orientation | 0|likelihood_difference|1755.49|± |118.03|
| | |pct_stereotype | 78.02|± | 4.36|
|crows_pairs_french_socioeconomic | 0|likelihood_difference|1279.15|± | 93.70|
| | |pct_stereotype | 35.71|± | 3.43|
|ethics_cm | 0|acc | 51.74|± | 0.80|
|ethics_deontology | 0|acc | 50.33|± | 0.83|
| | |em | 0.11| | |
|ethics_justice | 0|acc | 49.93|± | 0.96|
| | |em | 0.15| | |
|ethics_utilitarianism | 0|acc | 52.45|± | 0.72|
|ethics_utilitarianism_original | 0|acc | 98.07|± | 0.20|
|ethics_virtue | 0|acc | 20.32|± | 0.57|
| | |em | 0.00| | |
|toxigen | 0|acc | 42.66|± | 1.61|
| | |acc_norm | 43.19|± | 1.62|
## llama-13B_lambada_0-shot.json
| Task |Version|Metric| Value | | Stderr |
|----------------------|------:|------|---------:|---|--------:|
|lambada_openai | 0|ppl |1279051.05|± | 60995.63|
| | |acc | 0.00|± | 0.00|
|lambada_openai_cloze | 0|ppl | 204515.39|± | 9705.34|
| | |acc | 0.02|± | 0.02|
|lambada_openai_mt_de | 0|ppl |1310285.44|± | 71395.91|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_en | 0|ppl |1279051.05|± | 60995.63|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_es | 0|ppl |1980241.77|± |101614.20|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_fr | 0|ppl |2461448.49|± |128013.99|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_it | 0|ppl |4091504.35|± |218020.97|
| | |acc | 0.00|± | 0.00|
|lambada_standard | 0|ppl |1409048.00|± | 47832.88|
| | |acc | 0.00|± | 0.00|
|lambada_standard_cloze| 0|ppl |4235345.03|± |132892.57|
| | |acc | 0.00|± | 0.00|
## llama-13B_mathematical_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 3.88|± | 0.20|
| | |f1 |13.99|± | 0.25|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 1.85|± | 0.39|
|math_asdiv | 0|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 1.48|± | 0.55|
|math_geometry | 1|acc | 1.25|± | 0.51|
|math_intermediate_algebra| 1|acc | 1.22|± | 0.37|
|math_num_theory | 1|acc | 1.48|± | 0.52|
|math_prealgebra | 1|acc | 2.87|± | 0.57|
|math_precalc | 1|acc | 1.10|± | 0.45|
|mathqa | 0|acc |28.44|± | 0.83|
| | |acc_norm|28.68|± | 0.83|
## llama-13B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.71|± | 0.13|
| | |f1 | 2.45|± | 0.14|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |29.98|± | 0.84|
| | |acc_norm|30.35|± | 0.84|
## llama-13B_mmlu_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|hendrycksTest-abstract_algebra | 0|acc |32.00|± | 4.69|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-anatomy | 0|acc |42.96|± | 4.28|
| | |acc_norm|29.63|± | 3.94|
|hendrycksTest-astronomy | 0|acc |48.03|± | 4.07|
| | |acc_norm|48.03|± | 4.07|
|hendrycksTest-business_ethics | 0|acc |53.00|± | 5.02|
| | |acc_norm|44.00|± | 4.99|
|hendrycksTest-clinical_knowledge | 0|acc |46.04|± | 3.07|
| | |acc_norm|38.49|± | 2.99|
|hendrycksTest-college_biology | 0|acc |45.83|± | 4.17|
| | |acc_norm|32.64|± | 3.92|
|hendrycksTest-college_chemistry | 0|acc |31.00|± | 4.65|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-college_computer_science | 0|acc |33.00|± | 4.73|
| | |acc_norm|28.00|± | 4.51|
|hendrycksTest-college_mathematics | 0|acc |29.00|± | 4.56|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-college_medicine | 0|acc |42.77|± | 3.77|
| | |acc_norm|30.06|± | 3.50|
|hendrycksTest-college_physics | 0|acc |28.43|± | 4.49|
| | |acc_norm|35.29|± | 4.76|
|hendrycksTest-computer_security | 0|acc |57.00|± | 4.98|
| | |acc_norm|44.00|± | 4.99|
|hendrycksTest-conceptual_physics | 0|acc |42.13|± | 3.23|
| | |acc_norm|24.26|± | 2.80|
|hendrycksTest-econometrics | 0|acc |27.19|± | 4.19|
| | |acc_norm|26.32|± | 4.14|
|hendrycksTest-electrical_engineering | 0|acc |41.38|± | 4.10|
| | |acc_norm|34.48|± | 3.96|
|hendrycksTest-elementary_mathematics | 0|acc |36.77|± | 2.48|
| | |acc_norm|32.80|± | 2.42|
|hendrycksTest-formal_logic | 0|acc |32.54|± | 4.19|
| | |acc_norm|34.13|± | 4.24|
|hendrycksTest-global_facts | 0|acc |34.00|± | 4.76|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-high_school_biology | 0|acc |49.68|± | 2.84|
| | |acc_norm|36.13|± | 2.73|
|hendrycksTest-high_school_chemistry | 0|acc |31.03|± | 3.26|
| | |acc_norm|32.02|± | 3.28|
|hendrycksTest-high_school_computer_science | 0|acc |49.00|± | 5.02|
| | |acc_norm|41.00|± | 4.94|
|hendrycksTest-high_school_european_history | 0|acc |52.73|± | 3.90|
| | |acc_norm|49.70|± | 3.90|
|hendrycksTest-high_school_geography | 0|acc |57.58|± | 3.52|
| | |acc_norm|42.42|± | 3.52|
|hendrycksTest-high_school_government_and_politics| 0|acc |58.55|± | 3.56|
| | |acc_norm|38.86|± | 3.52|
|hendrycksTest-high_school_macroeconomics | 0|acc |37.69|± | 2.46|
| | |acc_norm|31.79|± | 2.36|
|hendrycksTest-high_school_mathematics | 0|acc |26.67|± | 2.70|
| | |acc_norm|31.85|± | 2.84|
|hendrycksTest-high_school_microeconomics | 0|acc |42.02|± | 3.21|
| | |acc_norm|40.76|± | 3.19|
|hendrycksTest-high_school_physics | 0|acc |27.15|± | 3.63|
| | |acc_norm|25.17|± | 3.54|
|hendrycksTest-high_school_psychology | 0|acc |60.73|± | 2.09|
| | |acc_norm|36.88|± | 2.07|
|hendrycksTest-high_school_statistics | 0|acc |38.43|± | 3.32|
| | |acc_norm|37.50|± | 3.30|
|hendrycksTest-high_school_us_history | 0|acc |52.45|± | 3.51|
| | |acc_norm|37.25|± | 3.39|
|hendrycksTest-high_school_world_history | 0|acc |49.79|± | 3.25|
| | |acc_norm|42.62|± | 3.22|
|hendrycksTest-human_aging | 0|acc |57.40|± | 3.32|
| | |acc_norm|33.63|± | 3.17|
|hendrycksTest-human_sexuality | 0|acc |54.96|± | 4.36|
| | |acc_norm|39.69|± | 4.29|
|hendrycksTest-international_law | 0|acc |56.20|± | 4.53|
| | |acc_norm|60.33|± | 4.47|
|hendrycksTest-jurisprudence | 0|acc |48.15|± | 4.83|
| | |acc_norm|50.00|± | 4.83|
|hendrycksTest-logical_fallacies | 0|acc |45.40|± | 3.91|
| | |acc_norm|36.81|± | 3.79|
|hendrycksTest-machine_learning | 0|acc |28.57|± | 4.29|
| | |acc_norm|29.46|± | 4.33|
|hendrycksTest-management | 0|acc |64.08|± | 4.75|
| | |acc_norm|41.75|± | 4.88|
|hendrycksTest-marketing | 0|acc |72.65|± | 2.92|
| | |acc_norm|61.54|± | 3.19|
|hendrycksTest-medical_genetics | 0|acc |49.00|± | 5.02|
| | |acc_norm|48.00|± | 5.02|
|hendrycksTest-miscellaneous | 0|acc |69.60|± | 1.64|
| | |acc_norm|48.53|± | 1.79|
|hendrycksTest-moral_disputes | 0|acc |44.80|± | 2.68|
| | |acc_norm|38.15|± | 2.62|
|hendrycksTest-moral_scenarios | 0|acc |28.27|± | 1.51|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |45.10|± | 2.85|
| | |acc_norm|46.73|± | 2.86|
|hendrycksTest-philosophy | 0|acc |45.98|± | 2.83|
| | |acc_norm|38.59|± | 2.76|
|hendrycksTest-prehistory | 0|acc |49.69|± | 2.78|
| | |acc_norm|34.57|± | 2.65|
|hendrycksTest-professional_accounting | 0|acc |29.79|± | 2.73|
| | |acc_norm|28.01|± | 2.68|
|hendrycksTest-professional_law | 0|acc |30.38|± | 1.17|
| | |acc_norm|30.90|± | 1.18|
|hendrycksTest-professional_medicine | 0|acc |39.34|± | 2.97|
| | |acc_norm|33.09|± | 2.86|
|hendrycksTest-professional_psychology | 0|acc |42.32|± | 2.00|
| | |acc_norm|33.01|± | 1.90|
|hendrycksTest-public_relations | 0|acc |54.55|± | 4.77|
| | |acc_norm|29.09|± | 4.35|
|hendrycksTest-security_studies | 0|acc |45.71|± | 3.19|
| | |acc_norm|37.55|± | 3.10|
|hendrycksTest-sociology | 0|acc |58.21|± | 3.49|
| | |acc_norm|45.77|± | 3.52|
|hendrycksTest-us_foreign_policy | 0|acc |68.00|± | 4.69|
| | |acc_norm|52.00|± | 5.02|
|hendrycksTest-virology | 0|acc |40.96|± | 3.83|
| | |acc_norm|30.12|± | 3.57|
|hendrycksTest-world_religions | 0|acc |74.27|± | 3.35|
| | |acc_norm|64.91|± | 3.66|
## llama-13B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.95|± | 1.12|
|pawsx_en| 0|acc |53.70|± | 1.12|
|pawsx_es| 0|acc |52.10|± | 1.12|
|pawsx_fr| 0|acc |54.50|± | 1.11|
|pawsx_ja| 0|acc |45.00|± | 1.11|
|pawsx_ko| 0|acc |47.05|± | 1.12|
|pawsx_zh| 0|acc |45.20|± | 1.11|
## llama-13B_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |34.43|± | 0.91|
| | |acc_norm |38.58|± | 0.93|
|headqa_es | 0|acc |30.56|± | 0.88|
| | |acc_norm |35.16|± | 0.91|
|logiqa | 0|acc |26.42|± | 1.73|
| | |acc_norm |32.10|± | 1.83|
|squad2 | 1|exact |16.44| | |
| | |f1 |24.06| | |
| | |HasAns_exact|21.09| | |
| | |HasAns_f1 |36.35| | |
| | |NoAns_exact |11.81| | |
| | |NoAns_f1 |11.81| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 0.00|± | 0.00|
|truthfulqa_mc| 1|mc1 |25.83|± | 1.53|
| | |mc2 |39.88|± | 1.37|
|webqs | 0|acc | 0.00|± | 0.00|
## llama-13B_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |77.04|± | 1.42|
| | |em |63.70|± | 1.85|
|drop| 1|em | 3.59|± | 0.19|
| | |f1 |13.38|± | 0.24|
|race| 1|acc |39.33|± | 1.51|
## llama-13B_superglue_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|boolq | 1|acc |68.44|± | 0.81|
|cb | 1|acc |48.21|± | 6.74|
| | |f1 |38.82| | |
|copa | 0|acc |90.00|± | 3.02|
|multirc| 1|acc | 1.57|± | 0.40|
|record | 0|f1 |92.32|± | 0.26|
| | |em |91.54|± | 0.28|
|wic | 0|acc |49.84|± | 1.98|
|wsc | 0|acc |35.58|± | 4.72|
## llama-13B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.2|± | 2.24|
|xcopa_ht| 0|acc | 52.8|± | 2.23|
|xcopa_id| 0|acc | 57.8|± | 2.21|
|xcopa_it| 0|acc | 67.2|± | 2.10|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 51.2|± | 2.24|
|xcopa_ta| 0|acc | 54.4|± | 2.23|
|xcopa_th| 0|acc | 54.6|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 53.8|± | 2.23|
|xcopa_zh| 0|acc | 58.4|± | 2.21|
## llama-13B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |34.07|± | 0.67|
|xnli_bg| 0|acc |34.21|± | 0.67|
|xnli_de| 0|acc |35.25|± | 0.68|
|xnli_el| 0|acc |34.69|± | 0.67|
|xnli_en| 0|acc |35.63|± | 0.68|
|xnli_es| 0|acc |33.49|± | 0.67|
|xnli_fr| 0|acc |33.49|± | 0.67|
|xnli_hi| 0|acc |35.59|± | 0.68|
|xnli_ru| 0|acc |33.79|± | 0.67|
|xnli_sw| 0|acc |33.15|± | 0.67|
|xnli_th| 0|acc |34.83|± | 0.67|
|xnli_tr| 0|acc |33.99|± | 0.67|
|xnli_ur| 0|acc |34.21|± | 0.67|
|xnli_vi| 0|acc |34.21|± | 0.67|
|xnli_zh| 0|acc |34.47|± | 0.67|
## llama-13B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |49.70|± | 1.29|
|xstory_cloze_en| 0|acc |77.30|± | 1.08|
|xstory_cloze_es| 0|acc |69.42|± | 1.19|
|xstory_cloze_eu| 0|acc |50.69|± | 1.29|
|xstory_cloze_hi| 0|acc |52.35|± | 1.29|
|xstory_cloze_id| 0|acc |55.26|± | 1.28|
|xstory_cloze_my| 0|acc |47.78|± | 1.29|
|xstory_cloze_ru| 0|acc |63.40|± | 1.24|
|xstory_cloze_sw| 0|acc |49.90|± | 1.29|
|xstory_cloze_te| 0|acc |53.34|± | 1.28|
|xstory_cloze_zh| 0|acc |56.45|± | 1.28|
## llama-13B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |86.75|± | 0.70|
|xwinograd_fr| 0|acc |68.67|± | 5.12|
|xwinograd_jp| 0|acc |59.85|± | 1.58|
|xwinograd_pt| 0|acc |71.48|± | 2.79|
|xwinograd_ru| 0|acc |70.79|± | 2.57|
|xwinograd_zh| 0|acc |70.04|± | 2.04|
# llama-30B
## llama-30B_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|57.37|± | 3.60|
|bigbench_date_understanding | 0|multiple_choice_grade|69.92|± | 2.39|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|54.26|± | 3.11|
|bigbench_dyck_languages | 0|multiple_choice_grade|21.20|± | 1.29|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.58|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|27.86|± | 2.37|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|51.52|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|36.80|± | 2.16|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|25.29|± | 1.64|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|53.00|± | 2.89|
|bigbench_movie_recommendation | 0|multiple_choice_grade|63.20|± | 2.16|
|bigbench_navigate | 0|multiple_choice_grade|49.00|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|55.65|± | 1.11|
|bigbench_ruin_names | 0|multiple_choice_grade|39.73|± | 2.31|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.84|± | 1.26|
|bigbench_snarks | 0|multiple_choice_grade|46.96|± | 3.72|
|bigbench_sports_understanding | 0|multiple_choice_grade|62.37|± | 1.54|
|bigbench_temporal_sequences | 0|multiple_choice_grade|14.60|± | 1.12|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|21.28|± | 1.16|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|15.49|± | 0.87|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|53.00|± | 2.89|
## llama-30B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |46.76|± | 1.46|
| | |acc_norm|45.48|± | 1.46|
|arc_easy | 0|acc |75.34|± | 0.88|
| | |acc_norm|58.96|± | 1.01|
|boolq | 1|acc |68.41|± | 0.81|
|copa | 0|acc |90.00|± | 3.02|
|hellaswag | 0|acc |62.65|± | 0.48|
| | |acc_norm|79.24|± | 0.40|
|mc_taco | 0|em |11.41| | |
| | |f1 |48.36| | |
|openbookqa | 0|acc |29.40|± | 2.04|
| | |acc_norm|42.00|± | 2.21|
|piqa | 0|acc |80.96|± | 0.92|
| | |acc_norm|80.09|± | 0.93|
|prost | 0|acc |25.99|± | 0.32|
| | |acc_norm|29.11|± | 0.33|
|swag | 0|acc |58.61|± | 0.35|
| | |acc_norm|70.36|± | 0.32|
|winogrande | 0|acc |72.77|± | 1.25|
|wsc273 | 0|acc |86.81|± | 2.05|
## llama-30B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc |30.48|± | 1.27|
## llama-30B_human_alignment_0-shot.json
| Task |Version| Metric | Value | |Stderr|
|---------------------------------------|------:|---------------------|------:|---|-----:|
|crows_pairs_english_age | 0|likelihood_difference| 512.91|± | 58.13|
| | |pct_stereotype | 58.24|± | 5.20|
|crows_pairs_english_autre | 0|likelihood_difference|1138.07|± |348.77|
| | |pct_stereotype | 63.64|± | 15.21|
|crows_pairs_english_disability | 0|likelihood_difference| 888.65|± |103.42|
| | |pct_stereotype | 53.85|± | 6.23|
|crows_pairs_english_gender | 0|likelihood_difference| 666.15|± | 42.85|
| | |pct_stereotype | 54.06|± | 2.79|
|crows_pairs_english_nationality | 0|likelihood_difference| 587.28|± | 39.94|
| | |pct_stereotype | 53.24|± | 3.40|
|crows_pairs_english_physical_appearance| 0|likelihood_difference| 540.10|± | 59.14|
| | |pct_stereotype | 52.78|± | 5.92|
|crows_pairs_english_race_color | 0|likelihood_difference| 768.21|± | 39.14|
| | |pct_stereotype | 56.10|± | 2.20|
|crows_pairs_english_religion | 0|likelihood_difference| 807.57|± | 94.38|
| | |pct_stereotype | 62.16|± | 4.62|
|crows_pairs_english_sexual_orientation | 0|likelihood_difference| 754.77|± | 76.83|
| | |pct_stereotype | 63.44|± | 5.02|
|crows_pairs_english_socioeconomic | 0|likelihood_difference| 730.39|± | 54.63|
| | |pct_stereotype | 53.68|± | 3.63|
|crows_pairs_french_age | 0|likelihood_difference| 892.50|± |101.09|
| | |pct_stereotype | 40.00|± | 5.19|
|crows_pairs_french_autre | 0|likelihood_difference| 637.98|± |165.68|
| | |pct_stereotype | 61.54|± | 14.04|
|crows_pairs_french_disability | 0|likelihood_difference|1020.27|± |126.17|
| | |pct_stereotype | 56.06|± | 6.16|
|crows_pairs_french_gender | 0|likelihood_difference|1373.28|± |110.30|
| | |pct_stereotype | 50.16|± | 2.80|
|crows_pairs_french_nationality | 0|likelihood_difference| 985.10|± | 89.08|
| | |pct_stereotype | 38.74|± | 3.07|
|crows_pairs_french_physical_appearance | 0|likelihood_difference| 821.79|± |132.68|
| | |pct_stereotype | 56.94|± | 5.88|
|crows_pairs_french_race_color | 0|likelihood_difference|1061.17|± | 76.68|
| | |pct_stereotype | 41.74|± | 2.30|
|crows_pairs_french_religion | 0|likelihood_difference| 794.02|± | 93.89|
| | |pct_stereotype | 56.52|± | 4.64|
|crows_pairs_french_sexual_orientation | 0|likelihood_difference| 989.08|± |161.13|
| | |pct_stereotype | 71.43|± | 4.76|
|crows_pairs_french_socioeconomic | 0|likelihood_difference| 831.29|± | 87.37|
| | |pct_stereotype | 52.55|± | 3.58|
|ethics_cm | 0|acc | 57.50|± | 0.79|
|ethics_deontology | 0|acc | 54.17|± | 0.83|
| | |em | 6.12| | |
|ethics_justice | 0|acc | 51.70|± | 0.96|
| | |em | 1.33| | |
|ethics_utilitarianism | 0|acc | 50.12|± | 0.72|
|ethics_utilitarianism_original | 0|acc | 93.97|± | 0.34|
|ethics_virtue | 0|acc | 51.82|± | 0.71|
| | |em | 8.14| | |
|toxigen | 0|acc | 42.66|± | 1.61|
| | |acc_norm | 43.19|± | 1.62|
## llama-30B_mathematical_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 3.83|± | 0.20|
| | |f1 |13.91|± | 0.25|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 2.95|± | 0.49|
|math_asdiv | 0|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 4.01|± | 0.90|
|math_geometry | 1|acc | 1.46|± | 0.55|
|math_intermediate_algebra| 1|acc | 0.89|± | 0.31|
|math_num_theory | 1|acc | 2.96|± | 0.73|
|math_prealgebra | 1|acc | 4.13|± | 0.67|
|math_precalc | 1|acc | 1.83|± | 0.57|
|mathqa | 0|acc |30.59|± | 0.84|
| | |acc_norm|30.89|± | 0.85|
## llama-30B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 0.84|± | 0.09|
| | |f1 | 1.65|± | 0.10|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |34.74|± | 0.87|
| | |acc_norm|34.54|± | 0.87|
## llama-30B_mmlu_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|hendrycksTest-abstract_algebra | 0|acc |26.00|± | 4.41|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-anatomy | 0|acc |51.85|± | 4.32|
| | |acc_norm|40.74|± | 4.24|
|hendrycksTest-astronomy | 0|acc |57.24|± | 4.03|
| | |acc_norm|56.58|± | 4.03|
|hendrycksTest-business_ethics | 0|acc |67.00|± | 4.73|
| | |acc_norm|48.00|± | 5.02|
|hendrycksTest-clinical_knowledge | 0|acc |53.21|± | 3.07|
| | |acc_norm|46.42|± | 3.07|
|hendrycksTest-college_biology | 0|acc |61.11|± | 4.08|
| | |acc_norm|42.36|± | 4.13|
|hendrycksTest-college_chemistry | 0|acc |31.00|± | 4.65|
| | |acc_norm|32.00|± | 4.69|
|hendrycksTest-college_computer_science | 0|acc |43.00|± | 4.98|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-college_mathematics | 0|acc |37.00|± | 4.85|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-college_medicine | 0|acc |51.45|± | 3.81|
| | |acc_norm|43.35|± | 3.78|
|hendrycksTest-college_physics | 0|acc |23.53|± | 4.22|
| | |acc_norm|29.41|± | 4.53|
|hendrycksTest-computer_security | 0|acc |66.00|± | 4.76|
| | |acc_norm|58.00|± | 4.96|
|hendrycksTest-conceptual_physics | 0|acc |51.06|± | 3.27|
| | |acc_norm|32.77|± | 3.07|
|hendrycksTest-econometrics | 0|acc |35.09|± | 4.49|
| | |acc_norm|31.58|± | 4.37|
|hendrycksTest-electrical_engineering | 0|acc |51.72|± | 4.16|
| | |acc_norm|38.62|± | 4.06|
|hendrycksTest-elementary_mathematics | 0|acc |44.18|± | 2.56|
| | |acc_norm|37.04|± | 2.49|
|hendrycksTest-formal_logic | 0|acc |42.06|± | 4.42|
| | |acc_norm|39.68|± | 4.38|
|hendrycksTest-global_facts | 0|acc |47.00|± | 5.02|
| | |acc_norm|37.00|± | 4.85|
|hendrycksTest-high_school_biology | 0|acc |67.10|± | 2.67|
| | |acc_norm|54.52|± | 2.83|
|hendrycksTest-high_school_chemistry | 0|acc |39.90|± | 3.45|
| | |acc_norm|36.95|± | 3.40|
|hendrycksTest-high_school_computer_science | 0|acc |61.00|± | 4.90|
| | |acc_norm|47.00|± | 5.02|
|hendrycksTest-high_school_european_history | 0|acc |69.70|± | 3.59|
| | |acc_norm|56.36|± | 3.87|
|hendrycksTest-high_school_geography | 0|acc |75.76|± | 3.05|
| | |acc_norm|55.05|± | 3.54|
|hendrycksTest-high_school_government_and_politics| 0|acc |80.83|± | 2.84|
| | |acc_norm|61.14|± | 3.52|
|hendrycksTest-high_school_macroeconomics | 0|acc |51.54|± | 2.53|
| | |acc_norm|41.54|± | 2.50|
|hendrycksTest-high_school_mathematics | 0|acc |25.93|± | 2.67|
| | |acc_norm|31.48|± | 2.83|
|hendrycksTest-high_school_microeconomics | 0|acc |58.40|± | 3.20|
| | |acc_norm|48.32|± | 3.25|
|hendrycksTest-high_school_physics | 0|acc |31.79|± | 3.80|
| | |acc_norm|31.13|± | 3.78|
|hendrycksTest-high_school_psychology | 0|acc |77.06|± | 1.80|
| | |acc_norm|55.41|± | 2.13|
|hendrycksTest-high_school_statistics | 0|acc |43.52|± | 3.38|
| | |acc_norm|35.65|± | 3.27|
|hendrycksTest-high_school_us_history | 0|acc |72.06|± | 3.15|
| | |acc_norm|55.39|± | 3.49|
|hendrycksTest-high_school_world_history | 0|acc |69.62|± | 2.99|
| | |acc_norm|56.96|± | 3.22|
|hendrycksTest-human_aging | 0|acc |67.26|± | 3.15|
| | |acc_norm|36.32|± | 3.23|
|hendrycksTest-human_sexuality | 0|acc |70.23|± | 4.01|
| | |acc_norm|46.56|± | 4.37|
|hendrycksTest-international_law | 0|acc |70.25|± | 4.17|
| | |acc_norm|76.86|± | 3.85|
|hendrycksTest-jurisprudence | 0|acc |66.67|± | 4.56|
| | |acc_norm|55.56|± | 4.80|
|hendrycksTest-logical_fallacies | 0|acc |69.94|± | 3.60|
| | |acc_norm|53.99|± | 3.92|
|hendrycksTest-machine_learning | 0|acc |40.18|± | 4.65|
| | |acc_norm|30.36|± | 4.36|
|hendrycksTest-management | 0|acc |71.84|± | 4.45|
| | |acc_norm|55.34|± | 4.92|
|hendrycksTest-marketing | 0|acc |84.62|± | 2.36|
| | |acc_norm|76.50|± | 2.78|
|hendrycksTest-medical_genetics | 0|acc |60.00|± | 4.92|
| | |acc_norm|54.00|± | 5.01|
|hendrycksTest-miscellaneous | 0|acc |81.86|± | 1.38|
| | |acc_norm|61.43|± | 1.74|
|hendrycksTest-moral_disputes | 0|acc |61.85|± | 2.62|
| | |acc_norm|45.95|± | 2.68|
|hendrycksTest-moral_scenarios | 0|acc |34.30|± | 1.59|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |61.11|± | 2.79|
| | |acc_norm|50.33|± | 2.86|
|hendrycksTest-philosophy | 0|acc |67.52|± | 2.66|
| | |acc_norm|50.16|± | 2.84|
|hendrycksTest-prehistory | 0|acc |66.36|± | 2.63|
| | |acc_norm|42.90|± | 2.75|
|hendrycksTest-professional_accounting | 0|acc |39.72|± | 2.92|
| | |acc_norm|33.69|± | 2.82|
|hendrycksTest-professional_law | 0|acc |40.03|± | 1.25|
| | |acc_norm|34.35|± | 1.21|
|hendrycksTest-professional_medicine | 0|acc |55.51|± | 3.02|
| | |acc_norm|35.66|± | 2.91|
|hendrycksTest-professional_psychology | 0|acc |58.82|± | 1.99|
| | |acc_norm|43.30|± | 2.00|
|hendrycksTest-public_relations | 0|acc |64.55|± | 4.58|
| | |acc_norm|40.91|± | 4.71|
|hendrycksTest-security_studies | 0|acc |57.14|± | 3.17|
| | |acc_norm|40.41|± | 3.14|
|hendrycksTest-sociology | 0|acc |76.12|± | 3.01|
| | |acc_norm|66.17|± | 3.35|
|hendrycksTest-us_foreign_policy | 0|acc |79.00|± | 4.09|
| | |acc_norm|59.00|± | 4.94|
|hendrycksTest-virology | 0|acc |49.40|± | 3.89|
| | |acc_norm|34.34|± | 3.70|
|hendrycksTest-world_religions | 0|acc |81.29|± | 2.99|
| | |acc_norm|76.61|± | 3.25|
## llama-30B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |58.20|± | 1.10|
|pawsx_en| 0|acc |58.75|± | 1.10|
|pawsx_es| 0|acc |55.80|± | 1.11|
|pawsx_fr| 0|acc |52.85|± | 1.12|
|pawsx_ja| 0|acc |46.75|± | 1.12|
|pawsx_ko| 0|acc |45.70|± | 1.11|
|pawsx_zh| 0|acc |45.90|± | 1.11|
## llama-30B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 47.2|± | 2.23|
|xcopa_ht| 0|acc | 51.8|± | 2.24|
|xcopa_id| 0|acc | 60.6|± | 2.19|
|xcopa_it| 0|acc | 71.4|± | 2.02|
|xcopa_qu| 0|acc | 49.4|± | 2.24|
|xcopa_sw| 0|acc | 52.4|± | 2.24|
|xcopa_ta| 0|acc | 53.2|± | 2.23|
|xcopa_th| 0|acc | 54.6|± | 2.23|
|xcopa_tr| 0|acc | 52.2|± | 2.24|
|xcopa_vi| 0|acc | 52.4|± | 2.24|
|xcopa_zh| 0|acc | 62.2|± | 2.17|
## llama-30B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |34.49|± | 0.67|
|xnli_bg| 0|acc |38.52|± | 0.69|
|xnli_de| 0|acc |43.87|± | 0.70|
|xnli_el| 0|acc |34.91|± | 0.67|
|xnli_en| 0|acc |48.18|± | 0.71|
|xnli_es| 0|acc |40.24|± | 0.69|
|xnli_fr| 0|acc |42.95|± | 0.70|
|xnli_hi| 0|acc |36.47|± | 0.68|
|xnli_ru| 0|acc |38.12|± | 0.69|
|xnli_sw| 0|acc |34.09|± | 0.67|
|xnli_th| 0|acc |33.97|± | 0.67|
|xnli_tr| 0|acc |36.53|± | 0.68|
|xnli_ur| 0|acc |34.31|± | 0.67|
|xnli_vi| 0|acc |35.67|± | 0.68|
|xnli_zh| 0|acc |33.51|± | 0.67|
## llama-30B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |50.89|± | 1.29|
|xstory_cloze_en| 0|acc |78.16|± | 1.06|
|xstory_cloze_es| 0|acc |70.81|± | 1.17|
|xstory_cloze_eu| 0|acc |51.36|± | 1.29|
|xstory_cloze_hi| 0|acc |56.65|± | 1.28|
|xstory_cloze_id| 0|acc |59.23|± | 1.26|
|xstory_cloze_my| 0|acc |48.78|± | 1.29|
|xstory_cloze_ru| 0|acc |66.71|± | 1.21|
|xstory_cloze_sw| 0|acc |50.63|± | 1.29|
|xstory_cloze_te| 0|acc |53.21|± | 1.28|
|xstory_cloze_zh| 0|acc |58.57|± | 1.27|
## llama-30B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |87.40|± | 0.69|
|xwinograd_fr| 0|acc |73.49|± | 4.87|
|xwinograd_jp| 0|acc |67.36|± | 1.51|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |66.98|± | 2.65|
|xwinograd_zh| 0|acc |71.23|± | 2.02|
# llama-7B
## llama-7B_anli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|anli_r1| 0|acc |34.80|± | 1.51|
|anli_r2| 0|acc |33.70|± | 1.50|
|anli_r3| 0|acc |36.58|± | 1.39|
## llama-7B_arithmetic_5-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------------|------:|------|----:|---|-----:|
|arithmetic_1dc| 0|acc | 0|± | 0|
|arithmetic_2da| 0|acc | 0|± | 0|
|arithmetic_2dm| 0|acc | 0|± | 0|
|arithmetic_2ds| 0|acc | 0|± | 0|
|arithmetic_3da| 0|acc | 0|± | 0|
|arithmetic_3ds| 0|acc | 0|± | 0|
|arithmetic_4da| 0|acc | 0|± | 0|
|arithmetic_4ds| 0|acc | 0|± | 0|
|arithmetic_5da| 0|acc | 0|± | 0|
|arithmetic_5ds| 0|acc | 0|± | 0|
## llama-7B_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|48.42|± | 3.64|
|bigbench_date_understanding | 0|multiple_choice_grade|62.06|± | 2.53|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|35.27|± | 2.98|
|bigbench_dyck_languages | 0|multiple_choice_grade|15.40|± | 1.14|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|51.35|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|17.83|± | 2.02|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|49.51|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|29.00|± | 2.03|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|24.57|± | 1.63|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|39.33|± | 2.83|
|bigbench_movie_recommendation | 0|multiple_choice_grade|40.40|± | 2.20|
|bigbench_navigate | 0|multiple_choice_grade|49.50|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|34.60|± | 1.06|
|bigbench_ruin_names | 0|multiple_choice_grade|29.91|± | 2.17|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|16.53|± | 1.18|
|bigbench_snarks | 0|multiple_choice_grade|50.83|± | 3.73|
|bigbench_sports_understanding | 0|multiple_choice_grade|50.00|± | 1.59|
|bigbench_temporal_sequences | 0|multiple_choice_grade|27.20|± | 1.41|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|18.24|± | 1.09|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|13.71|± | 0.82|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|39.33|± | 2.83|
## llama-7B_blimp_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------------------------------------------------|------:|------|----:|---|-----:|
|blimp_adjunct_island | 0|acc | 53.9|± | 1.58|
|blimp_anaphor_gender_agreement | 0|acc | 44.8|± | 1.57|
|blimp_anaphor_number_agreement | 0|acc | 65.9|± | 1.50|
|blimp_animate_subject_passive | 0|acc | 62.6|± | 1.53|
|blimp_animate_subject_trans | 0|acc | 76.1|± | 1.35|
|blimp_causative | 0|acc | 50.8|± | 1.58|
|blimp_complex_NP_island | 0|acc | 41.6|± | 1.56|
|blimp_coordinate_structure_constraint_complex_left_branch| 0|acc | 68.2|± | 1.47|
|blimp_coordinate_structure_constraint_object_extraction | 0|acc | 62.9|± | 1.53|
|blimp_determiner_noun_agreement_1 | 0|acc | 63.6|± | 1.52|
|blimp_determiner_noun_agreement_2 | 0|acc | 59.8|± | 1.55|
|blimp_determiner_noun_agreement_irregular_1 | 0|acc | 57.2|± | 1.57|
|blimp_determiner_noun_agreement_irregular_2 | 0|acc | 60.2|± | 1.55|
|blimp_determiner_noun_agreement_with_adj_2 | 0|acc | 54.0|± | 1.58|
|blimp_determiner_noun_agreement_with_adj_irregular_1 | 0|acc | 56.3|± | 1.57|
|blimp_determiner_noun_agreement_with_adj_irregular_2 | 0|acc | 59.1|± | 1.56|
|blimp_determiner_noun_agreement_with_adjective_1 | 0|acc | 57.7|± | 1.56|
|blimp_distractor_agreement_relational_noun | 0|acc | 44.1|± | 1.57|
|blimp_distractor_agreement_relative_clause | 0|acc | 31.4|± | 1.47|
|blimp_drop_argument | 0|acc | 70.1|± | 1.45|
|blimp_ellipsis_n_bar_1 | 0|acc | 66.8|± | 1.49|
|blimp_ellipsis_n_bar_2 | 0|acc | 79.4|± | 1.28|
|blimp_existential_there_object_raising | 0|acc | 78.8|± | 1.29|
|blimp_existential_there_quantifiers_1 | 0|acc | 68.3|± | 1.47|
|blimp_existential_there_quantifiers_2 | 0|acc | 67.4|± | 1.48|
|blimp_existential_there_subject_raising | 0|acc | 69.6|± | 1.46|
|blimp_expletive_it_object_raising | 0|acc | 65.9|± | 1.50|
|blimp_inchoative | 0|acc | 42.0|± | 1.56|
|blimp_intransitive | 0|acc | 59.2|± | 1.55|
|blimp_irregular_past_participle_adjectives | 0|acc | 42.9|± | 1.57|
|blimp_irregular_past_participle_verbs | 0|acc | 72.5|± | 1.41|
|blimp_irregular_plural_subject_verb_agreement_1 | 0|acc | 65.3|± | 1.51|
|blimp_irregular_plural_subject_verb_agreement_2 | 0|acc | 70.0|± | 1.45|
|blimp_left_branch_island_echo_question | 0|acc | 83.5|± | 1.17|
|blimp_left_branch_island_simple_question | 0|acc | 74.0|± | 1.39|
|blimp_matrix_question_npi_licensor_present | 0|acc | 11.7|± | 1.02|
|blimp_npi_present_1 | 0|acc | 53.4|± | 1.58|
|blimp_npi_present_2 | 0|acc | 53.0|± | 1.58|
|blimp_only_npi_licensor_present | 0|acc | 81.4|± | 1.23|
|blimp_only_npi_scope | 0|acc | 26.6|± | 1.40|
|blimp_passive_1 | 0|acc | 70.2|± | 1.45|
|blimp_passive_2 | 0|acc | 70.3|± | 1.45|
|blimp_principle_A_c_command | 0|acc | 39.0|± | 1.54|
|blimp_principle_A_case_1 | 0|acc | 98.5|± | 0.38|
|blimp_principle_A_case_2 | 0|acc | 55.4|± | 1.57|
|blimp_principle_A_domain_1 | 0|acc | 96.2|± | 0.60|
|blimp_principle_A_domain_2 | 0|acc | 64.6|± | 1.51|
|blimp_principle_A_domain_3 | 0|acc | 50.1|± | 1.58|
|blimp_principle_A_reconstruction | 0|acc | 67.3|± | 1.48|
|blimp_regular_plural_subject_verb_agreement_1 | 0|acc | 64.5|± | 1.51|
|blimp_regular_plural_subject_verb_agreement_2 | 0|acc | 70.5|± | 1.44|
|blimp_sentential_negation_npi_licensor_present | 0|acc | 94.0|± | 0.75|
|blimp_sentential_negation_npi_scope | 0|acc | 58.8|± | 1.56|
|blimp_sentential_subject_island | 0|acc | 60.6|± | 1.55|
|blimp_superlative_quantifiers_1 | 0|acc | 61.2|± | 1.54|
|blimp_superlative_quantifiers_2 | 0|acc | 56.1|± | 1.57|
|blimp_tough_vs_raising_1 | 0|acc | 29.8|± | 1.45|
|blimp_tough_vs_raising_2 | 0|acc | 76.8|± | 1.34|
|blimp_transitive | 0|acc | 69.8|± | 1.45|
|blimp_wh_island | 0|acc | 27.5|± | 1.41|
|blimp_wh_questions_object_gap | 0|acc | 67.0|± | 1.49|
|blimp_wh_questions_subject_gap | 0|acc | 72.0|± | 1.42|
|blimp_wh_questions_subject_gap_long_distance | 0|acc | 74.6|± | 1.38|
|blimp_wh_vs_that_no_gap | 0|acc | 84.8|± | 1.14|
|blimp_wh_vs_that_no_gap_long_distance | 0|acc | 81.2|± | 1.24|
|blimp_wh_vs_that_with_gap | 0|acc | 23.9|± | 1.35|
|blimp_wh_vs_that_with_gap_long_distance | 0|acc | 22.7|± | 1.33|
## llama-7B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |38.23|± | 1.42|
| | |acc_norm|41.38|± | 1.44|
|arc_easy | 0|acc |67.38|± | 0.96|
| | |acc_norm|52.48|± | 1.02|
|boolq | 1|acc |73.06|± | 0.78|
|copa | 0|acc |84.00|± | 3.68|
|hellaswag | 0|acc |56.39|± | 0.49|
| | |acc_norm|72.98|± | 0.44|
|mc_taco | 0|em |11.26| | |
| | |f1 |48.27| | |
|openbookqa | 0|acc |28.20|± | 2.01|
| | |acc_norm|42.40|± | 2.21|
|piqa | 0|acc |78.18|± | 0.96|
| | |acc_norm|77.42|± | 0.98|
|prost | 0|acc |25.69|± | 0.32|
| | |acc_norm|28.03|± | 0.33|
|swag | 0|acc |55.47|± | 0.35|
| | |acc_norm|66.87|± | 0.33|
|winogrande | 0|acc |66.93|± | 1.32|
|wsc273 | 0|acc |80.95|± | 2.38|
## llama-7B_glue_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|cola | 0|mcc | 0.00|± | 0.00|
|mnli | 0|acc |34.40|± | 0.48|
|mnli_mismatched| 0|acc |35.72|± | 0.48|
|mrpc | 0|acc |68.38|± | 2.30|
| | |f1 |81.22|± | 1.62|
|qnli | 0|acc |49.57|± | 0.68|
|qqp | 0|acc |36.84|± | 0.24|
| | |f1 |53.81|± | 0.26|
|rte | 0|acc |53.07|± | 3.00|
|sst | 0|acc |52.98|± | 1.69|
|wnli | 1|acc |56.34|± | 5.93|
## llama-7B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 8.04|± | 0.75|
## llama-7B_human_alignment_0-shot.json
| Task |Version| Metric | Value | |Stderr|
|---------------------------------------|------:|---------------------|------:|---|-----:|
|crows_pairs_english_age | 0|likelihood_difference| 594.23|± | 79.03|
| | |pct_stereotype | 51.65|± | 5.27|
|crows_pairs_english_autre | 0|likelihood_difference|1101.14|± |589.08|
| | |pct_stereotype | 45.45|± | 15.75|
|crows_pairs_english_disability | 0|likelihood_difference| 966.97|± |113.86|
| | |pct_stereotype | 66.15|± | 5.91|
|crows_pairs_english_gender | 0|likelihood_difference| 791.74|± | 55.02|
| | |pct_stereotype | 53.12|± | 2.79|
|crows_pairs_english_nationality | 0|likelihood_difference| 676.26|± | 58.69|
| | |pct_stereotype | 53.70|± | 3.40|
|crows_pairs_english_physical_appearance| 0|likelihood_difference| 451.26|± | 69.32|
| | |pct_stereotype | 50.00|± | 5.93|
|crows_pairs_english_race_color | 0|likelihood_difference| 624.65|± | 32.39|
| | |pct_stereotype | 46.65|± | 2.22|
|crows_pairs_english_religion | 0|likelihood_difference| 721.96|± | 75.92|
| | |pct_stereotype | 66.67|± | 4.49|
|crows_pairs_english_sexual_orientation | 0|likelihood_difference| 830.48|± | 84.28|
| | |pct_stereotype | 62.37|± | 5.05|
|crows_pairs_english_socioeconomic | 0|likelihood_difference| 640.16|± | 54.20|
| | |pct_stereotype | 56.84|± | 3.60|
|crows_pairs_french_age | 0|likelihood_difference|1193.96|± |153.77|
| | |pct_stereotype | 35.56|± | 5.07|
|crows_pairs_french_autre | 0|likelihood_difference| 751.20|± |209.58|
| | |pct_stereotype | 61.54|± | 14.04|
|crows_pairs_french_disability | 0|likelihood_difference|1014.77|± |139.07|
| | |pct_stereotype | 42.42|± | 6.13|
|crows_pairs_french_gender | 0|likelihood_difference|1179.90|± | 87.14|
| | |pct_stereotype | 52.02|± | 2.79|
|crows_pairs_french_nationality | 0|likelihood_difference|1041.65|± | 90.66|
| | |pct_stereotype | 40.71|± | 3.09|
|crows_pairs_french_physical_appearance | 0|likelihood_difference| 704.51|± | 94.84|
| | |pct_stereotype | 55.56|± | 5.90|
|crows_pairs_french_race_color | 0|likelihood_difference|1204.89|± | 73.32|
| | |pct_stereotype | 43.26|± | 2.31|
|crows_pairs_french_religion | 0|likelihood_difference| 958.53|± | 87.50|
| | |pct_stereotype | 43.48|± | 4.64|
|crows_pairs_french_sexual_orientation | 0|likelihood_difference| 760.58|± | 79.39|
| | |pct_stereotype | 67.03|± | 4.96|
|crows_pairs_french_socioeconomic | 0|likelihood_difference| 980.84|± |101.51|
| | |pct_stereotype | 52.04|± | 3.58|
|ethics_cm | 0|acc | 56.91|± | 0.79|
|ethics_deontology | 0|acc | 50.58|± | 0.83|
| | |em | 0.22| | |
|ethics_justice | 0|acc | 49.96|± | 0.96|
| | |em | 0.15| | |
|ethics_utilitarianism | 0|acc | 49.81|± | 0.72|
|ethics_utilitarianism_original | 0|acc | 95.86|± | 0.29|
|ethics_virtue | 0|acc | 20.98|± | 0.58|
| | |em | 0.00| | |
|toxigen | 0|acc | 43.09|± | 1.62|
| | |acc_norm | 43.19|± | 1.62|
## llama-7B_lambada_0-shot.json
| Task |Version|Metric| Value | | Stderr |
|----------------------|------:|------|---------:|---|--------:|
|lambada_openai | 0|ppl |2817465.09|± |138319.09|
| | |acc | 0.00|± | 0.00|
|lambada_openai_cloze | 0|ppl | 255777.71|± | 11345.77|
| | |acc | 0.04|± | 0.03|
|lambada_openai_mt_de | 0|ppl |1805613.68|± | 97892.79|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_en | 0|ppl |2817465.09|± |138319.09|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_es | 0|ppl |3818890.45|± |197999.05|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_fr | 0|ppl |2111186.12|± |111724.43|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_it | 0|ppl |3653680.57|± |197082.99|
| | |acc | 0.00|± | 0.00|
|lambada_standard | 0|ppl |2460346.86|± | 81216.57|
| | |acc | 0.00|± | 0.00|
|lambada_standard_cloze| 0|ppl |6710057.24|± |169833.91|
| | |acc | 0.00|± | 0.00|
## llama-7B_mathematical_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 4.27|± | 0.21|
| | |f1 |12.16|± | 0.25|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 1.68|± | 0.37|
|math_asdiv | 0|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 1.69|± | 0.59|
|math_geometry | 1|acc | 0.84|± | 0.42|
|math_intermediate_algebra| 1|acc | 0.66|± | 0.27|
|math_num_theory | 1|acc | 0.74|± | 0.37|
|math_prealgebra | 1|acc | 1.26|± | 0.38|
|math_precalc | 1|acc | 0.37|± | 0.26|
|mathqa | 0|acc |26.77|± | 0.81|
| | |acc_norm|27.87|± | 0.82|
## llama-7B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.24|± | 0.11|
| | |f1 | 2.10|± | 0.13|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |28.21|± | 0.82|
| | |acc_norm|28.78|± | 0.83|
## llama-7B_mmlu_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|hendrycksTest-abstract_algebra | 0|acc |23.00|± | 4.23|
| | |acc_norm|26.00|± | 4.41|
|hendrycksTest-anatomy | 0|acc |38.52|± | 4.20|
| | |acc_norm|28.15|± | 3.89|
|hendrycksTest-astronomy | 0|acc |45.39|± | 4.05|
| | |acc_norm|46.05|± | 4.06|
|hendrycksTest-business_ethics | 0|acc |53.00|± | 5.02|
| | |acc_norm|46.00|± | 5.01|
|hendrycksTest-clinical_knowledge | 0|acc |38.87|± | 3.00|
| | |acc_norm|38.11|± | 2.99|
|hendrycksTest-college_biology | 0|acc |31.94|± | 3.90|
| | |acc_norm|29.17|± | 3.80|
|hendrycksTest-college_chemistry | 0|acc |33.00|± | 4.73|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-college_computer_science | 0|acc |33.00|± | 4.73|
| | |acc_norm|28.00|± | 4.51|
|hendrycksTest-college_mathematics | 0|acc |32.00|± | 4.69|
| | |acc_norm|32.00|± | 4.69|
|hendrycksTest-college_medicine | 0|acc |37.57|± | 3.69|
| | |acc_norm|30.64|± | 3.51|
|hendrycksTest-college_physics | 0|acc |23.53|± | 4.22|
| | |acc_norm|32.35|± | 4.66|
|hendrycksTest-computer_security | 0|acc |37.00|± | 4.85|
| | |acc_norm|44.00|± | 4.99|
|hendrycksTest-conceptual_physics | 0|acc |32.77|± | 3.07|
| | |acc_norm|21.70|± | 2.69|
|hendrycksTest-econometrics | 0|acc |28.95|± | 4.27|
| | |acc_norm|26.32|± | 4.14|
|hendrycksTest-electrical_engineering | 0|acc |35.86|± | 4.00|
| | |acc_norm|32.41|± | 3.90|
|hendrycksTest-elementary_mathematics | 0|acc |32.01|± | 2.40|
| | |acc_norm|29.10|± | 2.34|
|hendrycksTest-formal_logic | 0|acc |30.95|± | 4.13|
| | |acc_norm|34.92|± | 4.26|
|hendrycksTest-global_facts | 0|acc |32.00|± | 4.69|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-high_school_biology | 0|acc |35.81|± | 2.73|
| | |acc_norm|35.81|± | 2.73|
|hendrycksTest-high_school_chemistry | 0|acc |25.12|± | 3.05|
| | |acc_norm|29.56|± | 3.21|
|hendrycksTest-high_school_computer_science | 0|acc |41.00|± | 4.94|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-high_school_european_history | 0|acc |40.61|± | 3.83|
| | |acc_norm|36.97|± | 3.77|
|hendrycksTest-high_school_geography | 0|acc |42.93|± | 3.53|
| | |acc_norm|36.36|± | 3.43|
|hendrycksTest-high_school_government_and_politics| 0|acc |48.19|± | 3.61|
| | |acc_norm|37.31|± | 3.49|
|hendrycksTest-high_school_macroeconomics | 0|acc |31.79|± | 2.36|
| | |acc_norm|30.26|± | 2.33|
|hendrycksTest-high_school_mathematics | 0|acc |22.59|± | 2.55|
| | |acc_norm|30.74|± | 2.81|
|hendrycksTest-high_school_microeconomics | 0|acc |38.66|± | 3.16|
| | |acc_norm|36.55|± | 3.13|
|hendrycksTest-high_school_physics | 0|acc |20.53|± | 3.30|
| | |acc_norm|27.15|± | 3.63|
|hendrycksTest-high_school_psychology | 0|acc |46.61|± | 2.14|
| | |acc_norm|30.83|± | 1.98|
|hendrycksTest-high_school_statistics | 0|acc |34.26|± | 3.24|
| | |acc_norm|34.26|± | 3.24|
|hendrycksTest-high_school_us_history | 0|acc |42.65|± | 3.47|
| | |acc_norm|31.37|± | 3.26|
|hendrycksTest-high_school_world_history | 0|acc |39.24|± | 3.18|
| | |acc_norm|33.76|± | 3.08|
|hendrycksTest-human_aging | 0|acc |37.22|± | 3.24|
| | |acc_norm|25.11|± | 2.91|
|hendrycksTest-human_sexuality | 0|acc |51.15|± | 4.38|
| | |acc_norm|36.64|± | 4.23|
|hendrycksTest-international_law | 0|acc |38.84|± | 4.45|
| | |acc_norm|57.85|± | 4.51|
|hendrycksTest-jurisprudence | 0|acc |43.52|± | 4.79|
| | |acc_norm|50.00|± | 4.83|
|hendrycksTest-logical_fallacies | 0|acc |38.04|± | 3.81|
| | |acc_norm|34.97|± | 3.75|
|hendrycksTest-machine_learning | 0|acc |30.36|± | 4.36|
| | |acc_norm|26.79|± | 4.20|
|hendrycksTest-management | 0|acc |48.54|± | 4.95|
| | |acc_norm|36.89|± | 4.78|
|hendrycksTest-marketing | 0|acc |61.11|± | 3.19|
| | |acc_norm|50.43|± | 3.28|
|hendrycksTest-medical_genetics | 0|acc |44.00|± | 4.99|
| | |acc_norm|40.00|± | 4.92|
|hendrycksTest-miscellaneous | 0|acc |58.37|± | 1.76|
| | |acc_norm|38.95|± | 1.74|
|hendrycksTest-moral_disputes | 0|acc |36.42|± | 2.59|
| | |acc_norm|33.24|± | 2.54|
|hendrycksTest-moral_scenarios | 0|acc |27.60|± | 1.50|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |39.54|± | 2.80|
| | |acc_norm|43.79|± | 2.84|
|hendrycksTest-philosophy | 0|acc |40.19|± | 2.78|
| | |acc_norm|35.37|± | 2.72|
|hendrycksTest-prehistory | 0|acc |40.12|± | 2.73|
| | |acc_norm|27.78|± | 2.49|
|hendrycksTest-professional_accounting | 0|acc |30.14|± | 2.74|
| | |acc_norm|29.43|± | 2.72|
|hendrycksTest-professional_law | 0|acc |29.66|± | 1.17|
| | |acc_norm|28.55|± | 1.15|
|hendrycksTest-professional_medicine | 0|acc |33.82|± | 2.87|
| | |acc_norm|27.94|± | 2.73|
|hendrycksTest-professional_psychology | 0|acc |38.40|± | 1.97|
| | |acc_norm|29.90|± | 1.85|
|hendrycksTest-public_relations | 0|acc |39.09|± | 4.67|
| | |acc_norm|22.73|± | 4.01|
|hendrycksTest-security_studies | 0|acc |40.82|± | 3.15|
| | |acc_norm|31.02|± | 2.96|
|hendrycksTest-sociology | 0|acc |47.76|± | 3.53|
| | |acc_norm|42.79|± | 3.50|
|hendrycksTest-us_foreign_policy | 0|acc |56.00|± | 4.99|
| | |acc_norm|45.00|± | 5.00|
|hendrycksTest-virology | 0|acc |39.76|± | 3.81|
| | |acc_norm|28.92|± | 3.53|
|hendrycksTest-world_religions | 0|acc |62.57|± | 3.71|
| | |acc_norm|51.46|± | 3.83|
## llama-7B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |54.65|± | 1.11|
|pawsx_en| 0|acc |61.85|± | 1.09|
|pawsx_es| 0|acc |56.10|± | 1.11|
|pawsx_fr| 0|acc |52.95|± | 1.12|
|pawsx_ja| 0|acc |56.70|± | 1.11|
|pawsx_ko| 0|acc |49.70|± | 1.12|
|pawsx_zh| 0|acc |49.10|± | 1.12|
## llama-7B_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |32.42|± | 0.89|
| | |acc_norm |35.92|± | 0.92|
|headqa_es | 0|acc |28.26|± | 0.86|
| | |acc_norm |32.42|± | 0.89|
|logiqa | 0|acc |21.81|± | 1.62|
| | |acc_norm |30.26|± | 1.80|
|squad2 | 1|exact | 9.42| | |
| | |f1 |19.45| | |
| | |HasAns_exact|18.49| | |
| | |HasAns_f1 |38.58| | |
| | |NoAns_exact | 0.37| | |
| | |NoAns_f1 | 0.37| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.08| | |
|triviaqa | 1|acc | 0.00|± | 0.00|
|truthfulqa_mc| 1|mc1 |21.05|± | 1.43|
| | |mc2 |34.14|± | 1.31|
|webqs | 0|acc | 0.00|± | 0.00|
## llama-7B_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |75.21|± | 1.53|
| | |em |62.67|± | 1.88|
|drop| 1|em | 3.59|± | 0.19|
| | |f1 |11.35|± | 0.23|
|race| 1|acc |39.90|± | 1.52|
## llama-7B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.8|± | 2.24|
|xcopa_ht| 0|acc | 51.0|± | 2.24|
|xcopa_id| 0|acc | 54.6|± | 2.23|
|xcopa_it| 0|acc | 62.0|± | 2.17|
|xcopa_qu| 0|acc | 51.4|± | 2.24|
|xcopa_sw| 0|acc | 50.8|± | 2.24|
|xcopa_ta| 0|acc | 55.2|± | 2.23|
|xcopa_th| 0|acc | 55.8|± | 2.22|
|xcopa_tr| 0|acc | 55.6|± | 2.22|
|xcopa_vi| 0|acc | 51.6|± | 2.24|
|xcopa_zh| 0|acc | 56.2|± | 2.22|
## llama-7B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.57|± | 0.67|
|xnli_bg| 0|acc |36.99|± | 0.68|
|xnli_de| 0|acc |44.77|± | 0.70|
|xnli_el| 0|acc |34.93|± | 0.67|
|xnli_en| 0|acc |51.06|± | 0.71|
|xnli_es| 0|acc |40.62|± | 0.69|
|xnli_fr| 0|acc |43.75|± | 0.70|
|xnli_hi| 0|acc |36.11|± | 0.68|
|xnli_ru| 0|acc |39.36|± | 0.69|
|xnli_sw| 0|acc |33.71|± | 0.67|
|xnli_th| 0|acc |34.51|± | 0.67|
|xnli_tr| 0|acc |35.59|± | 0.68|
|xnli_ur| 0|acc |33.39|± | 0.67|
|xnli_vi| 0|acc |35.59|± | 0.68|
|xnli_zh| 0|acc |36.23|± | 0.68|
## llama-7B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |48.31|± | 1.29|
|xstory_cloze_en| 0|acc |74.78|± | 1.12|
|xstory_cloze_es| 0|acc |65.12|± | 1.23|
|xstory_cloze_eu| 0|acc |50.10|± | 1.29|
|xstory_cloze_hi| 0|acc |52.68|± | 1.28|
|xstory_cloze_id| 0|acc |52.08|± | 1.29|
|xstory_cloze_my| 0|acc |48.71|± | 1.29|
|xstory_cloze_ru| 0|acc |61.35|± | 1.25|
|xstory_cloze_sw| 0|acc |50.36|± | 1.29|
|xstory_cloze_te| 0|acc |52.88|± | 1.28|
|xstory_cloze_zh| 0|acc |54.33|± | 1.28|
## llama-7B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |84.95|± | 0.74|
|xwinograd_fr| 0|acc |72.29|± | 4.94|
|xwinograd_jp| 0|acc |58.92|± | 1.59|
|xwinograd_pt| 0|acc |70.72|± | 2.81|
|xwinograd_ru| 0|acc |64.44|± | 2.70|
|xwinograd_zh| 0|acc |63.69|± | 2.14|
# xglm-1.7B
## xglm-1.7B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |20.99|± | 1.19|
| | |acc_norm|24.32|± | 1.25|
|arc_easy | 0|acc |53.62|± | 1.02|
| | |acc_norm|47.90|± | 1.03|
|boolq | 1|acc |58.56|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |36.18|± | 0.48|
| | |acc_norm|45.80|± | 0.50|
|mc_taco | 0|em |12.91| | |
| | |f1 |34.52| | |
|openbookqa | 0|acc |17.00|± | 1.68|
| | |acc_norm|29.80|± | 2.05|
|piqa | 0|acc |69.70|± | 1.07|
| | |acc_norm|70.35|± | 1.07|
|prost | 0|acc |22.69|± | 0.31|
| | |acc_norm|27.21|± | 0.33|
|swag | 0|acc |45.97|± | 0.35|
| | |acc_norm|62.19|± | 0.34|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.13|± | 2.83|
## xglm-1.7B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.99|± | 0.27|
## xglm-1.7B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 0.67|± | 0.08|
| | |f1 | 3.44|± | 0.13|
|gsm8k | 0|acc | 0.83|± | 0.25|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |22.91|± | 0.77|
| | |acc_norm|21.44|± | 0.75|
## xglm-1.7B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |57.55|± | 1.11|
|pawsx_en| 0|acc |52.65|± | 1.12|
|pawsx_es| 0|acc |53.80|± | 1.12|
|pawsx_fr| 0|acc |47.35|± | 1.12|
|pawsx_ja| 0|acc |46.10|± | 1.11|
|pawsx_ko| 0|acc |51.40|± | 1.12|
|pawsx_zh| 0|acc |48.10|± | 1.12|
## xglm-1.7B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 56.8|± | 2.22|
|xcopa_ht| 0|acc | 55.8|± | 2.22|
|xcopa_id| 0|acc | 64.6|± | 2.14|
|xcopa_it| 0|acc | 54.0|± | 2.23|
|xcopa_qu| 0|acc | 52.2|± | 2.24|
|xcopa_sw| 0|acc | 56.6|± | 2.22|
|xcopa_ta| 0|acc | 55.2|± | 2.23|
|xcopa_th| 0|acc | 58.2|± | 2.21|
|xcopa_tr| 0|acc | 53.4|± | 2.23|
|xcopa_vi| 0|acc | 63.0|± | 2.16|
|xcopa_zh| 0|acc | 58.0|± | 2.21|
## xglm-1.7B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.51|± | 0.67|
|xnli_bg| 0|acc |44.73|± | 0.70|
|xnli_de| 0|acc |45.33|± | 0.70|
|xnli_el| 0|acc |40.10|± | 0.69|
|xnli_en| 0|acc |49.68|± | 0.71|
|xnli_es| 0|acc |43.61|± | 0.70|
|xnli_fr| 0|acc |45.73|± | 0.70|
|xnli_hi| 0|acc |42.61|± | 0.70|
|xnli_ru| 0|acc |45.97|± | 0.70|
|xnli_sw| 0|acc |42.00|± | 0.70|
|xnli_th| 0|acc |41.70|± | 0.70|
|xnli_tr| 0|acc |42.95|± | 0.70|
|xnli_ur| 0|acc |39.50|± | 0.69|
|xnli_vi| 0|acc |45.03|± | 0.70|
|xnli_zh| 0|acc |33.77|± | 0.67|
## xglm-1.7B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.48|± | 1.29|
|xstory_cloze_en| 0|acc |64.33|± | 1.23|
|xstory_cloze_es| 0|acc |59.23|± | 1.26|
|xstory_cloze_eu| 0|acc |56.12|± | 1.28|
|xstory_cloze_hi| 0|acc |55.79|± | 1.28|
|xstory_cloze_id| 0|acc |57.97|± | 1.27|
|xstory_cloze_my| 0|acc |53.81|± | 1.28|
|xstory_cloze_ru| 0|acc |59.83|± | 1.26|
|xstory_cloze_sw| 0|acc |55.99|± | 1.28|
|xstory_cloze_te| 0|acc |58.04|± | 1.27|
|xstory_cloze_zh| 0|acc |56.19|± | 1.28|
## xglm-1.7B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |71.05|± | 0.94|
|xwinograd_fr| 0|acc |60.24|± | 5.40|
|xwinograd_jp| 0|acc |60.58|± | 1.58|
|xwinograd_pt| 0|acc |63.88|± | 2.97|
|xwinograd_ru| 0|acc |59.68|± | 2.77|
|xwinograd_zh| 0|acc |69.84|± | 2.05|
# xglm-2.9B
## xglm-2.9B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.46|± | 1.24|
| | |acc_norm|27.39|± | 1.30|
|arc_easy | 0|acc |56.65|± | 1.02|
| | |acc_norm|53.37|± | 1.02|
|boolq | 1|acc |61.44|± | 0.85|
|copa | 0|acc |74.00|± | 4.41|
|hellaswag | 0|acc |40.92|± | 0.49|
| | |acc_norm|53.70|± | 0.50|
|mc_taco | 0|em |11.94| | |
| | |f1 |47.80| | |
|openbookqa | 0|acc |21.60|± | 1.84|
| | |acc_norm|33.20|± | 2.11|
|piqa | 0|acc |71.27|± | 1.06|
| | |acc_norm|73.01|± | 1.04|
|prost | 0|acc |21.92|± | 0.30|
| | |acc_norm|26.64|± | 0.32|
|swag | 0|acc |48.49|± | 0.35|
| | |acc_norm|65.78|± | 0.34|
|winogrande | 0|acc |54.62|± | 1.40|
|wsc273 | 0|acc |71.06|± | 2.75|
## xglm-2.9B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |50.65|± | 1.12|
|pawsx_en| 0|acc |54.75|± | 1.11|
|pawsx_es| 0|acc |53.15|± | 1.12|
|pawsx_fr| 0|acc |49.70|± | 1.12|
|pawsx_ja| 0|acc |50.95|± | 1.12|
|pawsx_ko| 0|acc |46.75|± | 1.12|
|pawsx_zh| 0|acc |53.70|± | 1.12|
## xglm-2.9B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 58.2|± | 2.21|
|xcopa_ht| 0|acc | 55.8|± | 2.22|
|xcopa_id| 0|acc | 66.8|± | 2.11|
|xcopa_it| 0|acc | 60.2|± | 2.19|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 58.8|± | 2.20|
|xcopa_ta| 0|acc | 54.2|± | 2.23|
|xcopa_th| 0|acc | 57.0|± | 2.22|
|xcopa_tr| 0|acc | 56.6|± | 2.22|
|xcopa_vi| 0|acc | 65.2|± | 2.13|
|xcopa_zh| 0|acc | 60.0|± | 2.19|
## xglm-2.9B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.65|± | 0.67|
|xnli_bg| 0|acc |45.97|± | 0.70|
|xnli_de| 0|acc |48.32|± | 0.71|
|xnli_el| 0|acc |41.40|± | 0.70|
|xnli_en| 0|acc |51.08|± | 0.71|
|xnli_es| 0|acc |46.67|± | 0.70|
|xnli_fr| 0|acc |45.03|± | 0.70|
|xnli_hi| 0|acc |44.03|± | 0.70|
|xnli_ru| 0|acc |45.29|± | 0.70|
|xnli_sw| 0|acc |44.43|± | 0.70|
|xnli_th| 0|acc |41.98|± | 0.70|
|xnli_tr| 0|acc |44.97|± | 0.70|
|xnli_ur| 0|acc |40.10|± | 0.69|
|xnli_vi| 0|acc |45.99|± | 0.70|
|xnli_zh| 0|acc |34.81|± | 0.67|
## xglm-2.9B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |53.87|± | 1.28|
|xstory_cloze_en| 0|acc |67.31|± | 1.21|
|xstory_cloze_es| 0|acc |60.95|± | 1.26|
|xstory_cloze_eu| 0|acc |56.32|± | 1.28|
|xstory_cloze_hi| 0|acc |57.51|± | 1.27|
|xstory_cloze_id| 0|acc |61.35|± | 1.25|
|xstory_cloze_my| 0|acc |55.20|± | 1.28|
|xstory_cloze_ru| 0|acc |62.21|± | 1.25|
|xstory_cloze_sw| 0|acc |56.72|± | 1.28|
|xstory_cloze_te| 0|acc |60.03|± | 1.26|
|xstory_cloze_zh| 0|acc |57.64|± | 1.27|
## xglm-2.9B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |75.61|± | 0.89|
|xwinograd_fr| 0|acc |59.04|± | 5.43|
|xwinograd_jp| 0|acc |64.65|± | 1.54|
|xwinograd_pt| 0|acc |66.16|± | 2.92|
|xwinograd_ru| 0|acc |62.86|± | 2.73|
|xwinograd_zh| 0|acc |71.63|± | 2.01|
# xglm-4.5B
## xglm-4.5B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |27.13|± | 1.30|
| | |acc_norm|28.16|± | 1.31|
|arc_easy | 0|acc |60.31|± | 1.00|
| | |acc_norm|57.24|± | 1.02|
|boolq | 1|acc |61.19|± | 0.85|
|copa | 0|acc |81.00|± | 3.94|
|hellaswag | 0|acc |43.77|± | 0.50|
| | |acc_norm|58.24|± | 0.49|
|mc_taco | 0|em |15.39| | |
| | |f1 |43.51| | |
|openbookqa | 0|acc |23.20|± | 1.89|
| | |acc_norm|34.40|± | 2.13|
|piqa | 0|acc |72.74|± | 1.04|
| | |acc_norm|72.96|± | 1.04|
|prost | 0|acc |26.43|± | 0.32|
| | |acc_norm|26.28|± | 0.32|
|swag | 0|acc |49.65|± | 0.35|
| | |acc_norm|67.87|± | 0.33|
|winogrande | 0|acc |56.12|± | 1.39|
|wsc273 | 0|acc |71.79|± | 2.73|
## xglm-4.5B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.08|± | 0.08|
## xglm-4.5B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.65|± | 1.12|
|pawsx_en| 0|acc |55.40|± | 1.11|
|pawsx_es| 0|acc |51.05|± | 1.12|
|pawsx_fr| 0|acc |51.60|± | 1.12|
|pawsx_ja| 0|acc |47.75|± | 1.12|
|pawsx_ko| 0|acc |49.10|± | 1.12|
|pawsx_zh| 0|acc |54.60|± | 1.11|
## xglm-4.5B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 55.0|± | 2.23|
|xcopa_ht| 0|acc | 51.2|± | 2.24|
|xcopa_id| 0|acc | 67.0|± | 2.10|
|xcopa_it| 0|acc | 61.6|± | 2.18|
|xcopa_qu| 0|acc | 50.0|± | 2.24|
|xcopa_sw| 0|acc | 56.2|± | 2.22|
|xcopa_ta| 0|acc | 55.6|± | 2.22|
|xcopa_th| 0|acc | 55.2|± | 2.23|
|xcopa_tr| 0|acc | 57.2|± | 2.21|
|xcopa_vi| 0|acc | 66.0|± | 2.12|
|xcopa_zh| 0|acc | 61.6|± | 2.18|
## xglm-4.5B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.59|± | 0.67|
|xnli_bg| 0|acc |45.61|± | 0.70|
|xnli_de| 0|acc |47.11|± | 0.71|
|xnli_el| 0|acc |39.84|± | 0.69|
|xnli_en| 0|acc |53.63|± | 0.70|
|xnli_es| 0|acc |47.68|± | 0.71|
|xnli_fr| 0|acc |47.31|± | 0.71|
|xnli_hi| 0|acc |42.50|± | 0.70|
|xnli_ru| 0|acc |46.15|± | 0.70|
|xnli_sw| 0|acc |39.58|± | 0.69|
|xnli_th| 0|acc |39.68|± | 0.69|
|xnli_tr| 0|acc |44.85|± | 0.70|
|xnli_ur| 0|acc |37.47|± | 0.68|
|xnli_vi| 0|acc |45.87|± | 0.70|
|xnli_zh| 0|acc |34.77|± | 0.67|
## xglm-4.5B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |53.67|± | 1.28|
|xstory_cloze_en| 0|acc |69.16|± | 1.19|
|xstory_cloze_es| 0|acc |62.81|± | 1.24|
|xstory_cloze_eu| 0|acc |53.74|± | 1.28|
|xstory_cloze_hi| 0|acc |56.85|± | 1.27|
|xstory_cloze_id| 0|acc |60.42|± | 1.26|
|xstory_cloze_my| 0|acc |50.76|± | 1.29|
|xstory_cloze_ru| 0|acc |62.74|± | 1.24|
|xstory_cloze_sw| 0|acc |55.06|± | 1.28|
|xstory_cloze_te| 0|acc |57.05|± | 1.27|
|xstory_cloze_zh| 0|acc |58.17|± | 1.27|
## xglm-4.5B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |76.26|± | 0.88|
|xwinograd_fr| 0|acc |60.24|± | 5.40|
|xwinograd_jp| 0|acc |62.67|± | 1.56|
|xwinograd_pt| 0|acc |64.64|± | 2.95|
|xwinograd_ru| 0|acc |62.22|± | 2.74|
|xwinograd_zh| 0|acc |70.63|± | 2.03|
# xglm-564M
## xglm-564M_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |19.97|± | 1.17|
| | |acc_norm|24.23|± | 1.25|
|arc_easy | 0|acc |45.71|± | 1.02|
| | |acc_norm|41.20|± | 1.01|
|boolq | 1|acc |53.33|± | 0.87|
|copa | 0|acc |69.00|± | 4.65|
|hellaswag | 0|acc |30.78|± | 0.46|
| | |acc_norm|35.20|± | 0.48|
|mc_taco | 0|em |14.04| | |
| | |f1 |40.42| | |
|openbookqa | 0|acc |15.00|± | 1.60|
| | |acc_norm|28.80|± | 2.03|
|piqa | 0|acc |65.13|± | 1.11|
| | |acc_norm|64.85|± | 1.11|
|prost | 0|acc |24.04|± | 0.31|
| | |acc_norm|31.05|± | 0.34|
|swag | 0|acc |41.11|± | 0.35|
| | |acc_norm|54.26|± | 0.35|
|winogrande | 0|acc |52.49|± | 1.40|
|wsc273 | 0|acc |58.61|± | 2.99|
## xglm-564M_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.21|± | 0.3|
## xglm-564M_lambada_openai_0-shot.json
| Task |Version|Metric| Value | |Stderr|
|--------------------|------:|------|------:|---|-----:|
|lambada_openai | 0|ppl | 28.57|± | 1.03|
| | |acc | 35.94|± | 0.67|
|lambada_openai_cloze| 0|ppl |6898.44|± |322.93|
| | |acc | 0.04|± | 0.03|
## xglm-564M_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 0.38|± | 0.06|
| | |f1 | 3.06|± | 0.11|
|gsm8k | 0|acc | 0.83|± | 0.25|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |21.11|± | 0.75|
| | |acc_norm|21.17|± | 0.75|
## xglm-564M_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |49.10|± | 1.12|
|pawsx_en| 0|acc |50.65|± | 1.12|
|pawsx_es| 0|acc |52.55|± | 1.12|
|pawsx_fr| 0|acc |50.80|± | 1.12|
|pawsx_ja| 0|acc |44.10|± | 1.11|
|pawsx_ko| 0|acc |46.25|± | 1.12|
|pawsx_zh| 0|acc |47.80|± | 1.12|
## xglm-564M_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 55.6|± | 2.22|
|xcopa_ht| 0|acc | 55.0|± | 2.23|
|xcopa_id| 0|acc | 57.2|± | 2.21|
|xcopa_it| 0|acc | 53.8|± | 2.23|
|xcopa_qu| 0|acc | 49.2|± | 2.24|
|xcopa_sw| 0|acc | 53.2|± | 2.23|
|xcopa_ta| 0|acc | 56.2|± | 2.22|
|xcopa_th| 0|acc | 55.2|± | 2.23|
|xcopa_tr| 0|acc | 54.4|± | 2.23|
|xcopa_vi| 0|acc | 58.4|± | 2.21|
|xcopa_zh| 0|acc | 55.6|± | 2.22|
## xglm-564M_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.41|± | 0.67|
|xnli_bg| 0|acc |41.30|± | 0.70|
|xnli_de| 0|acc |44.49|± | 0.70|
|xnli_el| 0|acc |39.56|± | 0.69|
|xnli_en| 0|acc |48.28|± | 0.71|
|xnli_es| 0|acc |42.04|± | 0.70|
|xnli_fr| 0|acc |45.49|± | 0.70|
|xnli_hi| 0|acc |38.68|± | 0.69|
|xnli_ru| 0|acc |44.63|± | 0.70|
|xnli_sw| 0|acc |36.07|± | 0.68|
|xnli_th| 0|acc |38.78|± | 0.69|
|xnli_tr| 0|acc |40.20|± | 0.69|
|xnli_ur| 0|acc |34.47|± | 0.67|
|xnli_vi| 0|acc |38.48|± | 0.69|
|xnli_zh| 0|acc |33.51|± | 0.67|
## xglm-564M_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |50.10|± | 1.29|
|xstory_cloze_en| 0|acc |60.56|± | 1.26|
|xstory_cloze_es| 0|acc |55.06|± | 1.28|
|xstory_cloze_eu| 0|acc |53.14|± | 1.28|
|xstory_cloze_hi| 0|acc |52.28|± | 1.29|
|xstory_cloze_id| 0|acc |54.00|± | 1.28|
|xstory_cloze_my| 0|acc |51.49|± | 1.29|
|xstory_cloze_ru| 0|acc |56.19|± | 1.28|
|xstory_cloze_sw| 0|acc |53.08|± | 1.28|
|xstory_cloze_te| 0|acc |55.86|± | 1.28|
|xstory_cloze_zh| 0|acc |53.28|± | 1.28|
## xglm-564M_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |62.62|± | 1.00|
|xwinograd_fr| 0|acc |57.83|± | 5.45|
|xwinograd_jp| 0|acc |54.54|± | 1.61|
|xwinograd_pt| 0|acc |58.56|± | 3.04|
|xwinograd_ru| 0|acc |59.05|± | 2.78|
|xwinograd_zh| 0|acc |65.67|± | 2.12|
# xglm-7.5B
## xglm-7.5B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |28.75|± | 1.32|
| | |acc_norm|31.91|± | 1.36|
|arc_easy | 0|acc |62.37|± | 0.99|
| | |acc_norm|58.63|± | 1.01|
|boolq | 1|acc |60.18|± | 0.86|
|copa | 0|acc |79.00|± | 4.09|
|hellaswag | 0|acc |45.69|± | 0.50|
| | |acc_norm|61.23|± | 0.49|
|mc_taco | 0|em |13.81| | |
| | |f1 |47.92| | |
|openbookqa | 0|acc |25.40|± | 1.95|
| | |acc_norm|35.80|± | 2.15|
|piqa | 0|acc |73.94|± | 1.02|
| | |acc_norm|74.92|± | 1.01|
|prost | 0|acc |25.89|± | 0.32|
| | |acc_norm|26.36|± | 0.32|
|swag | 0|acc |50.51|± | 0.35|
| | |acc_norm|69.23|± | 0.33|
|winogrande | 0|acc |57.85|± | 1.39|
|wsc273 | 0|acc |75.82|± | 2.60|
## xglm-7.5B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.15|± | 0.11|
## xglm-7.5B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 5.42|± | 0.23|
| | |f1 | 8.96|± | 0.26|
|gsm8k | 0|acc | 0.23|± | 0.13|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.99|± | 0.78|
| | |acc_norm|23.52|± | 0.78|
## xglm-7.5B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |55.90|± | 1.11|
|pawsx_en| 0|acc |58.85|± | 1.10|
|pawsx_es| 0|acc |52.80|± | 1.12|
|pawsx_fr| 0|acc |51.80|± | 1.12|
|pawsx_ja| 0|acc |52.00|± | 1.12|
|pawsx_ko| 0|acc |45.95|± | 1.11|
|pawsx_zh| 0|acc |51.30|± | 1.12|
## xglm-7.5B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 61.2|± | 2.18|
|xcopa_ht| 0|acc | 57.4|± | 2.21|
|xcopa_id| 0|acc | 69.4|± | 2.06|
|xcopa_it| 0|acc | 63.6|± | 2.15|
|xcopa_qu| 0|acc | 48.8|± | 2.24|
|xcopa_sw| 0|acc | 60.0|± | 2.19|
|xcopa_ta| 0|acc | 54.4|± | 2.23|
|xcopa_th| 0|acc | 59.4|± | 2.20|
|xcopa_tr| 0|acc | 58.4|± | 2.21|
|xcopa_vi| 0|acc | 70.2|± | 2.05|
|xcopa_zh| 0|acc | 63.8|± | 2.15|
## xglm-7.5B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.37|± | 0.67|
|xnli_bg| 0|acc |44.89|± | 0.70|
|xnli_de| 0|acc |48.98|± | 0.71|
|xnli_el| 0|acc |40.66|± | 0.69|
|xnli_en| 0|acc |53.85|± | 0.70|
|xnli_es| 0|acc |47.70|± | 0.71|
|xnli_fr| 0|acc |46.95|± | 0.71|
|xnli_hi| 0|acc |47.21|± | 0.71|
|xnli_ru| 0|acc |46.33|± | 0.70|
|xnli_sw| 0|acc |45.83|± | 0.70|
|xnli_th| 0|acc |43.71|± | 0.70|
|xnli_tr| 0|acc |46.27|± | 0.70|
|xnli_ur| 0|acc |42.10|± | 0.70|
|xnli_vi| 0|acc |46.33|± | 0.70|
|xnli_zh| 0|acc |35.37|± | 0.68|
## xglm-7.5B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |56.19|± | 1.28|
|xstory_cloze_en| 0|acc |69.82|± | 1.18|
|xstory_cloze_es| 0|acc |64.06|± | 1.23|
|xstory_cloze_eu| 0|acc |57.71|± | 1.27|
|xstory_cloze_hi| 0|acc |58.77|± | 1.27|
|xstory_cloze_id| 0|acc |62.94|± | 1.24|
|xstory_cloze_my| 0|acc |57.11|± | 1.27|
|xstory_cloze_ru| 0|acc |63.53|± | 1.24|
|xstory_cloze_sw| 0|acc |59.30|± | 1.26|
|xstory_cloze_te| 0|acc |60.23|± | 1.26|
|xstory_cloze_zh| 0|acc |58.90|± | 1.27|
## xglm-7.5B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |79.48|± | 0.84|
|xwinograd_fr| 0|acc |65.06|± | 5.27|
|xwinograd_jp| 0|acc |64.96|± | 1.54|
|xwinograd_pt| 0|acc |67.30|± | 2.90|
|xwinograd_ru| 0|acc |63.17|± | 2.72|
|xwinograd_zh| 0|acc |72.82|± | 1.98|
"""
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import logging
from lm_eval import tasks
from pytablewriter import MarkdownTableWriter, LatexTableWriter
import os
import json
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def make_table(result_dict):
"""Generate table of results."""
md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
values = []
for k, dic in sorted(result_dict["results"].items()):
version = result_dict["versions"][k]
percent = k == "squad2"
for m, v in dic.items():
if m.endswith("_stderr"):
continue
if m + "_stderr" in dic:
se = dic[m + "_stderr"]
if percent or m == "ppl":
values.append([k, version, m, "%.2f" % v, "±", "%.2f" % se])
else:
values.append(
[k, version, m, "%.2f" % (v * 100), "±", "%.2f" % (se * 100)]
)
else:
if percent or m == "ppl":
values.append([k, version, m, "%.2f" % v, "", ""])
else:
values.append([k, version, m, "%.2f" % (v * 100), "", ""])
k = ""
version = ""
md_writer.value_matrix = values
latex_writer.value_matrix = values
# todo: make latex table look good
# print(latex_writer.dumps())
return md_writer.dumps()
if __name__ == "__main__":
task_names = tasks.ALL_TASKS
# loop dirs and subdirs in results dir
# for each dir, load json files
for dirpath, dirnames, filenames in os.walk("../results"):
# skip dirs without files
if not filenames:
continue
path_readme = os.path.join(dirpath, "README.md")
with open(path_readme, "w") as f:
# get path name, only last folder
path_name = dirpath.split("/")[-1]
f.write(f"# {path_name} \n\n")
for filename in sorted([f for f in filenames if f.endswith(".json")]):
path = os.path.join(dirpath, filename)
with open(path, "r") as f:
result_dict = json.load(f)
with open(path_readme, "a") as f:
f.write(f"## {filename} \n")
f.write(f"{make_table(result_dict)} \n")