Commit 2ac318a9 authored by Julen Etxaniz's avatar Julen Etxaniz
Browse files

add basic markdown tables with results

parent 21e128d8
# bloom-1b1
## bloom-1b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.63|± | 1.24|
| | |acc_norm|25.68|± | 1.28|
|arc_easy | 0|acc |51.47|± | 1.03|
| | |acc_norm|45.45|± | 1.02|
|boolq | 1|acc |59.08|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |34.63|± | 0.47|
| | |acc_norm|41.77|± | 0.49|
|mc_taco | 0|em |14.49| | |
| | |f1 |32.43| | |
|openbookqa | 0|acc |19.60|± | 1.78|
| | |acc_norm|29.40|± | 2.04|
|piqa | 0|acc |67.14|± | 1.10|
| | |acc_norm|67.14|± | 1.10|
|prost | 0|acc |23.41|± | 0.31|
| | |acc_norm|30.50|± | 0.34|
|swag | 0|acc |43.43|± | 0.35|
| | |acc_norm|58.28|± | 0.35|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.50|± | 2.82|
## bloom-1b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.83|± | 0.25|
## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.38|± | 0.12|
| | |f1 | 4.01|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.21|± | 0.21|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.55|± | 0.78|
| | |acc_norm|23.62|± | 0.78|
## bloom-1b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |46.95|± | 1.12|
|pawsx_en| 0|acc |52.45|± | 1.12|
|pawsx_es| 0|acc |51.50|± | 1.12|
|pawsx_fr| 0|acc |46.15|± | 1.11|
|pawsx_ja| 0|acc |48.40|± | 1.12|
|pawsx_ko| 0|acc |49.90|± | 1.12|
|pawsx_zh| 0|acc |48.95|± | 1.12|
## bloom-1b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |26.44|± | 0.84|
| | |acc_norm |30.49|± | 0.88|
|headqa_es | 0|acc |24.43|± | 0.82|
| | |acc_norm |28.30|± | 0.86|
|logiqa | 0|acc |18.89|± | 1.54|
| | |acc_norm |25.65|± | 1.71|
|squad2 | 1|exact | 4.17| | |
| | |f1 | 6.60| | |
| | |HasAns_exact| 2.19| | |
| | |HasAns_f1 | 7.05| | |
| | |NoAns_exact | 6.14| | |
| | |NoAns_f1 | 6.14| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 2.68|± | 0.15|
|truthfulqa_mc| 1|mc1 |25.34|± | 1.52|
| | |mc2 |41.80|± | 1.46|
|webqs | 0|acc | 1.38|± | 0.26|
## bloom-1b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |45.57|± | 1.88|
| | |em |32.98|± | 1.95|
|drop| 1|em | 3.31|± | 0.18|
| | |f1 | 8.63|± | 0.22|
|race| 1|acc |32.63|± | 1.45|
## bloom-1b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 50.6|± | 2.24|
|xcopa_ht| 0|acc | 53.0|± | 2.23|
|xcopa_id| 0|acc | 64.8|± | 2.14|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 51.2|± | 2.24|
|xcopa_sw| 0|acc | 54.4|± | 2.23|
|xcopa_ta| 0|acc | 57.0|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 62.4|± | 2.17|
|xcopa_zh| 0|acc | 59.4|± | 2.20|
## bloom-1b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.93|± | 0.67|
|xnli_bg| 0|acc |34.13|± | 0.67|
|xnli_de| 0|acc |39.64|± | 0.69|
|xnli_el| 0|acc |34.03|± | 0.67|
|xnli_en| 0|acc |51.48|± | 0.71|
|xnli_es| 0|acc |47.98|± | 0.71|
|xnli_fr| 0|acc |47.15|± | 0.71|
|xnli_hi| 0|acc |42.32|± | 0.70|
|xnli_ru| 0|acc |40.46|± | 0.69|
|xnli_sw| 0|acc |35.29|± | 0.68|
|xnli_th| 0|acc |33.75|± | 0.67|
|xnli_tr| 0|acc |34.79|± | 0.67|
|xnli_ur| 0|acc |37.33|± | 0.68|
|xnli_vi| 0|acc |44.45|± | 0.70|
|xnli_zh| 0|acc |36.23|± | 0.68|
## bloom-1b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.88|± | 1.28|
|xstory_cloze_en| 0|acc |62.54|± | 1.25|
|xstory_cloze_es| 0|acc |58.31|± | 1.27|
|xstory_cloze_eu| 0|acc |54.33|± | 1.28|
|xstory_cloze_hi| 0|acc |55.53|± | 1.28|
|xstory_cloze_id| 0|acc |57.91|± | 1.27|
|xstory_cloze_my| 0|acc |46.19|± | 1.28|
|xstory_cloze_ru| 0|acc |48.25|± | 1.29|
|xstory_cloze_sw| 0|acc |50.56|± | 1.29|
|xstory_cloze_te| 0|acc |56.39|± | 1.28|
|xstory_cloze_zh| 0|acc |58.04|± | 1.27|
## bloom-1b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |69.98|± | 0.95|
|xwinograd_fr| 0|acc |66.27|± | 5.22|
|xwinograd_jp| 0|acc |52.87|± | 1.61|
|xwinograd_pt| 0|acc |63.12|± | 2.98|
|xwinograd_ru| 0|acc |54.29|± | 2.81|
|xwinograd_zh| 0|acc |69.25|± | 2.06|
# bloom-1b7
## bloom-1b7_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.55|± | 1.24|
| | |acc_norm|26.79|± | 1.29|
|arc_easy | 0|acc |56.31|± | 1.02|
| | |acc_norm|48.11|± | 1.03|
|boolq | 1|acc |61.77|± | 0.85|
|copa | 0|acc |70.00|± | 4.61|
|hellaswag | 0|acc |37.62|± | 0.48|
| | |acc_norm|46.56|± | 0.50|
|mc_taco | 0|em |12.54| | |
| | |f1 |47.46| | |
|openbookqa | 0|acc |21.40|± | 1.84|
| | |acc_norm|30.00|± | 2.05|
|piqa | 0|acc |68.77|± | 1.08|
| | |acc_norm|70.08|± | 1.07|
|prost | 0|acc |23.52|± | 0.31|
| | |acc_norm|26.70|± | 0.32|
|swag | 0|acc |45.32|± | 0.35|
| | |acc_norm|61.15|± | 0.34|
|winogrande | 0|acc |57.14|± | 1.39|
|wsc273 | 0|acc |72.89|± | 2.70|
## bloom-1b7_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.29|± | 0.31|
## bloom-1b7_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.49|± | 0.12|
| | |f1 | 4.31|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.74|± | 0.37|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |24.29|± | 0.79|
| | |acc_norm|24.62|± | 0.79|
## bloom-1b7_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |48.75|± | 1.12|
|pawsx_en| 0|acc |48.90|± | 1.12|
|pawsx_es| 0|acc |51.30|± | 1.12|
|pawsx_fr| 0|acc |46.20|± | 1.12|
|pawsx_ja| 0|acc |44.70|± | 1.11|
|pawsx_ko| 0|acc |45.80|± | 1.11|
|pawsx_zh| 0|acc |45.40|± | 1.11|
## bloom-1b7_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |27.75|± | 0.86|
| | |acc_norm |32.57|± | 0.90|
|headqa_es | 0|acc |25.42|± | 0.83|
| | |acc_norm |29.58|± | 0.87|
|logiqa | 0|acc |21.66|± | 1.62|
| | |acc_norm |28.11|± | 1.76|
|squad2 | 1|exact | 1.80| | |
| | |f1 | 4.38| | |
| | |HasAns_exact| 2.40| | |
| | |HasAns_f1 | 7.56| | |
| | |NoAns_exact | 1.21| | |
| | |NoAns_f1 | 1.21| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 3.14|± | 0.16|
|truthfulqa_mc| 1|mc1 |24.48|± | 1.51|
| | |mc2 |41.32|± | 1.44|
|webqs | 0|acc | 1.28|± | 0.25|
## bloom-1b7_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |53.55|± | 1.89|
| | |em |40.90|± | 2.03|
|drop| 1|em | 0.69|± | 0.08|
| | |f1 | 6.89|± | 0.16|
|race| 1|acc |33.21|± | 1.46|
## bloom-1b7_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 47.4|± | 2.24|
|xcopa_ht| 0|acc | 50.4|± | 2.24|
|xcopa_id| 0|acc | 63.2|± | 2.16|
|xcopa_it| 0|acc | 52.6|± | 2.24|
|xcopa_qu| 0|acc | 50.6|± | 2.24|
|xcopa_sw| 0|acc | 51.8|± | 2.24|
|xcopa_ta| 0|acc | 56.6|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 52.8|± | 2.23|
|xcopa_vi| 0|acc | 65.8|± | 2.12|
|xcopa_zh| 0|acc | 61.4|± | 2.18|
## bloom-1b7_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.57|± | 0.67|
|xnli_bg| 0|acc |35.43|± | 0.68|
|xnli_de| 0|acc |40.58|± | 0.69|
|xnli_el| 0|acc |33.99|± | 0.67|
|xnli_en| 0|acc |50.14|± | 0.71|
|xnli_es| 0|acc |47.82|± | 0.71|
|xnli_fr| 0|acc |48.18|± | 0.71|
|xnli_hi| 0|acc |43.95|± | 0.70|
|xnli_ru| 0|acc |39.32|± | 0.69|
|xnli_sw| 0|acc |34.51|± | 0.67|
|xnli_th| 0|acc |33.37|± | 0.67|
|xnli_tr| 0|acc |34.93|± | 0.67|
|xnli_ur| 0|acc |40.50|± | 0.69|
|xnli_vi| 0|acc |46.23|± | 0.70|
|xnli_zh| 0|acc |36.21|± | 0.68|
## bloom-1b7_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |55.00|± | 1.28|
|xstory_cloze_en| 0|acc |64.66|± | 1.23|
|xstory_cloze_es| 0|acc |60.82|± | 1.26|
|xstory_cloze_eu| 0|acc |54.93|± | 1.28|
|xstory_cloze_hi| 0|acc |56.78|± | 1.27|
|xstory_cloze_id| 0|acc |59.76|± | 1.26|
|xstory_cloze_my| 0|acc |47.25|± | 1.28|
|xstory_cloze_ru| 0|acc |50.36|± | 1.29|
|xstory_cloze_sw| 0|acc |52.28|± | 1.29|
|xstory_cloze_te| 0|acc |56.52|± | 1.28|
|xstory_cloze_zh| 0|acc |58.24|± | 1.27|
## bloom-1b7_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |74.71|± | 0.90|
|xwinograd_fr| 0|acc |68.67|± | 5.12|
|xwinograd_jp| 0|acc |54.12|± | 1.61|
|xwinograd_pt| 0|acc |63.50|± | 2.97|
|xwinograd_ru| 0|acc |52.38|± | 2.82|
|xwinograd_zh| 0|acc |69.64|± | 2.05|
# bloom-3b
## bloom-3b_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |27.99|± | 1.31|
| | |acc_norm|30.55|± | 1.35|
|arc_easy | 0|acc |59.47|± | 1.01|
| | |acc_norm|53.24|± | 1.02|
|boolq | 1|acc |61.62|± | 0.85|
|copa | 0|acc |74.00|± | 4.41|
|hellaswag | 0|acc |41.26|± | 0.49|
| | |acc_norm|52.72|± | 0.50|
|mc_taco | 0|em |11.94| | |
| | |f1 |49.57| | |
|openbookqa | 0|acc |21.60|± | 1.84|
| | |acc_norm|32.20|± | 2.09|
|piqa | 0|acc |70.84|± | 1.06|
| | |acc_norm|70.51|± | 1.06|
|prost | 0|acc |22.69|± | 0.31|
| | |acc_norm|26.36|± | 0.32|
|swag | 0|acc |47.36|± | 0.35|
| | |acc_norm|64.59|± | 0.34|
|winogrande | 0|acc |58.72|± | 1.38|
|wsc273 | 0|acc |76.92|± | 2.55|
## bloom-3b_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.21|± | 0.3|
## bloom-3b_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 2.10|± | 0.15|
| | |f1 | 4.63|± | 0.17|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |25.26|± | 0.80|
| | |acc_norm|25.06|± | 0.79|
## bloom-3b_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc | 54.6|± | 1.11|
|pawsx_en| 0|acc | 56.8|± | 1.11|
|pawsx_es| 0|acc | 56.4|± | 1.11|
|pawsx_fr| 0|acc | 47.6|± | 1.12|
|pawsx_ja| 0|acc | 44.6|± | 1.11|
|pawsx_ko| 0|acc | 46.3|± | 1.12|
|pawsx_zh| 0|acc | 47.1|± | 1.12|
## bloom-3b_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |28.41|± | 0.86|
| | |acc_norm |33.37|± | 0.90|
|headqa_es | 0|acc |26.44|± | 0.84|
| | |acc_norm |31.00|± | 0.88|
|logiqa | 0|acc |20.74|± | 1.59|
| | |acc_norm |29.19|± | 1.78|
|squad2 | 1|exact | 6.91| | |
| | |f1 |11.51| | |
| | |HasAns_exact|11.10| | |
| | |HasAns_f1 |20.31| | |
| | |NoAns_exact | 2.74| | |
| | |NoAns_f1 | 2.74| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.08| | |
|triviaqa | 1|acc | 4.15|± | 0.19|
|truthfulqa_mc| 1|mc1 |23.26|± | 1.48|
| | |mc2 |40.57|± | 1.44|
|webqs | 0|acc | 1.67|± | 0.28|
## bloom-3b_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |61.50|± | 1.77|
| | |em |46.07|± | 2.02|
|drop| 1|em | 1.94|± | 0.14|
| | |f1 | 8.88|± | 0.20|
|race| 1|acc |35.22|± | 1.48|
## bloom-3b_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 49.2|± | 2.24|
|xcopa_ht| 0|acc | 50.2|± | 2.24|
|xcopa_id| 0|acc | 69.2|± | 2.07|
|xcopa_it| 0|acc | 51.6|± | 2.24|
|xcopa_qu| 0|acc | 50.6|± | 2.24|
|xcopa_sw| 0|acc | 51.4|± | 2.24|
|xcopa_ta| 0|acc | 58.0|± | 2.21|
|xcopa_th| 0|acc | 52.6|± | 2.24|
|xcopa_tr| 0|acc | 53.4|± | 2.23|
|xcopa_vi| 0|acc | 68.8|± | 2.07|
|xcopa_zh| 0|acc | 62.0|± | 2.17|
## bloom-3b_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.43|± | 0.67|
|xnli_bg| 0|acc |37.90|± | 0.69|
|xnli_de| 0|acc |40.40|± | 0.69|
|xnli_el| 0|acc |33.21|± | 0.67|
|xnli_en| 0|acc |53.41|± | 0.70|
|xnli_es| 0|acc |49.08|± | 0.71|
|xnli_fr| 0|acc |49.18|± | 0.71|
|xnli_hi| 0|acc |45.55|± | 0.70|
|xnli_ru| 0|acc |41.40|± | 0.70|
|xnli_sw| 0|acc |35.83|± | 0.68|
|xnli_th| 0|acc |33.39|± | 0.67|
|xnli_tr| 0|acc |33.81|± | 0.67|
|xnli_ur| 0|acc |40.00|± | 0.69|
|xnli_vi| 0|acc |46.51|± | 0.70|
|xnli_zh| 0|acc |37.43|± | 0.68|
## bloom-3b_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |56.59|± | 1.28|
|xstory_cloze_en| 0|acc |66.78|± | 1.21|
|xstory_cloze_es| 0|acc |64.13|± | 1.23|
|xstory_cloze_eu| 0|acc |55.66|± | 1.28|
|xstory_cloze_hi| 0|acc |57.58|± | 1.27|
|xstory_cloze_id| 0|acc |60.82|± | 1.26|
|xstory_cloze_my| 0|acc |46.59|± | 1.28|
|xstory_cloze_ru| 0|acc |50.69|± | 1.29|
|xstory_cloze_sw| 0|acc |53.01|± | 1.28|
|xstory_cloze_te| 0|acc |58.17|± | 1.27|
|xstory_cloze_zh| 0|acc |60.89|± | 1.26|
## bloom-3b_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |79.10|± | 0.84|
|xwinograd_fr| 0|acc |71.08|± | 5.01|
|xwinograd_jp| 0|acc |56.62|± | 1.60|
|xwinograd_pt| 0|acc |70.34|± | 2.82|
|xwinograd_ru| 0|acc |53.65|± | 2.81|
|xwinograd_zh| 0|acc |73.61|± | 1.97|
# bloom-560m
## bloom-560m_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |22.44|± | 1.22|
| | |acc_norm|23.98|± | 1.25|
|arc_easy | 0|acc |47.35|± | 1.02|
| | |acc_norm|41.67|± | 1.01|
|boolq | 1|acc |55.14|± | 0.87|
|copa | 0|acc |61.00|± | 4.90|
|hellaswag | 0|acc |31.56|± | 0.46|
| | |acc_norm|36.56|± | 0.48|
|mc_taco | 0|em |17.42| | |
| | |f1 |31.43| | |
|openbookqa | 0|acc |17.20|± | 1.69|
| | |acc_norm|28.20|± | 2.01|
|piqa | 0|acc |64.09|± | 1.12|
| | |acc_norm|65.13|± | 1.11|
|prost | 0|acc |22.08|± | 0.30|
| | |acc_norm|32.08|± | 0.34|
|swag | 0|acc |40.35|± | 0.35|
| | |acc_norm|52.96|± | 0.35|
|winogrande | 0|acc |52.80|± | 1.40|
|wsc273 | 0|acc |66.67|± | 2.86|
## bloom-560m_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.53|± | 0.2|
## bloom-560m_lambada_openai_0-shot.json
| Task |Version|Metric| Value | |Stderr|
|--------------------|------:|------|------:|---|-----:|
|lambada_openai | 0|ppl | 28.68|± | 1.08|
| | |acc | 35.40|± | 0.67|
|lambada_openai_cloze| 0|ppl |6212.81|± |267.17|
| | |acc | 0.45|± | 0.09|
## bloom-560m_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.26|± | 0.11|
| | |f1 | 3.50|± | 0.14|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |22.51|± | 0.76|
| | |acc_norm|22.35|± | 0.76|
## bloom-560m_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.80|± | 1.12|
|pawsx_en| 0|acc |52.00|± | 1.12|
|pawsx_es| 0|acc |53.25|± | 1.12|
|pawsx_fr| 0|acc |47.95|± | 1.12|
|pawsx_ja| 0|acc |44.90|± | 1.11|
|pawsx_ko| 0|acc |51.90|± | 1.12|
|pawsx_zh| 0|acc |45.20|± | 1.11|
## bloom-560m_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |25.67|± | 0.83|
| | |acc_norm |29.58|± | 0.87|
|headqa_es | 0|acc |23.96|± | 0.82|
| | |acc_norm |27.17|± | 0.85|
|logiqa | 0|acc |22.58|± | 1.64|
| | |acc_norm |27.19|± | 1.75|
|squad2 | 1|exact | 0.43| | |
| | |f1 | 1.86| | |
| | |HasAns_exact| 0.76| | |
| | |HasAns_f1 | 3.62| | |
| | |NoAns_exact | 0.10| | |
| | |NoAns_f1 | 0.10| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 1.44|± | 0.11|
|truthfulqa_mc| 1|mc1 |24.48|± | 1.51|
| | |mc2 |42.43|± | 1.51|
|webqs | 0|acc | 0.84|± | 0.20|
## bloom-560m_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |22.71|± | 1.67|
| | |em |17.40|± | 1.62|
|drop| 1|em | 1.50|± | 0.12|
| | |f1 | 6.21|± | 0.17|
|race| 1|acc |30.24|± | 1.42|
## bloom-560m_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 49.0|± | 2.24|
|xcopa_ht| 0|acc | 50.2|± | 2.24|
|xcopa_id| 0|acc | 59.2|± | 2.20|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 55.8|± | 2.22|
|xcopa_th| 0|acc | 54.4|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 61.0|± | 2.18|
|xcopa_zh| 0|acc | 58.6|± | 2.20|
## bloom-560m_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.35|± | 0.67|
|xnli_bg| 0|acc |33.39|± | 0.67|
|xnli_de| 0|acc |34.79|± | 0.67|
|xnli_el| 0|acc |33.33|± | 0.67|
|xnli_en| 0|acc |49.50|± | 0.71|
|xnli_es| 0|acc |45.23|± | 0.70|
|xnli_fr| 0|acc |45.29|± | 0.70|
|xnli_hi| 0|acc |40.84|± | 0.69|
|xnli_ru| 0|acc |34.01|± | 0.67|
|xnli_sw| 0|acc |33.17|± | 0.67|
|xnli_th| 0|acc |33.57|± | 0.67|
|xnli_tr| 0|acc |33.43|± | 0.67|
|xnli_ur| 0|acc |37.13|± | 0.68|
|xnli_vi| 0|acc |40.52|± | 0.69|
|xnli_zh| 0|acc |33.95|± | 0.67|
## bloom-560m_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.08|± | 1.29|
|xstory_cloze_en| 0|acc |61.22|± | 1.25|
|xstory_cloze_es| 0|acc |55.86|± | 1.28|
|xstory_cloze_eu| 0|acc |53.61|± | 1.28|
|xstory_cloze_hi| 0|acc |55.00|± | 1.28|
|xstory_cloze_id| 0|acc |55.53|± | 1.28|
|xstory_cloze_my| 0|acc |47.19|± | 1.28|
|xstory_cloze_ru| 0|acc |49.17|± | 1.29|
|xstory_cloze_sw| 0|acc |49.83|± | 1.29|
|xstory_cloze_te| 0|acc |55.72|± | 1.28|
|xstory_cloze_zh| 0|acc |54.53|± | 1.28|
## bloom-560m_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |65.89|± | 0.98|
|xwinograd_fr| 0|acc |60.24|± | 5.40|
|xwinograd_jp| 0|acc |52.97|± | 1.61|
|xwinograd_pt| 0|acc |60.08|± | 3.03|
|xwinograd_ru| 0|acc |49.21|± | 2.82|
|xwinograd_zh| 0|acc |67.66|± | 2.09|
# bloom-7b1
## bloom-7b1_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|52.11|± | 3.63|
|bigbench_date_understanding | 0|multiple_choice_grade|36.59|± | 2.51|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|26.36|± | 2.75|
|bigbench_dyck_languages | 0|multiple_choice_grade|14.40|± | 1.11|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|50.06|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|20.06|± | 2.12|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|48.62|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|26.00|± | 1.96|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|19.14|± | 1.49|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|37.00|± | 2.79|
|bigbench_movie_recommendation | 0|multiple_choice_grade|26.40|± | 1.97|
|bigbench_navigate | 0|multiple_choice_grade|49.90|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|24.85|± | 0.97|
|bigbench_ruin_names | 0|multiple_choice_grade|34.38|± | 2.25|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.14|± | 1.25|
|bigbench_snarks | 0|multiple_choice_grade|49.72|± | 3.73|
|bigbench_sports_understanding | 0|multiple_choice_grade|50.30|± | 1.59|
|bigbench_temporal_sequences | 0|multiple_choice_grade|24.80|± | 1.37|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|18.40|± | 1.10|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.00|± | 0.83|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|37.00|± | 2.79|
## bloom-7b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |30.38|± | 1.34|
| | |acc_norm|33.53|± | 1.38|
|arc_easy | 0|acc |64.94|± | 0.98|
| | |acc_norm|57.32|± | 1.01|
|boolq | 1|acc |62.87|± | 0.85|
|copa | 0|acc |72.00|± | 4.51|
|hellaswag | 0|acc |46.24|± | 0.50|
| | |acc_norm|59.68|± | 0.49|
|mc_taco | 0|em |13.59| | |
| | |f1 |50.53| | |
|openbookqa | 0|acc |25.20|± | 1.94|
| | |acc_norm|35.80|± | 2.15|
|piqa | 0|acc |72.74|± | 1.04|
| | |acc_norm|73.67|± | 1.03|
|prost | 0|acc |26.18|± | 0.32|
| | |acc_norm|30.57|± | 0.34|
|swag | 0|acc |50.25|± | 0.35|
| | |acc_norm|68.26|± | 0.33|
|winogrande | 0|acc |64.33|± | 1.35|
|wsc273 | 0|acc |81.32|± | 2.36|
## bloom-7b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.9|± | 0.38|
## bloom-7b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 2.51|± | 0.16|
| | |f1 | 5.09|± | 0.18|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |26.57|± | 0.81|
| | |acc_norm|26.53|± | 0.81|
## bloom-7b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.85|± | 1.12|
|pawsx_en| 0|acc |61.30|± | 1.09|
|pawsx_es| 0|acc |59.35|± | 1.10|
|pawsx_fr| 0|acc |50.90|± | 1.12|
|pawsx_ja| 0|acc |45.45|± | 1.11|
|pawsx_ko| 0|acc |45.10|± | 1.11|
|pawsx_zh| 0|acc |47.35|± | 1.12|
## bloom-7b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |31.18|± | 0.88|
| | |acc_norm |35.56|± | 0.91|
|headqa_es | 0|acc |29.54|± | 0.87|
| | |acc_norm |34.32|± | 0.91|
|logiqa | 0|acc |20.28|± | 1.58|
| | |acc_norm |28.11|± | 1.76|
|squad2 | 1|exact | 7.82| | |
| | |f1 |12.64| | |
| | |HasAns_exact|14.84| | |
| | |HasAns_f1 |24.51| | |
| | |NoAns_exact | 0.81| | |
| | |NoAns_f1 | 0.81| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 5.52|± | 0.21|
|truthfulqa_mc| 1|mc1 |22.40|± | 1.46|
| | |mc2 |38.90|± | 1.40|
|webqs | 0|acc | 2.26|± | 0.33|
## bloom-7b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |68.83|± | 1.63|
| | |em |53.87|± | 2.00|
|drop| 1|em | 2.57|± | 0.16|
| | |f1 | 9.85|± | 0.21|
|race| 1|acc |36.56|± | 1.49|
## bloom-7b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.2|± | 2.24|
|xcopa_ht| 0|acc | 50.8|± | 2.24|
|xcopa_id| 0|acc | 69.8|± | 2.06|
|xcopa_it| 0|acc | 52.8|± | 2.23|
|xcopa_qu| 0|acc | 50.8|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 59.2|± | 2.20|
|xcopa_th| 0|acc | 55.4|± | 2.23|
|xcopa_tr| 0|acc | 51.2|± | 2.24|
|xcopa_vi| 0|acc | 70.8|± | 2.04|
|xcopa_zh| 0|acc | 65.2|± | 2.13|
## bloom-7b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.83|± | 0.67|
|xnli_bg| 0|acc |39.70|± | 0.69|
|xnli_de| 0|acc |39.86|± | 0.69|
|xnli_el| 0|acc |35.75|± | 0.68|
|xnli_en| 0|acc |53.91|± | 0.70|
|xnli_es| 0|acc |48.70|± | 0.71|
|xnli_fr| 0|acc |49.68|± | 0.71|
|xnli_hi| 0|acc |46.51|± | 0.70|
|xnli_ru| 0|acc |43.05|± | 0.70|
|xnli_sw| 0|acc |37.92|± | 0.69|
|xnli_th| 0|acc |34.99|± | 0.67|
|xnli_tr| 0|acc |35.09|± | 0.67|
|xnli_ur| 0|acc |42.10|± | 0.70|
|xnli_vi| 0|acc |47.05|± | 0.71|
|xnli_zh| 0|acc |35.43|± | 0.68|
## bloom-7b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |58.57|± | 1.27|
|xstory_cloze_en| 0|acc |70.75|± | 1.17|
|xstory_cloze_es| 0|acc |66.12|± | 1.22|
|xstory_cloze_eu| 0|acc |57.18|± | 1.27|
|xstory_cloze_hi| 0|acc |60.56|± | 1.26|
|xstory_cloze_id| 0|acc |64.46|± | 1.23|
|xstory_cloze_my| 0|acc |48.97|± | 1.29|
|xstory_cloze_ru| 0|acc |52.75|± | 1.28|
|xstory_cloze_sw| 0|acc |53.94|± | 1.28|
|xstory_cloze_te| 0|acc |57.45|± | 1.27|
|xstory_cloze_zh| 0|acc |61.88|± | 1.25|
## bloom-7b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |82.15|± | 0.79|
|xwinograd_fr| 0|acc |71.08|± | 5.01|
|xwinograd_jp| 0|acc |58.50|± | 1.59|
|xwinograd_pt| 0|acc |76.81|± | 2.61|
|xwinograd_ru| 0|acc |56.83|± | 2.80|
|xwinograd_zh| 0|acc |74.40|± | 1.95|
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# xglm-1.7B
## xglm-1.7B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |20.99|± | 1.19|
| | |acc_norm|24.32|± | 1.25|
|arc_easy | 0|acc |53.62|± | 1.02|
| | |acc_norm|47.90|± | 1.03|
|boolq | 1|acc |58.56|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |36.18|± | 0.48|
| | |acc_norm|45.80|± | 0.50|
|mc_taco | 0|em |12.91| | |
| | |f1 |34.52| | |
|openbookqa | 0|acc |17.00|± | 1.68|
| | |acc_norm|29.80|± | 2.05|
|piqa | 0|acc |69.70|± | 1.07|
| | |acc_norm|70.35|± | 1.07|
|prost | 0|acc |22.69|± | 0.31|
| | |acc_norm|27.21|± | 0.33|
|swag | 0|acc |45.97|± | 0.35|
| | |acc_norm|62.19|± | 0.34|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.13|± | 2.83|
## xglm-1.7B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.99|± | 0.27|
## xglm-1.7B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 0.67|± | 0.08|
| | |f1 | 3.44|± | 0.13|
|gsm8k | 0|acc | 0.83|± | 0.25|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |22.91|± | 0.77|
| | |acc_norm|21.44|± | 0.75|
## xglm-1.7B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |57.55|± | 1.11|
|pawsx_en| 0|acc |52.65|± | 1.12|
|pawsx_es| 0|acc |53.80|± | 1.12|
|pawsx_fr| 0|acc |47.35|± | 1.12|
|pawsx_ja| 0|acc |46.10|± | 1.11|
|pawsx_ko| 0|acc |51.40|± | 1.12|
|pawsx_zh| 0|acc |48.10|± | 1.12|
## xglm-1.7B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 56.8|± | 2.22|
|xcopa_ht| 0|acc | 55.8|± | 2.22|
|xcopa_id| 0|acc | 64.6|± | 2.14|
|xcopa_it| 0|acc | 54.0|± | 2.23|
|xcopa_qu| 0|acc | 52.2|± | 2.24|
|xcopa_sw| 0|acc | 56.6|± | 2.22|
|xcopa_ta| 0|acc | 55.2|± | 2.23|
|xcopa_th| 0|acc | 58.2|± | 2.21|
|xcopa_tr| 0|acc | 53.4|± | 2.23|
|xcopa_vi| 0|acc | 63.0|± | 2.16|
|xcopa_zh| 0|acc | 58.0|± | 2.21|
## xglm-1.7B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.51|± | 0.67|
|xnli_bg| 0|acc |44.73|± | 0.70|
|xnli_de| 0|acc |45.33|± | 0.70|
|xnli_el| 0|acc |40.10|± | 0.69|
|xnli_en| 0|acc |49.68|± | 0.71|
|xnli_es| 0|acc |43.61|± | 0.70|
|xnli_fr| 0|acc |45.73|± | 0.70|
|xnli_hi| 0|acc |42.61|± | 0.70|
|xnli_ru| 0|acc |45.97|± | 0.70|
|xnli_sw| 0|acc |42.00|± | 0.70|
|xnli_th| 0|acc |41.70|± | 0.70|
|xnli_tr| 0|acc |42.95|± | 0.70|
|xnli_ur| 0|acc |39.50|± | 0.69|
|xnli_vi| 0|acc |45.03|± | 0.70|
|xnli_zh| 0|acc |33.77|± | 0.67|
## xglm-1.7B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.48|± | 1.29|
|xstory_cloze_en| 0|acc |64.33|± | 1.23|
|xstory_cloze_es| 0|acc |59.23|± | 1.26|
|xstory_cloze_eu| 0|acc |56.12|± | 1.28|
|xstory_cloze_hi| 0|acc |55.79|± | 1.28|
|xstory_cloze_id| 0|acc |57.97|± | 1.27|
|xstory_cloze_my| 0|acc |53.81|± | 1.28|
|xstory_cloze_ru| 0|acc |59.83|± | 1.26|
|xstory_cloze_sw| 0|acc |55.99|± | 1.28|
|xstory_cloze_te| 0|acc |58.04|± | 1.27|
|xstory_cloze_zh| 0|acc |56.19|± | 1.28|
## xglm-1.7B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |71.05|± | 0.94|
|xwinograd_fr| 0|acc |60.24|± | 5.40|
|xwinograd_jp| 0|acc |60.58|± | 1.58|
|xwinograd_pt| 0|acc |63.88|± | 2.97|
|xwinograd_ru| 0|acc |59.68|± | 2.77|
|xwinograd_zh| 0|acc |69.84|± | 2.05|
# xglm-2.9B
## xglm-2.9B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.46|± | 1.24|
| | |acc_norm|27.39|± | 1.30|
|arc_easy | 0|acc |56.65|± | 1.02|
| | |acc_norm|53.37|± | 1.02|
|boolq | 1|acc |61.44|± | 0.85|
|copa | 0|acc |74.00|± | 4.41|
|hellaswag | 0|acc |40.92|± | 0.49|
| | |acc_norm|53.70|± | 0.50|
|mc_taco | 0|em |11.94| | |
| | |f1 |47.80| | |
|openbookqa | 0|acc |21.60|± | 1.84|
| | |acc_norm|33.20|± | 2.11|
|piqa | 0|acc |71.27|± | 1.06|
| | |acc_norm|73.01|± | 1.04|
|prost | 0|acc |21.92|± | 0.30|
| | |acc_norm|26.64|± | 0.32|
|swag | 0|acc |48.49|± | 0.35|
| | |acc_norm|65.78|± | 0.34|
|winogrande | 0|acc |54.62|± | 1.40|
|wsc273 | 0|acc |71.06|± | 2.75|
## xglm-2.9B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |50.65|± | 1.12|
|pawsx_en| 0|acc |54.75|± | 1.11|
|pawsx_es| 0|acc |53.15|± | 1.12|
|pawsx_fr| 0|acc |49.70|± | 1.12|
|pawsx_ja| 0|acc |50.95|± | 1.12|
|pawsx_ko| 0|acc |46.75|± | 1.12|
|pawsx_zh| 0|acc |53.70|± | 1.12|
## xglm-2.9B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 58.2|± | 2.21|
|xcopa_ht| 0|acc | 55.8|± | 2.22|
|xcopa_id| 0|acc | 66.8|± | 2.11|
|xcopa_it| 0|acc | 60.2|± | 2.19|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 58.8|± | 2.20|
|xcopa_ta| 0|acc | 54.2|± | 2.23|
|xcopa_th| 0|acc | 57.0|± | 2.22|
|xcopa_tr| 0|acc | 56.6|± | 2.22|
|xcopa_vi| 0|acc | 65.2|± | 2.13|
|xcopa_zh| 0|acc | 60.0|± | 2.19|
## xglm-2.9B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.65|± | 0.67|
|xnli_bg| 0|acc |45.97|± | 0.70|
|xnli_de| 0|acc |48.32|± | 0.71|
|xnli_el| 0|acc |41.40|± | 0.70|
|xnli_en| 0|acc |51.08|± | 0.71|
|xnli_es| 0|acc |46.67|± | 0.70|
|xnli_fr| 0|acc |45.03|± | 0.70|
|xnli_hi| 0|acc |44.03|± | 0.70|
|xnli_ru| 0|acc |45.29|± | 0.70|
|xnli_sw| 0|acc |44.43|± | 0.70|
|xnli_th| 0|acc |41.98|± | 0.70|
|xnli_tr| 0|acc |44.97|± | 0.70|
|xnli_ur| 0|acc |40.10|± | 0.69|
|xnli_vi| 0|acc |45.99|± | 0.70|
|xnli_zh| 0|acc |34.81|± | 0.67|
## xglm-2.9B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |53.87|± | 1.28|
|xstory_cloze_en| 0|acc |67.31|± | 1.21|
|xstory_cloze_es| 0|acc |60.95|± | 1.26|
|xstory_cloze_eu| 0|acc |56.32|± | 1.28|
|xstory_cloze_hi| 0|acc |57.51|± | 1.27|
|xstory_cloze_id| 0|acc |61.35|± | 1.25|
|xstory_cloze_my| 0|acc |55.20|± | 1.28|
|xstory_cloze_ru| 0|acc |62.21|± | 1.25|
|xstory_cloze_sw| 0|acc |56.72|± | 1.28|
|xstory_cloze_te| 0|acc |60.03|± | 1.26|
|xstory_cloze_zh| 0|acc |57.64|± | 1.27|
## xglm-2.9B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |75.61|± | 0.89|
|xwinograd_fr| 0|acc |59.04|± | 5.43|
|xwinograd_jp| 0|acc |64.65|± | 1.54|
|xwinograd_pt| 0|acc |66.16|± | 2.92|
|xwinograd_ru| 0|acc |62.86|± | 2.73|
|xwinograd_zh| 0|acc |71.63|± | 2.01|
# xglm-4.5B
## xglm-4.5B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |27.13|± | 1.30|
| | |acc_norm|28.16|± | 1.31|
|arc_easy | 0|acc |60.31|± | 1.00|
| | |acc_norm|57.24|± | 1.02|
|boolq | 1|acc |61.19|± | 0.85|
|copa | 0|acc |81.00|± | 3.94|
|hellaswag | 0|acc |43.77|± | 0.50|
| | |acc_norm|58.24|± | 0.49|
|mc_taco | 0|em |15.39| | |
| | |f1 |43.51| | |
|openbookqa | 0|acc |23.20|± | 1.89|
| | |acc_norm|34.40|± | 2.13|
|piqa | 0|acc |72.74|± | 1.04|
| | |acc_norm|72.96|± | 1.04|
|prost | 0|acc |26.43|± | 0.32|
| | |acc_norm|26.28|± | 0.32|
|swag | 0|acc |49.65|± | 0.35|
| | |acc_norm|67.87|± | 0.33|
|winogrande | 0|acc |56.12|± | 1.39|
|wsc273 | 0|acc |71.79|± | 2.73|
## xglm-4.5B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.08|± | 0.08|
## xglm-4.5B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.65|± | 1.12|
|pawsx_en| 0|acc |55.40|± | 1.11|
|pawsx_es| 0|acc |51.05|± | 1.12|
|pawsx_fr| 0|acc |51.60|± | 1.12|
|pawsx_ja| 0|acc |47.75|± | 1.12|
|pawsx_ko| 0|acc |49.10|± | 1.12|
|pawsx_zh| 0|acc |54.60|± | 1.11|
## xglm-4.5B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 55.0|± | 2.23|
|xcopa_ht| 0|acc | 51.2|± | 2.24|
|xcopa_id| 0|acc | 67.0|± | 2.10|
|xcopa_it| 0|acc | 61.6|± | 2.18|
|xcopa_qu| 0|acc | 50.0|± | 2.24|
|xcopa_sw| 0|acc | 56.2|± | 2.22|
|xcopa_ta| 0|acc | 55.6|± | 2.22|
|xcopa_th| 0|acc | 55.2|± | 2.23|
|xcopa_tr| 0|acc | 57.2|± | 2.21|
|xcopa_vi| 0|acc | 66.0|± | 2.12|
|xcopa_zh| 0|acc | 61.6|± | 2.18|
## xglm-4.5B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.59|± | 0.67|
|xnli_bg| 0|acc |45.61|± | 0.70|
|xnli_de| 0|acc |47.11|± | 0.71|
|xnli_el| 0|acc |39.84|± | 0.69|
|xnli_en| 0|acc |53.63|± | 0.70|
|xnli_es| 0|acc |47.68|± | 0.71|
|xnli_fr| 0|acc |47.31|± | 0.71|
|xnli_hi| 0|acc |42.50|± | 0.70|
|xnli_ru| 0|acc |46.15|± | 0.70|
|xnli_sw| 0|acc |39.58|± | 0.69|
|xnli_th| 0|acc |39.68|± | 0.69|
|xnli_tr| 0|acc |44.85|± | 0.70|
|xnli_ur| 0|acc |37.47|± | 0.68|
|xnli_vi| 0|acc |45.87|± | 0.70|
|xnli_zh| 0|acc |34.77|± | 0.67|
## xglm-4.5B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |53.67|± | 1.28|
|xstory_cloze_en| 0|acc |69.16|± | 1.19|
|xstory_cloze_es| 0|acc |62.81|± | 1.24|
|xstory_cloze_eu| 0|acc |53.74|± | 1.28|
|xstory_cloze_hi| 0|acc |56.85|± | 1.27|
|xstory_cloze_id| 0|acc |60.42|± | 1.26|
|xstory_cloze_my| 0|acc |50.76|± | 1.29|
|xstory_cloze_ru| 0|acc |62.74|± | 1.24|
|xstory_cloze_sw| 0|acc |55.06|± | 1.28|
|xstory_cloze_te| 0|acc |57.05|± | 1.27|
|xstory_cloze_zh| 0|acc |58.17|± | 1.27|
## xglm-4.5B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |76.26|± | 0.88|
|xwinograd_fr| 0|acc |60.24|± | 5.40|
|xwinograd_jp| 0|acc |62.67|± | 1.56|
|xwinograd_pt| 0|acc |64.64|± | 2.95|
|xwinograd_ru| 0|acc |62.22|± | 2.74|
|xwinograd_zh| 0|acc |70.63|± | 2.03|
# xglm-564M
## xglm-564M_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |19.97|± | 1.17|
| | |acc_norm|24.23|± | 1.25|
|arc_easy | 0|acc |45.71|± | 1.02|
| | |acc_norm|41.20|± | 1.01|
|boolq | 1|acc |53.33|± | 0.87|
|copa | 0|acc |69.00|± | 4.65|
|hellaswag | 0|acc |30.78|± | 0.46|
| | |acc_norm|35.20|± | 0.48|
|mc_taco | 0|em |14.04| | |
| | |f1 |40.42| | |
|openbookqa | 0|acc |15.00|± | 1.60|
| | |acc_norm|28.80|± | 2.03|
|piqa | 0|acc |65.13|± | 1.11|
| | |acc_norm|64.85|± | 1.11|
|prost | 0|acc |24.04|± | 0.31|
| | |acc_norm|31.05|± | 0.34|
|swag | 0|acc |41.11|± | 0.35|
| | |acc_norm|54.26|± | 0.35|
|winogrande | 0|acc |52.49|± | 1.40|
|wsc273 | 0|acc |58.61|± | 2.99|
## xglm-564M_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.21|± | 0.30|
## xglm-564M_lambada_openai_0-shot.json
| Task |Version|Metric| Value | |Stderr|
|--------------------|------:|------|------:|---|-----:|
|lambada_openai | 0|ppl | 28.57|± | 1.03|
| | |acc | 35.94|± | 0.67|
|lambada_openai_cloze| 0|ppl |6898.44|± |322.93|
| | |acc | 0.04|± | 0.03|
## xglm-564M_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 0.38|± | 0.06|
| | |f1 | 3.06|± | 0.11|
|gsm8k | 0|acc | 0.83|± | 0.25|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |21.11|± | 0.75|
| | |acc_norm|21.17|± | 0.75|
## xglm-564M_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |49.10|± | 1.12|
|pawsx_en| 0|acc |50.65|± | 1.12|
|pawsx_es| 0|acc |52.55|± | 1.12|
|pawsx_fr| 0|acc |50.80|± | 1.12|
|pawsx_ja| 0|acc |44.10|± | 1.11|
|pawsx_ko| 0|acc |46.25|± | 1.12|
|pawsx_zh| 0|acc |47.80|± | 1.12|
## xglm-564M_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 55.6|± | 2.22|
|xcopa_ht| 0|acc | 55.0|± | 2.23|
|xcopa_id| 0|acc | 57.2|± | 2.21|
|xcopa_it| 0|acc | 53.8|± | 2.23|
|xcopa_qu| 0|acc | 49.2|± | 2.24|
|xcopa_sw| 0|acc | 53.2|± | 2.23|
|xcopa_ta| 0|acc | 56.2|± | 2.22|
|xcopa_th| 0|acc | 55.2|± | 2.23|
|xcopa_tr| 0|acc | 54.4|± | 2.23|
|xcopa_vi| 0|acc | 58.4|± | 2.21|
|xcopa_zh| 0|acc | 55.6|± | 2.22|
## xglm-564M_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.41|± | 0.67|
|xnli_bg| 0|acc |41.30|± | 0.70|
|xnli_de| 0|acc |44.49|± | 0.70|
|xnli_el| 0|acc |39.56|± | 0.69|
|xnli_en| 0|acc |48.28|± | 0.71|
|xnli_es| 0|acc |42.04|± | 0.70|
|xnli_fr| 0|acc |45.49|± | 0.70|
|xnli_hi| 0|acc |38.68|± | 0.69|
|xnli_ru| 0|acc |44.63|± | 0.70|
|xnli_sw| 0|acc |36.07|± | 0.68|
|xnli_th| 0|acc |38.78|± | 0.69|
|xnli_tr| 0|acc |40.20|± | 0.69|
|xnli_ur| 0|acc |34.47|± | 0.67|
|xnli_vi| 0|acc |38.48|± | 0.69|
|xnli_zh| 0|acc |33.51|± | 0.67|
## xglm-564M_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |50.10|± | 1.29|
|xstory_cloze_en| 0|acc |60.56|± | 1.26|
|xstory_cloze_es| 0|acc |55.06|± | 1.28|
|xstory_cloze_eu| 0|acc |53.14|± | 1.28|
|xstory_cloze_hi| 0|acc |52.28|± | 1.29|
|xstory_cloze_id| 0|acc |54.00|± | 1.28|
|xstory_cloze_my| 0|acc |51.49|± | 1.29|
|xstory_cloze_ru| 0|acc |56.19|± | 1.28|
|xstory_cloze_sw| 0|acc |53.08|± | 1.28|
|xstory_cloze_te| 0|acc |55.86|± | 1.28|
|xstory_cloze_zh| 0|acc |53.28|± | 1.28|
## xglm-564M_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |62.62|± | 1.00|
|xwinograd_fr| 0|acc |57.83|± | 5.45|
|xwinograd_jp| 0|acc |54.54|± | 1.61|
|xwinograd_pt| 0|acc |58.56|± | 3.04|
|xwinograd_ru| 0|acc |59.05|± | 2.78|
|xwinograd_zh| 0|acc |65.67|± | 2.12|
# xglm-7.5B
## xglm-7.5B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |28.75|± | 1.32|
| | |acc_norm|31.91|± | 1.36|
|arc_easy | 0|acc |62.37|± | 0.99|
| | |acc_norm|58.63|± | 1.01|
|boolq | 1|acc |60.18|± | 0.86|
|copa | 0|acc |79.00|± | 4.09|
|hellaswag | 0|acc |45.69|± | 0.50|
| | |acc_norm|61.23|± | 0.49|
|mc_taco | 0|em |13.81| | |
| | |f1 |47.92| | |
|openbookqa | 0|acc |25.40|± | 1.95|
| | |acc_norm|35.80|± | 2.15|
|piqa | 0|acc |73.94|± | 1.02|
| | |acc_norm|74.92|± | 1.01|
|prost | 0|acc |25.89|± | 0.32|
| | |acc_norm|26.36|± | 0.32|
|swag | 0|acc |50.51|± | 0.35|
| | |acc_norm|69.23|± | 0.33|
|winogrande | 0|acc |57.85|± | 1.39|
|wsc273 | 0|acc |75.82|± | 2.60|
## xglm-7.5B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.15|± | 0.11|
## xglm-7.5B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 5.42|± | 0.23|
| | |f1 | 8.96|± | 0.26|
|gsm8k | 0|acc | 0.23|± | 0.13|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.99|± | 0.78|
| | |acc_norm|23.52|± | 0.78|
## xglm-7.5B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |55.90|± | 1.11|
|pawsx_en| 0|acc |58.85|± | 1.10|
|pawsx_es| 0|acc |52.80|± | 1.12|
|pawsx_fr| 0|acc |51.80|± | 1.12|
|pawsx_ja| 0|acc |52.00|± | 1.12|
|pawsx_ko| 0|acc |45.95|± | 1.11|
|pawsx_zh| 0|acc |51.30|± | 1.12|
## xglm-7.5B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 61.2|± | 2.18|
|xcopa_ht| 0|acc | 57.4|± | 2.21|
|xcopa_id| 0|acc | 69.4|± | 2.06|
|xcopa_it| 0|acc | 63.6|± | 2.15|
|xcopa_qu| 0|acc | 48.8|± | 2.24|
|xcopa_sw| 0|acc | 60.0|± | 2.19|
|xcopa_ta| 0|acc | 54.4|± | 2.23|
|xcopa_th| 0|acc | 59.4|± | 2.20|
|xcopa_tr| 0|acc | 58.4|± | 2.21|
|xcopa_vi| 0|acc | 70.2|± | 2.05|
|xcopa_zh| 0|acc | 63.8|± | 2.15|
## xglm-7.5B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.37|± | 0.67|
|xnli_bg| 0|acc |44.89|± | 0.70|
|xnli_de| 0|acc |48.98|± | 0.71|
|xnli_el| 0|acc |40.66|± | 0.69|
|xnli_en| 0|acc |53.85|± | 0.70|
|xnli_es| 0|acc |47.70|± | 0.71|
|xnli_fr| 0|acc |46.95|± | 0.71|
|xnli_hi| 0|acc |47.21|± | 0.71|
|xnli_ru| 0|acc |46.33|± | 0.70|
|xnli_sw| 0|acc |45.83|± | 0.70|
|xnli_th| 0|acc |43.71|± | 0.70|
|xnli_tr| 0|acc |46.27|± | 0.70|
|xnli_ur| 0|acc |42.10|± | 0.70|
|xnli_vi| 0|acc |46.33|± | 0.70|
|xnli_zh| 0|acc |35.37|± | 0.68|
## xglm-7.5B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |56.19|± | 1.28|
|xstory_cloze_en| 0|acc |69.82|± | 1.18|
|xstory_cloze_es| 0|acc |64.06|± | 1.23|
|xstory_cloze_eu| 0|acc |57.71|± | 1.27|
|xstory_cloze_hi| 0|acc |58.77|± | 1.27|
|xstory_cloze_id| 0|acc |62.94|± | 1.24|
|xstory_cloze_my| 0|acc |57.11|± | 1.27|
|xstory_cloze_ru| 0|acc |63.53|± | 1.24|
|xstory_cloze_sw| 0|acc |59.30|± | 1.26|
|xstory_cloze_te| 0|acc |60.23|± | 1.26|
|xstory_cloze_zh| 0|acc |58.90|± | 1.27|
## xglm-7.5B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |79.48|± | 0.84|
|xwinograd_fr| 0|acc |65.06|± | 5.27|
|xwinograd_jp| 0|acc |64.96|± | 1.54|
|xwinograd_pt| 0|acc |67.30|± | 2.90|
|xwinograd_ru| 0|acc |63.17|± | 2.72|
|xwinograd_zh| 0|acc |72.82|± | 1.98|
"""
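Write a README.md of Markdown result tables into every directory under ../results.
Usage:
python make_table_tasks.py
(No command-line options are parsed; ../results is resolved against the current working directory.)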
"""
import logging
from lm_eval import tasks
from pytablewriter import MarkdownTableWriter, LatexTableWriter
import os
import json
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def make_table(result_dict):
    """Render an evaluation result dictionary as a Markdown table string.

    ``result_dict`` must provide a ``"results"`` mapping (task name -> metric
    name -> value) and a ``"versions"`` mapping (task name -> version).
    Tasks are emitted in sorted name order, one row per metric; keys ending
    in ``_stderr`` are not rows of their own but fill the "Stderr" column of
    the metric they belong to.

    Values and standard errors are multiplied by 100 before being formatted
    with two decimals, except for the ``ppl`` metric and the ``squad2`` task,
    which are formatted unscaled (presumably because they are not fractions
    -- confirm against the evaluation harness).

    The task name and version are only shown on a task's first row; the
    following rows of the same task carry empty strings in those columns.
    """
    columns = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = list(columns)
    latex_writer.headers = list(columns)

    rows = []
    for task in sorted(result_dict["results"]):
        metrics = result_dict["results"][task]
        shown_version = result_dict["versions"][task]
        shown_task = task
        task_is_unscaled = task == "squad2"

        for metric, value in metrics.items():
            if metric.endswith("_stderr"):
                continue

            unscaled = task_is_unscaled or metric == "ppl"
            value_text = "%.2f" % (value if unscaled else value * 100)

            stderr_key = metric + "_stderr"
            if stderr_key in metrics:
                stderr = metrics[stderr_key]
                separator = "±"
                stderr_text = "%.2f" % (stderr if unscaled else stderr * 100)
            else:
                separator = ""
                stderr_text = ""

            rows.append(
                [shown_task, shown_version, metric, value_text, separator, stderr_text]
            )
            # Only the first row of a task repeats its name and version.
            shown_task = ""
            shown_version = ""

    md_writer.value_matrix = rows
    latex_writer.value_matrix = rows

    # todo: make latex table look good
    # print(latex_writer.dumps())

    return md_writer.dumps()
if __name__ == "__main__":
    # Walk the results tree (resolved against the current working directory)
    # and (re)write one README.md in every directory that contains files.
    for dirpath, _dirnames, filenames in os.walk("../results"):
        # skip dirs without files
        if not filenames:
            continue

        path_readme = os.path.join(dirpath, "README.md")
        # Heading is the last folder of the path. basename() is used instead
        # of splitting on "/" so the heading is also correct where os.walk
        # joins path components with a different separator.
        path_name = os.path.basename(dirpath)
        json_names = sorted(name for name in filenames if name.endswith(".json"))

        # The tables contain "±", so the encoding is pinned rather than left
        # to the platform default. The README is opened once and written in
        # a single pass instead of being reopened in append mode per file.
        with open(path_readme, "w", encoding="utf-8") as readme:
            readme.write(f"# {path_name} \n\n")
            for filename in json_names:
                path = os.path.join(dirpath, filename)
                with open(path, "r", encoding="utf-8") as results_file:
                    result_dict = json.load(results_file)
                readme.write(f"## {filename} \n")
                readme.write(f"{make_table(result_dict)} \n")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment