Commit 4a0b0d6e authored by lintangsutawika

Merge branch 'gakada-big-refactor-merge' into big-refactor

parents 6ae376e3 c490f165
{
"results": {
"math_counting_and_prob": {
"acc": 0.002109704641350211,
"acc_stderr": 0.00210970464135021
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_prealgebra": {
"acc": 0.001148105625717566,
"acc_stderr": 0.0011481056257175708
},
"math_geometry": {
"acc": 0.0,
"acc_stderr": 0.0
},
"drop": {
"em": 0.02097315436241611,
"em_stderr": 0.001467468637213982,
"f1": 0.04631921140939603,
"f1_stderr": 0.001664167972365937
},
"math_num_theory": {
"acc": 0.001851851851851852,
"acc_stderr": 0.0018518518518518448
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"mathqa": {
"acc": 0.2525963149078727,
"acc_stderr": 0.007954112207299597,
"acc_norm": 0.25058626465661643,
"acc_norm_stderr": 0.00793304734353984
}
},
"versions": {
"math_counting_and_prob": 1,
"math_algebra": 1,
"math_precalc": 1,
"mathqa": 0,
"gsm8k": 0,
"math_prealgebra": 1,
"math_geometry": 1,
"drop": 1,
"math_num_theory": 1,
"math_intermediate_algebra": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-3b,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
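Each result file below carries a `config` block recording how the run was launched. Assuming the big-refactor-era lm-evaluation-harness API, those keys map one-to-one onto the keyword arguments of `evaluator.simple_evaluate`; a minimal sketch (the task list here is abridged, not the exact set evaluated above):

```python
# Minimal sketch, assuming the big-refactor-era lm-evaluation-harness API.
# The kwargs mirror the "config" block above; the task list is abridged.
import json

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    model_args="pretrained=bigscience/bloom-3b,use_accelerate=True",
    tasks=["mathqa", "gsm8k", "drop"],  # abridged; the run above also covers the MATH subtasks
    num_fewshot=5,
    batch_size="auto",
    device="cuda:0",
    no_cache=True,
    bootstrap_iters=100000,
)

# Dump in the same shape as the result files in this commit.
print(json.dumps({k: results[k] for k in ("results", "versions", "config")}, indent=2))
```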
{
"results": {
"pawsx_es": {
"acc": 0.564,
"acc_stderr": 0.011091145421162657
},
"pawsx_ja": {
"acc": 0.446,
"acc_stderr": 0.011117724672834362
},
"pawsx_ko": {
"acc": 0.463,
"acc_stderr": 0.011152474561478177
},
"pawsx_zh": {
"acc": 0.471,
"acc_stderr": 0.011164310140373722
},
"pawsx_en": {
"acc": 0.568,
"acc_stderr": 0.011079231683079107
},
"pawsx_de": {
"acc": 0.546,
"acc_stderr": 0.0111357084193598
},
"pawsx_fr": {
"acc": 0.476,
"acc_stderr": 0.011170245619215438
}
},
"versions": {
"pawsx_es": 0,
"pawsx_ja": 0,
"pawsx_ko": 0,
"pawsx_zh": 0,
"pawsx_en": 0,
"pawsx_de": 0,
"pawsx_fr": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-3b",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"squad2": {
"exact": 6.914848816642803,
"f1": 11.511512971067512,
"HasAns_exact": 11.099865047233468,
"HasAns_f1": 20.306375422652543,
"NoAns_exact": 2.7417998317914214,
"NoAns_f1": 2.7417998317914214,
"best_exact": 50.07159100480081,
"best_f1": 50.08024690773861
},
"logiqa": {
"acc": 0.2073732718894009,
"acc_stderr": 0.015902084913876333,
"acc_norm": 0.29185867895545314,
"acc_norm_stderr": 0.017831570553971925
},
"headqa_en": {
"acc": 0.2840991976659373,
"acc_stderr": 0.008614040521644994,
"acc_norm": 0.3336980306345733,
"acc_norm_stderr": 0.009006537310888562
},
"truthfulqa_mc": {
"mc1": 0.23255813953488372,
"mc1_stderr": 0.014789157531080503,
"mc2": 0.40572206357204965,
"mc2_stderr": 0.014390512893375817
},
"webqs": {
"acc": 0.01673228346456693,
"acc_stderr": 0.0028461549169432184
},
"triviaqa": {
"acc": 0.04154512507734465,
"acc_stderr": 0.0018761872163031025
},
"headqa_es": {
"acc": 0.26440554339897887,
"acc_stderr": 0.008423643607316284,
"acc_norm": 0.3099927060539752,
"acc_norm_stderr": 0.008833810133604958
}
},
"versions": {
"squad2": 1,
"logiqa": 0,
"headqa_en": 0,
"truthfulqa_mc": 1,
"webqs": 0,
"triviaqa": 1,
"headqa_es": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-3b,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"race": {
"acc": 0.3521531100478469,
"acc_stderr": 0.014782629897202266
},
"coqa": {
"f1": 0.6149953283852732,
"f1_stderr": 0.017671101287646335,
"em": 0.4606666666666665,
"em_stderr": 0.020185214533701167
},
"drop": {
"em": 0.0194001677852349,
"em_stderr": 0.0014124994962717904,
"f1": 0.08879089765100667,
"f1_stderr": 0.002002230574295575
}
},
"versions": {
"race": 1,
"coqa": 1,
"drop": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-3b,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xcopa_qu": {
"acc": 0.506,
"acc_stderr": 0.022381462412439324
},
"xcopa_ta": {
"acc": 0.58,
"acc_stderr": 0.02209471322976178
},
"xcopa_it": {
"acc": 0.516,
"acc_stderr": 0.0223716109825804
},
"xcopa_vi": {
"acc": 0.688,
"acc_stderr": 0.02074059653648807
},
"xcopa_id": {
"acc": 0.692,
"acc_stderr": 0.0206670329874661
},
"xcopa_sw": {
"acc": 0.514,
"acc_stderr": 0.02237429816635319
},
"xcopa_et": {
"acc": 0.492,
"acc_stderr": 0.022380208834928028
},
"xcopa_tr": {
"acc": 0.534,
"acc_stderr": 0.02233126442325838
},
"xcopa_th": {
"acc": 0.526,
"acc_stderr": 0.02235279165091416
},
"xcopa_ht": {
"acc": 0.502,
"acc_stderr": 0.022382894986483524
},
"xcopa_zh": {
"acc": 0.62,
"acc_stderr": 0.021728881438701702
}
},
"versions": {
"xcopa_qu": 0,
"xcopa_ta": 0,
"xcopa_it": 0,
"xcopa_vi": 0,
"xcopa_id": 0,
"xcopa_sw": 0,
"xcopa_et": 0,
"xcopa_tr": 0,
"xcopa_th": 0,
"xcopa_ht": 0,
"xcopa_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-3b",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xnli_ar": {
"acc": 0.3343313373253493,
"acc_stderr": 0.006665643509474758
},
"xnli_bg": {
"acc": 0.3790419161676647,
"acc_stderr": 0.006854870366766951
},
"xnli_de": {
"acc": 0.4039920159680639,
"acc_stderr": 0.00693325085793088
},
"xnli_el": {
"acc": 0.3321357285429142,
"acc_stderr": 0.006654667909419873
},
"xnli_en": {
"acc": 0.5341317365269461,
"acc_stderr": 0.007048232775587604
},
"xnli_es": {
"acc": 0.4908183632734531,
"acc_stderr": 0.007063521183707481
},
"xnli_fr": {
"acc": 0.49181636726546907,
"acc_stderr": 0.007063766092285604
},
"xnli_hi": {
"acc": 0.4554890219560878,
"acc_stderr": 0.007036663146398998
},
"xnli_ru": {
"acc": 0.41397205588822356,
"acc_stderr": 0.006959357713092714
},
"xnli_sw": {
"acc": 0.3582834331337325,
"acc_stderr": 0.006775002711732832
},
"xnli_th": {
"acc": 0.3339321357285429,
"acc_stderr": 0.006663660032909966
},
"xnli_tr": {
"acc": 0.33812375249501,
"acc_stderr": 0.006684219795088851
},
"xnli_ur": {
"acc": 0.4,
"acc_stderr": 0.006921976252528393
},
"xnli_vi": {
"acc": 0.46506986027944114,
"acc_stderr": 0.0070474518252208835
},
"xnli_zh": {
"acc": 0.37425149700598803,
"acc_stderr": 0.006837638981887555
}
},
"versions": {
"xnli_ar": 0,
"xnli_bg": 0,
"xnli_de": 0,
"xnli_el": 0,
"xnli_en": 0,
"xnli_es": 0,
"xnli_fr": 0,
"xnli_hi": 0,
"xnli_ru": 0,
"xnli_sw": 0,
"xnli_th": 0,
"xnli_tr": 0,
"xnli_ur": 0,
"xnli_vi": 0,
"xnli_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-3b",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xstory_cloze_eu": {
"acc": 0.5565850430178689,
"acc_stderr": 0.012784462136657198
},
"xstory_cloze_en": {
"acc": 0.6677696889477167,
"acc_stderr": 0.01212116892354459
},
"xstory_cloze_hi": {
"acc": 0.5757776307081403,
"acc_stderr": 0.012718494399531051
},
"xstory_cloze_sw": {
"acc": 0.5301125082726671,
"acc_stderr": 0.012843769248432169
},
"xstory_cloze_id": {
"acc": 0.6082064857710126,
"acc_stderr": 0.012562199063960642
},
"xstory_cloze_my": {
"acc": 0.46591661151555264,
"acc_stderr": 0.012837195610619434
},
"xstory_cloze_ru": {
"acc": 0.5069490403706155,
"acc_stderr": 0.01286588257096072
},
"xstory_cloze_te": {
"acc": 0.5817339510258107,
"acc_stderr": 0.012694045150564688
},
"xstory_cloze_zh": {
"acc": 0.6088682991396426,
"acc_stderr": 0.012558411693622684
},
"xstory_cloze_ar": {
"acc": 0.5658504301786896,
"acc_stderr": 0.01275504628991221
},
"xstory_cloze_es": {
"acc": 0.6412971542025149,
"acc_stderr": 0.012342655113112371
}
},
"versions": {
"xstory_cloze_eu": 0,
"xstory_cloze_en": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_id": 0,
"xstory_cloze_my": 0,
"xstory_cloze_ru": 0,
"xstory_cloze_te": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_es": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-3b",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_en": {
"acc": 0.7909677419354839,
"acc_stderr": 0.008434667753827384
},
"xwinograd_pt": {
"acc": 0.7034220532319392,
"acc_stderr": 0.028218035233491295
},
"xwinograd_jp": {
"acc": 0.5662148070907195,
"acc_stderr": 0.016011986615263228
},
"xwinograd_fr": {
"acc": 0.7108433734939759,
"acc_stderr": 0.050066428050419186
},
"xwinograd_zh": {
"acc": 0.7361111111111112,
"acc_stderr": 0.019651614961528674
},
"xwinograd_ru": {
"acc": 0.5365079365079365,
"acc_stderr": 0.028141315964997568
}
},
"versions": {
"xwinograd_en": 0,
"xwinograd_pt": 0,
"xwinograd_jp": 0,
"xwinograd_fr": 0,
"xwinograd_zh": 0,
"xwinograd_ru": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-3b",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
# bloom-560m
## bloom-560m_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |22.44|± | 1.22|
| | |acc_norm|23.98|± | 1.25|
|arc_easy | 0|acc |47.35|± | 1.02|
| | |acc_norm|41.67|± | 1.01|
|boolq | 1|acc |55.14|± | 0.87|
|copa | 0|acc |61.00|± | 4.90|
|hellaswag | 0|acc |31.56|± | 0.46|
| | |acc_norm|36.56|± | 0.48|
|mc_taco | 0|em |17.42| | |
| | |f1 |31.43| | |
|openbookqa | 0|acc |17.20|± | 1.69|
| | |acc_norm|28.20|± | 2.01|
|piqa | 0|acc |64.09|± | 1.12|
| | |acc_norm|65.13|± | 1.11|
|prost | 0|acc |22.08|± | 0.30|
| | |acc_norm|32.08|± | 0.34|
|swag | 0|acc |40.35|± | 0.35|
| | |acc_norm|52.96|± | 0.35|
|winogrande | 0|acc |52.80|± | 1.40|
|wsc273 | 0|acc |66.67|± | 2.86|
## bloom-560m_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.53|± | 0.2|
## bloom-560m_lambada_openai_0-shot.json
| Task |Version|Metric| Value | |Stderr|
|--------------------|------:|------|------:|---|-----:|
|lambada_openai | 0|ppl | 28.68|± | 1.08|
| | |acc | 35.40|± | 0.67|
|lambada_openai_cloze| 0|ppl |6212.81|± |267.17|
| | |acc | 0.45|± | 0.09|
## bloom-560m_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.26|± | 0.11|
| | |f1 | 3.50|± | 0.14|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |22.51|± | 0.76|
| | |acc_norm|22.35|± | 0.76|
## bloom-560m_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.80|± | 1.12|
|pawsx_en| 0|acc |52.00|± | 1.12|
|pawsx_es| 0|acc |53.25|± | 1.12|
|pawsx_fr| 0|acc |47.95|± | 1.12|
|pawsx_ja| 0|acc |44.90|± | 1.11|
|pawsx_ko| 0|acc |51.90|± | 1.12|
|pawsx_zh| 0|acc |45.20|± | 1.11|
## bloom-560m_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |25.67|± | 0.83|
| | |acc_norm |29.58|± | 0.87|
|headqa_es | 0|acc |23.96|± | 0.82|
| | |acc_norm |27.17|± | 0.85|
|logiqa | 0|acc |22.58|± | 1.64|
| | |acc_norm |27.19|± | 1.75|
|squad2 | 1|exact | 0.43| | |
| | |f1 | 1.86| | |
| | |HasAns_exact| 0.76| | |
| | |HasAns_f1 | 3.62| | |
| | |NoAns_exact | 0.10| | |
| | |NoAns_f1 | 0.10| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 1.44|± | 0.11|
|truthfulqa_mc| 1|mc1 |24.48|± | 1.51|
| | |mc2 |42.43|± | 1.51|
|webqs | 0|acc | 0.84|± | 0.20|
## bloom-560m_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |22.71|± | 1.67|
| | |em |17.40|± | 1.62|
|drop| 1|em | 1.50|± | 0.12|
| | |f1 | 6.21|± | 0.17|
|race| 1|acc |30.24|± | 1.42|
## bloom-560m_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 49.0|± | 2.24|
|xcopa_ht| 0|acc | 50.2|± | 2.24|
|xcopa_id| 0|acc | 59.2|± | 2.20|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 51.6|± | 2.24|
|xcopa_ta| 0|acc | 55.8|± | 2.22|
|xcopa_th| 0|acc | 54.4|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 61.0|± | 2.18|
|xcopa_zh| 0|acc | 58.6|± | 2.20|
## bloom-560m_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.35|± | 0.67|
|xnli_bg| 0|acc |33.39|± | 0.67|
|xnli_de| 0|acc |34.79|± | 0.67|
|xnli_el| 0|acc |33.33|± | 0.67|
|xnli_en| 0|acc |49.50|± | 0.71|
|xnli_es| 0|acc |45.23|± | 0.70|
|xnli_fr| 0|acc |45.29|± | 0.70|
|xnli_hi| 0|acc |40.84|± | 0.69|
|xnli_ru| 0|acc |34.01|± | 0.67|
|xnli_sw| 0|acc |33.17|± | 0.67|
|xnli_th| 0|acc |33.57|± | 0.67|
|xnli_tr| 0|acc |33.43|± | 0.67|
|xnli_ur| 0|acc |37.13|± | 0.68|
|xnli_vi| 0|acc |40.52|± | 0.69|
|xnli_zh| 0|acc |33.95|± | 0.67|
## bloom-560m_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.08|± | 1.29|
|xstory_cloze_en| 0|acc |61.22|± | 1.25|
|xstory_cloze_es| 0|acc |55.86|± | 1.28|
|xstory_cloze_eu| 0|acc |53.61|± | 1.28|
|xstory_cloze_hi| 0|acc |55.00|± | 1.28|
|xstory_cloze_id| 0|acc |55.53|± | 1.28|
|xstory_cloze_my| 0|acc |47.19|± | 1.28|
|xstory_cloze_ru| 0|acc |49.17|± | 1.29|
|xstory_cloze_sw| 0|acc |49.83|± | 1.29|
|xstory_cloze_te| 0|acc |55.72|± | 1.28|
|xstory_cloze_zh| 0|acc |54.53|± | 1.28|
## bloom-560m_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |65.89|± | 0.98|
|xwinograd_fr| 0|acc |60.24|± | 5.40|
|xwinograd_jp| 0|acc |52.97|± | 1.61|
|xwinograd_pt| 0|acc |60.08|± | 3.03|
|xwinograd_ru| 0|acc |49.21|± | 2.82|
|xwinograd_zh| 0|acc |67.66|± | 2.09|
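The tables in this section are the markdown rendering of the raw JSON result files that follow; assuming the harness's bundled table writer, they can be regenerated along these lines (using one of the files named in the headings above):

```python
# Sketch, assuming lm-evaluation-harness's bundled markdown table writer.
import json

from lm_eval import evaluator

with open("bloom-560m_xwinograd_0-shot.json") as f:
    result_dict = json.load(f)

# make_table reads result_dict["results"] and result_dict["versions"] and
# emits a Task/Version/Metric/Value/Stderr markdown table like those above.
print(evaluator.make_table(result_dict))
```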
{
"results": {
"hellaswag": {
"acc": 0.31557458673571004,
"acc_stderr": 0.004637944965914592,
"acc_norm": 0.3655646285600478,
"acc_norm_stderr": 0.00480603903900897
},
"prost": {
"acc": 0.22080486763450044,
"acc_stderr": 0.0030304044027250577,
"acc_norm": 0.3207728437233134,
"acc_norm_stderr": 0.003410197007857463
},
"piqa": {
"acc": 0.6409140369967355,
"acc_stderr": 0.011192949073844103,
"acc_norm": 0.6512513601741022,
"acc_norm_stderr": 0.011119263056159595
},
"arc_easy": {
"acc": 0.4734848484848485,
"acc_stderr": 0.010245347015573713,
"acc_norm": 0.4166666666666667,
"acc_norm_stderr": 0.01011628297778124
},
"winogrande": {
"acc": 0.5280189423835833,
"acc_stderr": 0.014030404213405784
},
"mc_taco": {
"em": 0.17417417417417416,
"f1": 0.31427590778450365
},
"openbookqa": {
"acc": 0.172,
"acc_stderr": 0.01689386887634748,
"acc_norm": 0.282,
"acc_norm_stderr": 0.020143572847290795
},
"copa": {
"acc": 0.61,
"acc_stderr": 0.04902071300001975
},
"boolq": {
"acc": 0.5513761467889908,
"acc_stderr": 0.008698767182005265
},
"swag": {
"acc": 0.40347895631310604,
"acc_stderr": 0.003468598652499914,
"acc_norm": 0.5296411076676997,
"acc_norm_stderr": 0.003528874749486556
},
"arc_challenge": {
"acc": 0.22440273037542663,
"acc_stderr": 0.012191404938603838,
"acc_norm": 0.23976109215017063,
"acc_norm_stderr": 0.012476304127453947
},
"wsc273": {
"acc": 0.6666666666666666,
"acc_stderr": 0.028583097523751506
}
},
"versions": {
"hellaswag": 0,
"prost": 0,
"piqa": 0,
"arc_easy": 0,
"winogrande": 0,
"mc_taco": 0,
"openbookqa": 0,
"copa": 0,
"boolq": 1,
"swag": 0,
"arc_challenge": 0,
"wsc273": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.00530705079605762,
"acc_stderr": 0.002001305720948044
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"lambada_openai_cloze": {
"ppl": 6212.811566448059,
"ppl_stderr": 267.17480839849884,
"acc": 0.00446341936735882,
"acc_stderr": 0.0009286980441682362
},
"lambada_openai": {
"ppl": 28.67792043529687,
"ppl_stderr": 1.0838905590384336,
"acc": 0.35396856200271687,
"acc_stderr": 0.006662254900290991
}
},
"versions": {
"lambada_openai_cloze": 0,
"lambada_openai": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_geometry": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_prealgebra": {
"acc": 0.002296211251435132,
"acc_stderr": 0.001622733136934621
},
"drop": {
"em": 0.012583892617449664,
"em_stderr": 0.0011415560941551478,
"f1": 0.034977978187919485,
"f1_stderr": 0.0013870691301022255
},
"math_counting_and_prob": {
"acc": 0.0,
"acc_stderr": 0.0
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_num_theory": {
"acc": 0.001851851851851852,
"acc_stderr": 0.0018518518518518465
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"mathqa": {
"acc": 0.22512562814070353,
"acc_stderr": 0.00764590166234271,
"acc_norm": 0.2234505862646566,
"acc_norm_stderr": 0.00762563278617748
}
},
"versions": {
"math_intermediate_algebra": 1,
"math_geometry": 1,
"math_prealgebra": 1,
"drop": 1,
"mathqa": 0,
"math_counting_and_prob": 1,
"gsm8k": 0,
"math_num_theory": 1,
"math_precalc": 1,
"math_algebra": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"pawsx_zh": {
"acc": 0.452,
"acc_stderr": 0.011131484850525779
},
"pawsx_de": {
"acc": 0.528,
"acc_stderr": 0.01116558709462154
},
"pawsx_ja": {
"acc": 0.449,
"acc_stderr": 0.011124809242874427
},
"pawsx_en": {
"acc": 0.52,
"acc_stderr": 0.011174185930778313
},
"pawsx_fr": {
"acc": 0.4795,
"acc_stderr": 0.011173732641806813
},
"pawsx_es": {
"acc": 0.5325,
"acc_stderr": 0.011159486640120933
},
"pawsx_ko": {
"acc": 0.519,
"acc_stderr": 0.011175058879956058
}
},
"versions": {
"pawsx_zh": 0,
"pawsx_de": 0,
"pawsx_ja": 0,
"pawsx_en": 0,
"pawsx_fr": 0,
"pawsx_es": 0,
"pawsx_ko": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"logiqa": {
"acc": 0.22580645161290322,
"acc_stderr": 0.01639971378844507,
"acc_norm": 0.271889400921659,
"acc_norm_stderr": 0.01745171600943683
},
"webqs": {
"acc": 0.008366141732283465,
"acc_stderr": 0.002021079144496907
},
"triviaqa": {
"acc": 0.014408202952355696,
"acc_stderr": 0.001120427476389321
},
"truthfulqa_mc": {
"mc1": 0.24479804161566707,
"mc1_stderr": 0.01505186948671501,
"mc2": 0.42431454034377986,
"mc2_stderr": 0.015119647586282134
},
"headqa_en": {
"acc": 0.2567469000729395,
"acc_stderr": 0.008343849291869198,
"acc_norm": 0.29576951130561635,
"acc_norm_stderr": 0.008717251898361426
},
"headqa_es": {
"acc": 0.23960612691466082,
"acc_stderr": 0.008152930613263032,
"acc_norm": 0.27169948942377825,
"acc_norm_stderr": 0.00849660053640109
},
"squad2": {
"exact": 0.4295460288048513,
"f1": 1.8591225589833205,
"HasAns_exact": 0.7591093117408907,
"HasAns_f1": 3.622362034886802,
"NoAns_exact": 0.10092514718250631,
"NoAns_f1": 0.10092514718250631,
"best_exact": 50.07159100480081,
"best_f1": 50.07159100480081
}
},
"versions": {
"logiqa": 0,
"webqs": 0,
"triviaqa": 1,
"truthfulqa_mc": 1,
"headqa_en": 0,
"headqa_es": 0,
"squad2": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"drop": {
"em": 0.014995805369127516,
"em_stderr": 0.0012446395261788805,
"f1": 0.0621245805369127,
"f1_stderr": 0.001730980795797461
},
"coqa": {
"f1": 0.22712736568843772,
"f1_stderr": 0.01673094848597647,
"em": 0.174,
"em_stderr": 0.016190705499013296
},
"race": {
"acc": 0.30239234449760766,
"acc_stderr": 0.014214800395178313
}
},
"versions": {
"drop": 1,
"race": 1,
"coqa": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xcopa_id": {
"acc": 0.592,
"acc_stderr": 0.02200091089387719
},
"xcopa_ht": {
"acc": 0.502,
"acc_stderr": 0.022382894986483524
},
"xcopa_qu": {
"acc": 0.502,
"acc_stderr": 0.02238289498648353
},
"xcopa_et": {
"acc": 0.49,
"acc_stderr": 0.022378596989230785
},
"xcopa_th": {
"acc": 0.544,
"acc_stderr": 0.022296238348407056
},
"xcopa_tr": {
"acc": 0.53,
"acc_stderr": 0.02234274819250285
},
"xcopa_it": {
"acc": 0.508,
"acc_stderr": 0.02238020883492804
},
"xcopa_ta": {
"acc": 0.558,
"acc_stderr": 0.02223197069632112
},
"xcopa_sw": {
"acc": 0.516,
"acc_stderr": 0.022371610982580396
},
"xcopa_vi": {
"acc": 0.61,
"acc_stderr": 0.021834685869369208
},
"xcopa_zh": {
"acc": 0.586,
"acc_stderr": 0.022049497969827865
}
},
"versions": {
"xcopa_id": 0,
"xcopa_ht": 0,
"xcopa_qu": 0,
"xcopa_et": 0,
"xcopa_th": 0,
"xcopa_tr": 0,
"xcopa_it": 0,
"xcopa_ta": 0,
"xcopa_sw": 0,
"xcopa_vi": 0,
"xcopa_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xnli_sw": {
"acc": 0.3317365269461078,
"acc_stderr": 0.006652654857813421
},
"xnli_th": {
"acc": 0.33572854291417165,
"acc_stderr": 0.0066725434859242665
},
"xnli_ur": {
"acc": 0.3712574850299401,
"acc_stderr": 0.0068265064875345964
},
"xnli_bg": {
"acc": 0.3339321357285429,
"acc_stderr": 0.00666366003290998
},
"xnli_tr": {
"acc": 0.3343313373253493,
"acc_stderr": 0.006665643509474755
},
"xnli_zh": {
"acc": 0.3395209580838323,
"acc_stderr": 0.006690942515072474
},
"xnli_ar": {
"acc": 0.3335329341317365,
"acc_stderr": 0.006661671189931638
},
"xnli_el": {
"acc": 0.3333333333333333,
"acc_stderr": 0.006660674754535592
},
"xnli_hi": {
"acc": 0.40838323353293415,
"acc_stderr": 0.006945102706766183
},
"xnli_fr": {
"acc": 0.4528942115768463,
"acc_stderr": 0.007033289986695001
},
"xnli_es": {
"acc": 0.45229540918163674,
"acc_stderr": 0.007032484191375647
},
"xnli_vi": {
"acc": 0.405189620758483,
"acc_stderr": 0.006936540228025353
},
"xnli_en": {
"acc": 0.49500998003992014,
"acc_stderr": 0.007064360593648105
},
"xnli_de": {
"acc": 0.34790419161676644,
"acc_stderr": 0.006729921818907755
},
"xnli_ru": {
"acc": 0.3401197604790419,
"acc_stderr": 0.006693803790492355
}
},
"versions": {
"xnli_sw": 0,
"xnli_th": 0,
"xnli_ur": 0,
"xnli_bg": 0,
"xnli_tr": 0,
"xnli_zh": 0,
"xnli_ar": 0,
"xnli_el": 0,
"xnli_hi": 0,
"xnli_fr": 0,
"xnli_es": 0,
"xnli_vi": 0,
"xnli_en": 0,
"xnli_de": 0,
"xnli_ru": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xstory_cloze_es": {
"acc": 0.5585704831237591,
"acc_stderr": 0.012778538985880637
},
"xstory_cloze_hi": {
"acc": 0.5499669093315684,
"acc_stderr": 0.01280271359821983
},
"xstory_cloze_eu": {
"acc": 0.5360688285903376,
"acc_stderr": 0.012833602406620015
},
"xstory_cloze_ar": {
"acc": 0.5208471211118465,
"acc_stderr": 0.012855936282881267
},
"xstory_cloze_zh": {
"acc": 0.5453342157511581,
"acc_stderr": 0.012814127367359424
},
"xstory_cloze_te": {
"acc": 0.557246856386499,
"acc_stderr": 0.012782510750319236
},
"xstory_cloze_sw": {
"acc": 0.4983454665784249,
"acc_stderr": 0.012867054869163334
},
"xstory_cloze_ru": {
"acc": 0.49172733289212445,
"acc_stderr": 0.012865364020375405
},
"xstory_cloze_my": {
"acc": 0.47187293183322304,
"acc_stderr": 0.012846749995797694
},
"xstory_cloze_en": {
"acc": 0.6121773659827928,
"acc_stderr": 0.012539110696551456
},
"xstory_cloze_id": {
"acc": 0.5552614162806089,
"acc_stderr": 0.01278829597020778
}
},
"versions": {
"xstory_cloze_es": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_eu": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_te": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_ru": 0,
"xstory_cloze_my": 0,
"xstory_cloze_en": 0,
"xstory_cloze_id": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_en": {
"acc": 0.6589247311827957,
"acc_stderr": 0.009833881195698878
},
"xwinograd_pt": {
"acc": 0.6007604562737643,
"acc_stderr": 0.03025636835693898
},
"xwinograd_ru": {
"acc": 0.49206349206349204,
"acc_stderr": 0.028213077547815057
},
"xwinograd_fr": {
"acc": 0.6024096385542169,
"acc_stderr": 0.054045178247868114
},
"xwinograd_jp": {
"acc": 0.529718456725756,
"acc_stderr": 0.01612570703179889
},
"xwinograd_zh": {
"acc": 0.6765873015873016,
"acc_stderr": 0.020857221952855685
}
},
"versions": {
"xwinograd_en": 0,
"xwinograd_pt": 0,
"xwinograd_ru": 0,
"xwinograd_fr": 0,
"xwinograd_jp": 0,
"xwinograd_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}