Commit 21e128d8 authored by Julen Etxaniz's avatar Julen Etxaniz
Browse files

add bloom, xglm and llama results

parent 0542d35d
{
"results": {
"xstory_cloze_es": {
"acc": 0.5585704831237591,
"acc_stderr": 0.012778538985880637
},
"xstory_cloze_hi": {
"acc": 0.5499669093315684,
"acc_stderr": 0.01280271359821983
},
"xstory_cloze_eu": {
"acc": 0.5360688285903376,
"acc_stderr": 0.012833602406620015
},
"xstory_cloze_ar": {
"acc": 0.5208471211118465,
"acc_stderr": 0.012855936282881267
},
"xstory_cloze_zh": {
"acc": 0.5453342157511581,
"acc_stderr": 0.012814127367359424
},
"xstory_cloze_te": {
"acc": 0.557246856386499,
"acc_stderr": 0.012782510750319236
},
"xstory_cloze_sw": {
"acc": 0.4983454665784249,
"acc_stderr": 0.012867054869163334
},
"xstory_cloze_ru": {
"acc": 0.49172733289212445,
"acc_stderr": 0.012865364020375405
},
"xstory_cloze_my": {
"acc": 0.47187293183322304,
"acc_stderr": 0.012846749995797694
},
"xstory_cloze_en": {
"acc": 0.6121773659827928,
"acc_stderr": 0.012539110696551456
},
"xstory_cloze_id": {
"acc": 0.5552614162806089,
"acc_stderr": 0.01278829597020778
}
},
"versions": {
"xstory_cloze_es": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_eu": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_te": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_ru": 0,
"xstory_cloze_my": 0,
"xstory_cloze_en": 0,
"xstory_cloze_id": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_en": {
"acc": 0.6589247311827957,
"acc_stderr": 0.009833881195698878
},
"xwinograd_pt": {
"acc": 0.6007604562737643,
"acc_stderr": 0.03025636835693898
},
"xwinograd_ru": {
"acc": 0.49206349206349204,
"acc_stderr": 0.028213077547815057
},
"xwinograd_fr": {
"acc": 0.6024096385542169,
"acc_stderr": 0.054045178247868114
},
"xwinograd_jp": {
"acc": 0.529718456725756,
"acc_stderr": 0.01612570703179889
},
"xwinograd_zh": {
"acc": 0.6765873015873016,
"acc_stderr": 0.020857221952855685
}
},
"versions": {
"xwinograd_en": 0,
"xwinograd_pt": 0,
"xwinograd_ru": 0,
"xwinograd_fr": 0,
"xwinograd_jp": 0,
"xwinograd_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-560m",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"bigbench_disambiguation_qa": {
"multiple_choice_grade": 0.26356589147286824,
"multiple_choice_grade_stderr": 0.027481788262218698
},
"bigbench_logical_deduction_three_objects": {
"multiple_choice_grade": 0.37,
"multiple_choice_grade_stderr": 0.027921294063982
},
"bigbench_causal_judgement": {
"multiple_choice_grade": 0.5210526315789473,
"multiple_choice_grade_stderr": 0.03633739504773335
},
"bigbench_date_understanding": {
"multiple_choice_grade": 0.36585365853658536,
"multiple_choice_grade_stderr": 0.025108717905729792
},
"bigbench_navigate": {
"multiple_choice_grade": 0.499,
"multiple_choice_grade_stderr": 0.015819268290576817
},
"bigbench_salient_translation_error_detection": {
"multiple_choice_grade": 0.19138276553106212,
"multiple_choice_grade_stderr": 0.012458774650265594
},
"bigbench_temporal_sequences": {
"multiple_choice_grade": 0.248,
"multiple_choice_grade_stderr": 0.013663187134877651
},
"bigbench_tracking_shuffled_objects_seven_objects": {
"multiple_choice_grade": 0.14,
"multiple_choice_grade_stderr": 0.00829694743648913
},
"bigbench_ruin_names": {
"multiple_choice_grade": 0.34375,
"multiple_choice_grade_stderr": 0.02246478414865448
},
"bigbench_reasoning_about_colored_objects": {
"multiple_choice_grade": 0.2485,
"multiple_choice_grade_stderr": 0.009665432493822852
},
"bigbench_dyck_languages": {
"multiple_choice_grade": 0.144,
"multiple_choice_grade_stderr": 0.01110798754893915
},
"bigbench_logical_deduction_five_objects": {
"multiple_choice_grade": 0.26,
"multiple_choice_grade_stderr": 0.019635965529725512
},
"bigbench_sports_understanding": {
"multiple_choice_grade": 0.5030425963488844,
"multiple_choice_grade_stderr": 0.015931029729145698
},
"bigbench_tracking_shuffled_objects_three_objects": {
"multiple_choice_grade": 0.37,
"multiple_choice_grade_stderr": 0.027921294063982
},
"bigbench_geometric_shapes": {
"multiple_choice_grade": 0.20055710306406685,
"multiple_choice_grade_stderr": 0.021162707757982353,
"exact_str_match": 0.0,
"exact_str_match_stderr": 0.0
},
"bigbench_hyperbaton": {
"multiple_choice_grade": 0.48618,
"multiple_choice_grade_stderr": 0.0022352360227943418
},
"bigbench_logical_deduction_seven_objects": {
"multiple_choice_grade": 0.19142857142857142,
"multiple_choice_grade_stderr": 0.014880721436998012
},
"bigbench_snarks": {
"multiple_choice_grade": 0.4972375690607735,
"multiple_choice_grade_stderr": 0.037267230837657574
},
"bigbench_formal_fallacies_syllogisms_negation": {
"multiple_choice_grade": 0.5005633802816901,
"multiple_choice_grade_stderr": 0.004196051878850066
},
"bigbench_tracking_shuffled_objects_five_objects": {
"multiple_choice_grade": 0.184,
"multiple_choice_grade_stderr": 0.010964094540602657
},
"bigbench_movie_recommendation": {
"multiple_choice_grade": 0.264,
"multiple_choice_grade_stderr": 0.019732885585922087
}
},
"versions": {
"bigbench_disambiguation_qa": 0,
"bigbench_logical_deduction_three_objects": 0,
"bigbench_causal_judgement": 0,
"bigbench_date_understanding": 0,
"bigbench_navigate": 0,
"bigbench_salient_translation_error_detection": 0,
"bigbench_temporal_sequences": 0,
"bigbench_tracking_shuffled_objects_seven_objects": 0,
"bigbench_ruin_names": 0,
"bigbench_reasoning_about_colored_objects": 0,
"bigbench_dyck_languages": 0,
"bigbench_logical_deduction_five_objects": 0,
"bigbench_sports_understanding": 0,
"bigbench_tracking_shuffled_objects_three_objects": 0,
"bigbench_geometric_shapes": 0,
"bigbench_hyperbaton": 0,
"bigbench_logical_deduction_seven_objects": 0,
"bigbench_snarks": 0,
"bigbench_formal_fallacies_syllogisms_negation": 0,
"bigbench_tracking_shuffled_objects_five_objects": 0,
"bigbench_movie_recommendation": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 3,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"copa": {
"acc": 0.72,
"acc_stderr": 0.04512608598542127
},
"winogrande": {
"acc": 0.6432517758484609,
"acc_stderr": 0.013463393958028726
},
"piqa": {
"acc": 0.7274211099020674,
"acc_stderr": 0.010389256803296021,
"acc_norm": 0.7366702937976061,
"acc_norm_stderr": 0.010276185322196764
},
"arc_challenge": {
"acc": 0.3037542662116041,
"acc_stderr": 0.013438909184778757,
"acc_norm": 0.33532423208191126,
"acc_norm_stderr": 0.013796182947785564
},
"arc_easy": {
"acc": 0.6494107744107744,
"acc_stderr": 0.009791003829831557,
"acc_norm": 0.5732323232323232,
"acc_norm_stderr": 0.010149141043955626
},
"boolq": {
"acc": 0.6287461773700306,
"acc_stderr": 0.008450174658715903
},
"wsc273": {
"acc": 0.8131868131868132,
"acc_stderr": 0.023632761722644544
},
"openbookqa": {
"acc": 0.252,
"acc_stderr": 0.019435727282249536,
"acc_norm": 0.358,
"acc_norm_stderr": 0.021461434862859122
},
"prost": {
"acc": 0.26184884713919726,
"acc_stderr": 0.003211967450351038,
"acc_norm": 0.30572160546541416,
"acc_norm_stderr": 0.003365914208405272
},
"mc_taco": {
"em": 0.13588588588588588,
"f1": 0.5052611696967991
},
"hellaswag": {
"acc": 0.4623580959968134,
"acc_stderr": 0.0049756211474061025,
"acc_norm": 0.5967934674367655,
"acc_norm_stderr": 0.0048953903414456264
},
"swag": {
"acc": 0.5024992502249325,
"acc_stderr": 0.0035350478846161142,
"acc_norm": 0.6825952214335699,
"acc_norm_stderr": 0.0032909332559412758
}
},
"versions": {
"copa": 0,
"winogrande": 0,
"piqa": 0,
"arc_challenge": 0,
"arc_easy": 0,
"boolq": 1,
"wsc273": 0,
"openbookqa": 0,
"prost": 0,
"mc_taco": 0,
"hellaswag": 0,
"swag": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.018953752843062926,
"acc_stderr": 0.0037560783410314704
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"mathqa": {
"acc": 0.26566164154103855,
"acc_stderr": 0.008085616216226046,
"acc_norm": 0.26532663316582916,
"acc_norm_stderr": 0.008082359462649721
},
"math_prealgebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"drop": {
"em": 0.02506291946308725,
"em_stderr": 0.0016008246934367681,
"f1": 0.05092911073825512,
"f1_stderr": 0.0017766603696206904
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_geometry": {
"acc": 0.0,
"acc_stderr": 0.0
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_counting_and_prob": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_num_theory": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"mathqa": 0,
"math_prealgebra": 1,
"drop": 1,
"math_precalc": 1,
"math_geometry": 1,
"gsm8k": 0,
"math_counting_and_prob": 1,
"math_num_theory": 1,
"math_algebra": 1,
"math_intermediate_algebra": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"pawsx_zh": {
"acc": 0.4735,
"acc_stderr": 0.011167418260963935
},
"pawsx_de": {
"acc": 0.5285,
"acc_stderr": 0.011164954236428803
},
"pawsx_en": {
"acc": 0.613,
"acc_stderr": 0.010893798117218195
},
"pawsx_ko": {
"acc": 0.451,
"acc_stderr": 0.01112930504188632
},
"pawsx_fr": {
"acc": 0.509,
"acc_stderr": 0.011181324206260283
},
"pawsx_es": {
"acc": 0.5935,
"acc_stderr": 0.010985864536294245
},
"pawsx_ja": {
"acc": 0.4545,
"acc_stderr": 0.01113673598700373
}
},
"versions": {
"pawsx_zh": 0,
"pawsx_de": 0,
"pawsx_en": 0,
"pawsx_ko": 0,
"pawsx_fr": 0,
"pawsx_es": 0,
"pawsx_ja": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"webqs": {
"acc": 0.022637795275590553,
"acc_stderr": 0.0033005770276179373
},
"headqa_en": {
"acc": 0.31181619256017507,
"acc_stderr": 0.008848039223989218,
"acc_norm": 0.35557986870897157,
"acc_norm_stderr": 0.009143208309033068
},
"squad2": {
"exact": 7.816053230017687,
"f1": 12.640343596838946,
"HasAns_exact": 14.84480431848853,
"HasAns_f1": 24.507219892926596,
"NoAns_exact": 0.8074011774600505,
"NoAns_f1": 0.8074011774600505,
"best_exact": 50.07159100480081,
"best_f1": 50.07159100480081
},
"truthfulqa_mc": {
"mc1": 0.22399020807833536,
"mc1_stderr": 0.014594964329474202,
"mc2": 0.38898018897492265,
"mc2_stderr": 0.014014176010735629
},
"triviaqa": {
"acc": 0.055246176964554056,
"acc_stderr": 0.0021480319949071717
},
"headqa_es": {
"acc": 0.29540481400437635,
"acc_stderr": 0.008714131357853837,
"acc_norm": 0.34318016046681254,
"acc_norm_stderr": 0.009068379779817705
},
"logiqa": {
"acc": 0.20276497695852536,
"acc_stderr": 0.015770046635584564,
"acc_norm": 0.28110599078341014,
"acc_norm_stderr": 0.017632374626460005
}
},
"versions": {
"webqs": 0,
"headqa_en": 0,
"squad2": 1,
"truthfulqa_mc": 1,
"triviaqa": 1,
"headqa_es": 0,
"logiqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"coqa": {
"f1": 0.6882976860781418,
"f1_stderr": 0.016322647326969194,
"em": 0.5386666666666665,
"em_stderr": 0.01995482540089559
},
"drop": {
"em": 0.02569211409395973,
"em_stderr": 0.0016202710827118362,
"f1": 0.09853712248322138,
"f1_stderr": 0.0021424507419289577
},
"race": {
"acc": 0.36555023923444974,
"acc_stderr": 0.014904654247182307
}
},
"versions": {
"coqa": 1,
"race": 1,
"drop": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xcopa_ta": {
"acc": 0.592,
"acc_stderr": 0.02200091089387719
},
"xcopa_id": {
"acc": 0.698,
"acc_stderr": 0.02055326917420918
},
"xcopa_tr": {
"acc": 0.512,
"acc_stderr": 0.02237662679792717
},
"xcopa_th": {
"acc": 0.554,
"acc_stderr": 0.022252153078595897
},
"xcopa_ht": {
"acc": 0.508,
"acc_stderr": 0.022380208834928035
},
"xcopa_qu": {
"acc": 0.508,
"acc_stderr": 0.022380208834928035
},
"xcopa_sw": {
"acc": 0.516,
"acc_stderr": 0.0223716109825804
},
"xcopa_it": {
"acc": 0.528,
"acc_stderr": 0.022347949832668086
},
"xcopa_zh": {
"acc": 0.652,
"acc_stderr": 0.021323728632807498
},
"xcopa_et": {
"acc": 0.482,
"acc_stderr": 0.02236856511738799
},
"xcopa_vi": {
"acc": 0.708,
"acc_stderr": 0.02035437548053008
}
},
"versions": {
"xcopa_ta": 0,
"xcopa_id": 0,
"xcopa_tr": 0,
"xcopa_th": 0,
"xcopa_ht": 0,
"xcopa_qu": 0,
"xcopa_sw": 0,
"xcopa_it": 0,
"xcopa_zh": 0,
"xcopa_et": 0,
"xcopa_vi": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xnli_ar": {
"acc": 0.3383233532934132,
"acc_stderr": 0.006685184166851475
},
"xnli_bg": {
"acc": 0.3970059880239521,
"acc_stderr": 0.006913206227417221
},
"xnli_de": {
"acc": 0.39860279441117763,
"acc_stderr": 0.0069179171504068675
},
"xnli_el": {
"acc": 0.35748502994011977,
"acc_stderr": 0.006771658365506411
},
"xnli_en": {
"acc": 0.539121756487026,
"acc_stderr": 0.007043053978003474
},
"xnli_es": {
"acc": 0.4870259481037924,
"acc_stderr": 0.007062333678954121
},
"xnli_fr": {
"acc": 0.49680638722554893,
"acc_stderr": 0.00706456831954508
},
"xnli_hi": {
"acc": 0.46506986027944114,
"acc_stderr": 0.007047451825220883
},
"xnli_ru": {
"acc": 0.4305389221556886,
"acc_stderr": 0.006996208063220089
},
"xnli_sw": {
"acc": 0.37924151696606784,
"acc_stderr": 0.006855572898852684
},
"xnli_th": {
"acc": 0.3499001996007984,
"acc_stderr": 0.00673886250800537
},
"xnli_tr": {
"acc": 0.3508982035928144,
"acc_stderr": 0.00674328417575373
},
"xnli_ur": {
"acc": 0.42095808383233535,
"acc_stderr": 0.006975878576227378
},
"xnli_vi": {
"acc": 0.47045908183632734,
"acc_stderr": 0.007052371383794704
},
"xnli_zh": {
"acc": 0.35429141716566864,
"acc_stderr": 0.006758076124936785
}
},
"versions": {
"xnli_ar": 0,
"xnli_bg": 0,
"xnli_de": 0,
"xnli_el": 0,
"xnli_en": 0,
"xnli_es": 0,
"xnli_fr": 0,
"xnli_hi": 0,
"xnli_ru": 0,
"xnli_sw": 0,
"xnli_th": 0,
"xnli_tr": 0,
"xnli_ur": 0,
"xnli_vi": 0,
"xnli_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xstory_cloze_ru": {
"acc": 0.5274652547981469,
"acc_stderr": 0.012847698270388222
},
"xstory_cloze_eu": {
"acc": 0.57180675049636,
"acc_stderr": 0.012733742799515155
},
"xstory_cloze_en": {
"acc": 0.7074784910655195,
"acc_stderr": 0.011707038572975033
},
"xstory_cloze_ar": {
"acc": 0.585704831237591,
"acc_stderr": 0.012676689821720669
},
"xstory_cloze_es": {
"acc": 0.6611515552614163,
"acc_stderr": 0.012180490758739058
},
"xstory_cloze_hi": {
"acc": 0.6055592322964924,
"acc_stderr": 0.01257710651393614
},
"xstory_cloze_my": {
"acc": 0.48974189278623426,
"acc_stderr": 0.012864417047980468
},
"xstory_cloze_sw": {
"acc": 0.5393778954334878,
"acc_stderr": 0.012827159238891916
},
"xstory_cloze_zh": {
"acc": 0.6187954996690933,
"acc_stderr": 0.01249867885093408
},
"xstory_cloze_id": {
"acc": 0.6446062210456651,
"acc_stderr": 0.01231724793041837
},
"xstory_cloze_te": {
"acc": 0.5744540039708802,
"acc_stderr": 0.012723670419166324
}
},
"versions": {
"xstory_cloze_ru": 0,
"xstory_cloze_eu": 0,
"xstory_cloze_en": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_es": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_my": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_id": 0,
"xstory_cloze_te": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_fr": {
"acc": 0.7108433734939759,
"acc_stderr": 0.050066428050419186
},
"xwinograd_ru": {
"acc": 0.5682539682539682,
"acc_stderr": 0.027952495861671634
},
"xwinograd_en": {
"acc": 0.821505376344086,
"acc_stderr": 0.00794327709606643
},
"xwinograd_pt": {
"acc": 0.7680608365019012,
"acc_stderr": 0.026075593860304693
},
"xwinograd_jp": {
"acc": 0.5849843587069864,
"acc_stderr": 0.015919213413834392
},
"xwinograd_zh": {
"acc": 0.7440476190476191,
"acc_stderr": 0.019457899684028012
}
},
"versions": {
"xwinograd_fr": 0,
"xwinograd_ru": 0,
"xwinograd_en": 0,
"xwinograd_pt": 0,
"xwinograd_jp": 0,
"xwinograd_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"arithmetic_2dm": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_5ds": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_4da": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_3da": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_4ds": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_5da": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_2da": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_3ds": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_1dc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_2ds": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"arithmetic_2dm": 0,
"arithmetic_5ds": 0,
"arithmetic_4da": 0,
"arithmetic_3da": 0,
"arithmetic_4ds": 0,
"arithmetic_5da": 0,
"arithmetic_2da": 0,
"arithmetic_3ds": 0,
"arithmetic_1dc": 0,
"arithmetic_2ds": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"bigbench_sports_understanding": {
"multiple_choice_grade": 0.5811359026369168,
"multiple_choice_grade_stderr": 0.015720172474974117
},
"bigbench_salient_translation_error_detection": {
"multiple_choice_grade": 0.1933867735470942,
"multiple_choice_grade_stderr": 0.012508305339715512
},
"bigbench_date_understanding": {
"multiple_choice_grade": 0.6395663956639567,
"multiple_choice_grade_stderr": 0.025028311208714224
},
"bigbench_navigate": {
"multiple_choice_grade": 0.517,
"multiple_choice_grade_stderr": 0.015810153729833434
},
"bigbench_dyck_languages": {
"multiple_choice_grade": 0.201,
"multiple_choice_grade_stderr": 0.012679107214617324
},
"bigbench_movie_recommendation": {
"multiple_choice_grade": 0.436,
"multiple_choice_grade_stderr": 0.022198954641476802
},
"bigbench_snarks": {
"multiple_choice_grade": 0.4696132596685083,
"multiple_choice_grade_stderr": 0.03719891321680327
},
"bigbench_disambiguation_qa": {
"multiple_choice_grade": 0.4573643410852713,
"multiple_choice_grade_stderr": 0.03107554499047266
},
"bigbench_reasoning_about_colored_objects": {
"multiple_choice_grade": 0.3705,
"multiple_choice_grade_stderr": 0.010801537464907349
},
"bigbench_geometric_shapes": {
"multiple_choice_grade": 0.23119777158774374,
"multiple_choice_grade_stderr": 0.02228217728550543,
"exact_str_match": 0.0,
"exact_str_match_stderr": 0.0
},
"bigbench_tracking_shuffled_objects_five_objects": {
"multiple_choice_grade": 0.2144,
"multiple_choice_grade_stderr": 0.011612665292522431
},
"bigbench_formal_fallacies_syllogisms_negation": {
"multiple_choice_grade": 0.5113380281690141,
"multiple_choice_grade_stderr": 0.004194975590734721
},
"bigbench_tracking_shuffled_objects_three_objects": {
"multiple_choice_grade": 0.4166666666666667,
"multiple_choice_grade_stderr": 0.028511310643917567
},
"bigbench_hyperbaton": {
"multiple_choice_grade": 0.5038,
"multiple_choice_grade_stderr": 0.0022360257592931206
},
"bigbench_temporal_sequences": {
"multiple_choice_grade": 0.28,
"multiple_choice_grade_stderr": 0.014205696104091493
},
"bigbench_logical_deduction_three_objects": {
"multiple_choice_grade": 0.4166666666666667,
"multiple_choice_grade_stderr": 0.028511310643917567
},
"bigbench_causal_judgement": {
"multiple_choice_grade": 0.49473684210526314,
"multiple_choice_grade_stderr": 0.036367633377878836
},
"bigbench_tracking_shuffled_objects_seven_objects": {
"multiple_choice_grade": 0.14457142857142857,
"multiple_choice_grade_stderr": 0.008408881015830339
},
"bigbench_logical_deduction_seven_objects": {
"multiple_choice_grade": 0.22285714285714286,
"multiple_choice_grade_stderr": 0.015740739118727993
},
"bigbench_logical_deduction_five_objects": {
"multiple_choice_grade": 0.3,
"multiple_choice_grade_stderr": 0.020514426225628046
},
"bigbench_ruin_names": {
"multiple_choice_grade": 0.34598214285714285,
"multiple_choice_grade_stderr": 0.02249924183068251
}
},
"versions": {
"bigbench_sports_understanding": 0,
"bigbench_salient_translation_error_detection": 0,
"bigbench_date_understanding": 0,
"bigbench_navigate": 0,
"bigbench_dyck_languages": 0,
"bigbench_movie_recommendation": 0,
"bigbench_snarks": 0,
"bigbench_disambiguation_qa": 0,
"bigbench_reasoning_about_colored_objects": 0,
"bigbench_geometric_shapes": 0,
"bigbench_tracking_shuffled_objects_five_objects": 0,
"bigbench_formal_fallacies_syllogisms_negation": 0,
"bigbench_tracking_shuffled_objects_three_objects": 0,
"bigbench_hyperbaton": 0,
"bigbench_temporal_sequences": 0,
"bigbench_logical_deduction_three_objects": 0,
"bigbench_causal_judgement": 0,
"bigbench_tracking_shuffled_objects_seven_objects": 0,
"bigbench_logical_deduction_seven_objects": 0,
"bigbench_logical_deduction_five_objects": 0,
"bigbench_ruin_names": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 3,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"blimp_anaphor_gender_agreement": {
"acc": 0.576,
"acc_stderr": 0.015635487471405186
},
"blimp_wh_questions_subject_gap_long_distance": {
"acc": 0.408,
"acc_stderr": 0.01554920505292068
},
"blimp_expletive_it_object_raising": {
"acc": 0.619,
"acc_stderr": 0.015364734787007436
},
"blimp_npi_present_2": {
"acc": 0.39,
"acc_stderr": 0.015431725053866606
},
"blimp_sentential_negation_npi_licensor_present": {
"acc": 0.392,
"acc_stderr": 0.015445859463771302
},
"blimp_wh_vs_that_no_gap": {
"acc": 0.196,
"acc_stderr": 0.012559527926707371
},
"blimp_wh_vs_that_with_gap": {
"acc": 0.847,
"acc_stderr": 0.011389500459665546
},
"blimp_passive_2": {
"acc": 0.526,
"acc_stderr": 0.01579789775804277
},
"blimp_drop_argument": {
"acc": 0.705,
"acc_stderr": 0.014428554438445524
},
"blimp_irregular_plural_subject_verb_agreement_2": {
"acc": 0.504,
"acc_stderr": 0.015818793703510893
},
"blimp_adjunct_island": {
"acc": 0.338,
"acc_stderr": 0.014965960710224489
},
"blimp_transitive": {
"acc": 0.473,
"acc_stderr": 0.015796218551302615
},
"blimp_irregular_plural_subject_verb_agreement_1": {
"acc": 0.518,
"acc_stderr": 0.015809045699406728
},
"blimp_animate_subject_passive": {
"acc": 0.651,
"acc_stderr": 0.015080663991563098
},
"blimp_determiner_noun_agreement_1": {
"acc": 0.341,
"acc_stderr": 0.014998131348402706
},
"blimp_wh_island": {
"acc": 0.506,
"acc_stderr": 0.015818160898606715
},
"blimp_intransitive": {
"acc": 0.643,
"acc_stderr": 0.015158521721486769
},
"blimp_left_branch_island_simple_question": {
"acc": 0.411,
"acc_stderr": 0.015566673418599276
},
"blimp_irregular_past_participle_verbs": {
"acc": 0.314,
"acc_stderr": 0.01468399195108796
},
"blimp_principle_A_case_2": {
"acc": 0.443,
"acc_stderr": 0.0157161699532041
},
"blimp_principle_A_domain_3": {
"acc": 0.563,
"acc_stderr": 0.015693223928730377
},
"blimp_sentential_subject_island": {
"acc": 0.621,
"acc_stderr": 0.01534909100222535
},
"blimp_tough_vs_raising_1": {
"acc": 0.361,
"acc_stderr": 0.015195720118175127
},
"blimp_principle_A_c_command": {
"acc": 0.326,
"acc_stderr": 0.014830507204541042
},
"blimp_wh_vs_that_no_gap_long_distance": {
"acc": 0.301,
"acc_stderr": 0.014512395033543147
},
"blimp_irregular_past_participle_adjectives": {
"acc": 0.636,
"acc_stderr": 0.015222868840522019
},
"blimp_complex_NP_island": {
"acc": 0.303,
"acc_stderr": 0.014539683710535264
},
"blimp_only_npi_licensor_present": {
"acc": 0.731,
"acc_stderr": 0.014029819522568198
},
"blimp_wh_questions_subject_gap": {
"acc": 0.369,
"acc_stderr": 0.015266698139154617
},
"blimp_coordinate_structure_constraint_object_extraction": {
"acc": 0.279,
"acc_stderr": 0.014190150117612037
},
"blimp_determiner_noun_agreement_2": {
"acc": 0.361,
"acc_stderr": 0.015195720118175115
},
"blimp_ellipsis_n_bar_2": {
"acc": 0.264,
"acc_stderr": 0.01394627184944048
},
"blimp_only_npi_scope": {
"acc": 0.278,
"acc_stderr": 0.014174516461485247
},
"blimp_determiner_noun_agreement_with_adj_irregular_1": {
"acc": 0.342,
"acc_stderr": 0.015008706182121728
},
"blimp_existential_there_object_raising": {
"acc": 0.69,
"acc_stderr": 0.014632638658632902
},
"blimp_superlative_quantifiers_1": {
"acc": 0.522,
"acc_stderr": 0.015803979428161957
},
"blimp_distractor_agreement_relational_noun": {
"acc": 0.514,
"acc_stderr": 0.015813097547730987
},
"blimp_wh_vs_that_with_gap_long_distance": {
"acc": 0.692,
"acc_stderr": 0.014606483127342761
},
"blimp_determiner_noun_agreement_with_adj_2": {
"acc": 0.392,
"acc_stderr": 0.015445859463771295
},
"blimp_principle_A_domain_1": {
"acc": 0.324,
"acc_stderr": 0.01480686473373886
},
"blimp_distractor_agreement_relative_clause": {
"acc": 0.423,
"acc_stderr": 0.015630589090476345
},
"blimp_inchoative": {
"acc": 0.474,
"acc_stderr": 0.015797897758042766
},
"blimp_superlative_quantifiers_2": {
"acc": 0.714,
"acc_stderr": 0.01429714686251791
},
"blimp_tough_vs_raising_2": {
"acc": 0.642,
"acc_stderr": 0.015167928865407557
},
"blimp_principle_A_domain_2": {
"acc": 0.74,
"acc_stderr": 0.013877773329774166
},
"blimp_determiner_noun_agreement_irregular_2": {
"acc": 0.369,
"acc_stderr": 0.015266698139154614
},
"blimp_animate_subject_trans": {
"acc": 0.616,
"acc_stderr": 0.015387682761897071
},
"blimp_ellipsis_n_bar_1": {
"acc": 0.624,
"acc_stderr": 0.015325105508898134
},
"blimp_existential_there_quantifiers_1": {
"acc": 0.308,
"acc_stderr": 0.014606483127342763
},
"blimp_regular_plural_subject_verb_agreement_1": {
"acc": 0.56,
"acc_stderr": 0.01570498795436179
},
"blimp_wh_questions_object_gap": {
"acc": 0.455,
"acc_stderr": 0.01575510149834709
},
"blimp_determiner_noun_agreement_with_adj_irregular_2": {
"acc": 0.393,
"acc_stderr": 0.015452824654081496
},
"blimp_sentential_negation_npi_scope": {
"acc": 0.638,
"acc_stderr": 0.015204840912919498
},
"blimp_principle_A_case_1": {
"acc": 0.028,
"acc_stderr": 0.005219506034410047
},
"blimp_existential_there_subject_raising": {
"acc": 0.701,
"acc_stderr": 0.014484778521220482
},
"blimp_causative": {
"acc": 0.359,
"acc_stderr": 0.015177264224798597
},
"blimp_determiner_noun_agreement_with_adjective_1": {
"acc": 0.391,
"acc_stderr": 0.015438826294681783
},
"blimp_coordinate_structure_constraint_complex_left_branch": {
"acc": 0.345,
"acc_stderr": 0.015039986742055238
},
"blimp_passive_1": {
"acc": 0.529,
"acc_stderr": 0.015792669451628896
},
"blimp_npi_present_1": {
"acc": 0.304,
"acc_stderr": 0.014553205687950424
},
"blimp_left_branch_island_echo_question": {
"acc": 0.49,
"acc_stderr": 0.015816135752773207
},
"blimp_existential_there_quantifiers_2": {
"acc": 0.788,
"acc_stderr": 0.012931481864938041
},
"blimp_regular_plural_subject_verb_agreement_2": {
"acc": 0.456,
"acc_stderr": 0.01575792855397917
},
"blimp_principle_A_reconstruction": {
"acc": 0.792,
"acc_stderr": 0.012841374572096921
},
"blimp_determiner_noun_agreement_irregular_1": {
"acc": 0.356,
"acc_stderr": 0.015149042659306628
},
"blimp_matrix_question_npi_licensor_present": {
"acc": 0.548,
"acc_stderr": 0.01574623586588068
},
"blimp_anaphor_number_agreement": {
"acc": 0.565,
"acc_stderr": 0.0156850572527172
}
},
"versions": {
"blimp_anaphor_gender_agreement": 0,
"blimp_wh_questions_subject_gap_long_distance": 0,
"blimp_expletive_it_object_raising": 0,
"blimp_npi_present_2": 0,
"blimp_sentential_negation_npi_licensor_present": 0,
"blimp_wh_vs_that_no_gap": 0,
"blimp_wh_vs_that_with_gap": 0,
"blimp_passive_2": 0,
"blimp_drop_argument": 0,
"blimp_irregular_plural_subject_verb_agreement_2": 0,
"blimp_adjunct_island": 0,
"blimp_transitive": 0,
"blimp_irregular_plural_subject_verb_agreement_1": 0,
"blimp_animate_subject_passive": 0,
"blimp_determiner_noun_agreement_1": 0,
"blimp_wh_island": 0,
"blimp_intransitive": 0,
"blimp_left_branch_island_simple_question": 0,
"blimp_irregular_past_participle_verbs": 0,
"blimp_principle_A_case_2": 0,
"blimp_principle_A_domain_3": 0,
"blimp_sentential_subject_island": 0,
"blimp_tough_vs_raising_1": 0,
"blimp_principle_A_c_command": 0,
"blimp_wh_vs_that_no_gap_long_distance": 0,
"blimp_irregular_past_participle_adjectives": 0,
"blimp_complex_NP_island": 0,
"blimp_only_npi_licensor_present": 0,
"blimp_wh_questions_subject_gap": 0,
"blimp_coordinate_structure_constraint_object_extraction": 0,
"blimp_determiner_noun_agreement_2": 0,
"blimp_ellipsis_n_bar_2": 0,
"blimp_only_npi_scope": 0,
"blimp_determiner_noun_agreement_with_adj_irregular_1": 0,
"blimp_existential_there_object_raising": 0,
"blimp_superlative_quantifiers_1": 0,
"blimp_distractor_agreement_relational_noun": 0,
"blimp_wh_vs_that_with_gap_long_distance": 0,
"blimp_determiner_noun_agreement_with_adj_2": 0,
"blimp_principle_A_domain_1": 0,
"blimp_distractor_agreement_relative_clause": 0,
"blimp_inchoative": 0,
"blimp_superlative_quantifiers_2": 0,
"blimp_tough_vs_raising_2": 0,
"blimp_principle_A_domain_2": 0,
"blimp_determiner_noun_agreement_irregular_2": 0,
"blimp_animate_subject_trans": 0,
"blimp_ellipsis_n_bar_1": 0,
"blimp_existential_there_quantifiers_1": 0,
"blimp_regular_plural_subject_verb_agreement_1": 0,
"blimp_wh_questions_object_gap": 0,
"blimp_determiner_noun_agreement_with_adj_irregular_2": 0,
"blimp_sentential_negation_npi_scope": 0,
"blimp_principle_A_case_1": 0,
"blimp_existential_there_subject_raising": 0,
"blimp_causative": 0,
"blimp_determiner_noun_agreement_with_adjective_1": 0,
"blimp_coordinate_structure_constraint_complex_left_branch": 0,
"blimp_passive_1": 0,
"blimp_npi_present_1": 0,
"blimp_left_branch_island_echo_question": 0,
"blimp_existential_there_quantifiers_2": 0,
"blimp_regular_plural_subject_verb_agreement_2": 0,
"blimp_principle_A_reconstruction": 0,
"blimp_determiner_noun_agreement_irregular_1": 0,
"blimp_matrix_question_npi_licensor_present": 0,
"blimp_anaphor_number_agreement": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"arc_challenge": {
"acc": 0.439419795221843,
"acc_stderr": 0.014503747823580122,
"acc_norm": 0.4462457337883959,
"acc_norm_stderr": 0.014526705548539982
},
"prost": {
"acc": 0.2688941076003416,
"acc_stderr": 0.0032393206239968247,
"acc_norm": 0.3052412467976089,
"acc_norm_stderr": 0.003364432149066356
},
"swag": {
"acc": 0.5673298010596821,
"acc_stderr": 0.003502894135944166,
"acc_norm": 0.6934919524142757,
"acc_norm_stderr": 0.0032596605453371346
},
"arc_easy": {
"acc": 0.7457912457912458,
"acc_stderr": 0.008934537681141528,
"acc_norm": 0.5989057239057239,
"acc_norm_stderr": 0.010057051106534378
},
"boolq": {
"acc": 0.6850152905198776,
"acc_stderr": 0.00812432724981665
},
"wsc273": {
"acc": 0.8608058608058609,
"acc_stderr": 0.020988366070851
},
"mc_taco": {
"em": 0.10960960960960961,
"f1": 0.4753174430074593
},
"piqa": {
"acc": 0.7883569096844396,
"acc_stderr": 0.009530351270479397,
"acc_norm": 0.7910772578890098,
"acc_norm_stderr": 0.009485227030105093
},
"hellaswag": {
"acc": 0.5910177255526787,
"acc_stderr": 0.004906411984476791,
"acc_norm": 0.7623979286994622,
"acc_norm_stderr": 0.004247442237702478
},
"winogrande": {
"acc": 0.7016574585635359,
"acc_stderr": 0.012858885010030434
},
"copa": {
"acc": 0.9,
"acc_stderr": 0.030151134457776348
},
"openbookqa": {
"acc": 0.306,
"acc_stderr": 0.020629569998345403,
"acc_norm": 0.422,
"acc_norm_stderr": 0.022109039310618552
}
},
"versions": {
"arc_challenge": 0,
"prost": 0,
"swag": 0,
"arc_easy": 0,
"boolq": 1,
"wsc273": 0,
"mc_taco": 0,
"piqa": 0,
"hellaswag": 0,
"winogrande": 0,
"copa": 0,
"openbookqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"mnli_mismatched": {
"acc": 0.45351912123677784,
"acc_stderr": 0.005020956265665617
},
"wnli": {
"acc": 0.4647887323943662,
"acc_stderr": 0.0596130578497224
},
"sst": {
"acc": 0.6536697247706422,
"acc_stderr": 0.01612186710508361
},
"cola": {
"mcc": 0.0,
"mcc_stderr": 0.0
},
"mnli": {
"acc": 0.43555781966377993,
"acc_stderr": 0.005005063722742048
},
"qnli": {
"acc": 0.4995423759838916,
"acc_stderr": 0.006765407718154766
},
"mrpc": {
"acc": 0.6862745098039216,
"acc_stderr": 0.022999936277943434,
"f1": 0.8134110787172011,
"f1_stderr": 0.01621238238910757
},
"rte": {
"acc": 0.6534296028880866,
"acc_stderr": 0.02864445699455754
},
"qqp": {
"acc": 0.3679198614889933,
"acc_stderr": 0.0023983700314094665,
"f1": 0.5365853658536586,
"f1_stderr": 0.0025607085094365924
}
},
"versions": {
"mnli_mismatched": 0,
"wnli": 1,
"sst": 0,
"cola": 0,
"mnli": 0,
"qnli": 0,
"mrpc": 0,
"rte": 0,
"qqp": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.13570887035633056,
"acc_stderr": 0.009433577908567345
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"crows_pairs_english_autre": {
"likelihood_difference": 11.426136363636363,
"likelihood_difference_stderr": 4.353329595154678,
"pct_stereotype": 0.36363636363636365,
"pct_stereotype_stderr": 0.15212000482437738
},
"crows_pairs_french_age": {
"likelihood_difference": 13.10138888888889,
"likelihood_difference_stderr": 1.1200506582638412,
"pct_stereotype": 0.3888888888888889,
"pct_stereotype_stderr": 0.05167468693203863
},
"crows_pairs_french_disability": {
"likelihood_difference": 17.323863636363637,
"likelihood_difference_stderr": 1.824043354324447,
"pct_stereotype": 0.4090909090909091,
"pct_stereotype_stderr": 0.060983672113630656
},
"ethics_utilitarianism": {
"acc": 0.5245424292845258,
"acc_stderr": 0.007202929002919329
},
"ethics_deontology": {
"acc": 0.503337041156841,
"acc_stderr": 0.008338940677034744,
"em": 0.0011123470522803114
},
"ethics_cm": {
"acc": 0.5173745173745173,
"acc_stderr": 0.008018036537975452
},
"crows_pairs_english_age": {
"likelihood_difference": 7.710164835164835,
"likelihood_difference_stderr": 0.936561657229967,
"pct_stereotype": 0.5604395604395604,
"pct_stereotype_stderr": 0.0523181569856619
},
"crows_pairs_french_autre": {
"likelihood_difference": 9.942307692307692,
"likelihood_difference_stderr": 3.1484255128649896,
"pct_stereotype": 0.5384615384615384,
"pct_stereotype_stderr": 0.14390989949130545
},
"crows_pairs_english_gender": {
"likelihood_difference": 8.67578125,
"likelihood_difference_stderr": 0.6549450667276699,
"pct_stereotype": 0.584375,
"pct_stereotype_stderr": 0.02759315140230172
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 12.57986111111111,
"likelihood_difference_stderr": 1.5739147906459943,
"pct_stereotype": 0.5277777777777778,
"pct_stereotype_stderr": 0.05924743948371486
},
"crows_pairs_english_religion": {
"likelihood_difference": 11.8125,
"likelihood_difference_stderr": 1.175150775782876,
"pct_stereotype": 0.4954954954954955,
"pct_stereotype_stderr": 0.047671194793956616
},
"crows_pairs_english_race_color": {
"likelihood_difference": 9.858390748031496,
"likelihood_difference_stderr": 0.5056938997647007,
"pct_stereotype": 0.5019685039370079,
"pct_stereotype_stderr": 0.02220560748841351
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 12.791533801020408,
"likelihood_difference_stderr": 0.9369927660413013,
"pct_stereotype": 0.35714285714285715,
"pct_stereotype_stderr": 0.03431317581537576
},
"toxigen": {
"acc": 0.42659574468085104,
"acc_stderr": 0.01614008877637632,
"acc_norm": 0.4319148936170213,
"acc_norm_stderr": 0.016164899004911828
},
"ethics_justice": {
"acc": 0.4992603550295858,
"acc_stderr": 0.009617152578791647,
"em": 0.0014792899408284023
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 10.72244623655914,
"likelihood_difference_stderr": 1.1561263889540778,
"pct_stereotype": 0.5483870967741935,
"pct_stereotype_stderr": 0.05188393075201662
},
"crows_pairs_french_nationality": {
"likelihood_difference": 16.33102766798419,
"likelihood_difference_stderr": 0.9224360930325354,
"pct_stereotype": 0.31620553359683795,
"pct_stereotype_stderr": 0.029291880485542005
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 11.222368421052632,
"likelihood_difference_stderr": 0.7806572774635993,
"pct_stereotype": 0.5052631578947369,
"pct_stereotype_stderr": 0.036367633377878815
},
"crows_pairs_french_race_color": {
"likelihood_difference": 11.927445652173914,
"likelihood_difference_stderr": 0.5028450572837085,
"pct_stereotype": 0.35,
"pct_stereotype_stderr": 0.022263034418628928
},
"crows_pairs_english_nationality": {
"likelihood_difference": 11.848668981481481,
"likelihood_difference_stderr": 0.8342534014656857,
"pct_stereotype": 0.38425925925925924,
"pct_stereotype_stderr": 0.03317354514310742
},
"ethics_virtue": {
"acc": 0.20321608040201006,
"acc_stderr": 0.005705535674037668,
"em": 0.0
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 7.529513888888889,
"likelihood_difference_stderr": 0.8793312801173977,
"pct_stereotype": 0.4722222222222222,
"pct_stereotype_stderr": 0.05924743948371486
},
"ethics_utilitarianism_original": {
"acc": 0.9806572379367721,
"acc_stderr": 0.0019864644750587196
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 17.554945054945055,
"likelihood_difference_stderr": 1.1803100062671743,
"pct_stereotype": 0.7802197802197802,
"pct_stereotype_stderr": 0.043649726328985346
},
"crows_pairs_french_religion": {
"likelihood_difference": 11.192391304347826,
"likelihood_difference_stderr": 1.0866295680081195,
"pct_stereotype": 0.591304347826087,
"pct_stereotype_stderr": 0.04604188749503789
},
"crows_pairs_french_gender": {
"likelihood_difference": 10.791471962616823,
"likelihood_difference_stderr": 0.6767399211366819,
"pct_stereotype": 0.514018691588785,
"pct_stereotype_stderr": 0.027939861549302374
},
"crows_pairs_english_disability": {
"likelihood_difference": 12.978846153846154,
"likelihood_difference_stderr": 1.8287537323468364,
"pct_stereotype": 0.35384615384615387,
"pct_stereotype_stderr": 0.05977027026123098
}
},
"versions": {
"crows_pairs_english_autre": 0,
"crows_pairs_french_age": 0,
"crows_pairs_french_disability": 0,
"ethics_utilitarianism": 0,
"ethics_deontology": 0,
"ethics_cm": 0,
"crows_pairs_english_age": 0,
"crows_pairs_french_autre": 0,
"crows_pairs_english_gender": 0,
"crows_pairs_french_physical_appearance": 0,
"crows_pairs_english_religion": 0,
"crows_pairs_english_race_color": 0,
"crows_pairs_french_socioeconomic": 0,
"toxigen": 0,
"ethics_justice": 0,
"crows_pairs_english_sexual_orientation": 0,
"crows_pairs_french_nationality": 0,
"crows_pairs_english_socioeconomic": 0,
"crows_pairs_french_race_color": 0,
"crows_pairs_english_nationality": 0,
"ethics_virtue": 0,
"crows_pairs_english_physical_appearance": 0,
"ethics_utilitarianism_original": 0,
"crows_pairs_french_sexual_orientation": 0,
"crows_pairs_french_religion": 0,
"crows_pairs_french_gender": 0,
"crows_pairs_english_disability": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment