Commit 546fd5cd authored by Lintang Sutawika, committed by GitHub

Merge pull request #686 from EleutherAI/cleanup

[Refactor] Cleanup for `big-refactor`
{
"results": {
"xcopa_ta": {
"acc": 0.592,
"acc_stderr": 0.02200091089387719
},
"xcopa_id": {
"acc": 0.698,
"acc_stderr": 0.02055326917420918
},
"xcopa_tr": {
"acc": 0.512,
"acc_stderr": 0.02237662679792717
},
"xcopa_th": {
"acc": 0.554,
"acc_stderr": 0.022252153078595897
},
"xcopa_ht": {
"acc": 0.508,
"acc_stderr": 0.022380208834928035
},
"xcopa_qu": {
"acc": 0.508,
"acc_stderr": 0.022380208834928035
},
"xcopa_sw": {
"acc": 0.516,
"acc_stderr": 0.0223716109825804
},
"xcopa_it": {
"acc": 0.528,
"acc_stderr": 0.022347949832668086
},
"xcopa_zh": {
"acc": 0.652,
"acc_stderr": 0.021323728632807498
},
"xcopa_et": {
"acc": 0.482,
"acc_stderr": 0.02236856511738799
},
"xcopa_vi": {
"acc": 0.708,
"acc_stderr": 0.02035437548053008
}
},
"versions": {
"xcopa_ta": 0,
"xcopa_id": 0,
"xcopa_tr": 0,
"xcopa_th": 0,
"xcopa_ht": 0,
"xcopa_qu": 0,
"xcopa_sw": 0,
"xcopa_it": 0,
"xcopa_zh": 0,
"xcopa_et": 0,
"xcopa_vi": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xnli_ar": {
"acc": 0.3383233532934132,
"acc_stderr": 0.006685184166851475
},
"xnli_bg": {
"acc": 0.3970059880239521,
"acc_stderr": 0.006913206227417221
},
"xnli_de": {
"acc": 0.39860279441117763,
"acc_stderr": 0.0069179171504068675
},
"xnli_el": {
"acc": 0.35748502994011977,
"acc_stderr": 0.006771658365506411
},
"xnli_en": {
"acc": 0.539121756487026,
"acc_stderr": 0.007043053978003474
},
"xnli_es": {
"acc": 0.4870259481037924,
"acc_stderr": 0.007062333678954121
},
"xnli_fr": {
"acc": 0.49680638722554893,
"acc_stderr": 0.00706456831954508
},
"xnli_hi": {
"acc": 0.46506986027944114,
"acc_stderr": 0.007047451825220883
},
"xnli_ru": {
"acc": 0.4305389221556886,
"acc_stderr": 0.006996208063220089
},
"xnli_sw": {
"acc": 0.37924151696606784,
"acc_stderr": 0.006855572898852684
},
"xnli_th": {
"acc": 0.3499001996007984,
"acc_stderr": 0.00673886250800537
},
"xnli_tr": {
"acc": 0.3508982035928144,
"acc_stderr": 0.00674328417575373
},
"xnli_ur": {
"acc": 0.42095808383233535,
"acc_stderr": 0.006975878576227378
},
"xnli_vi": {
"acc": 0.47045908183632734,
"acc_stderr": 0.007052371383794704
},
"xnli_zh": {
"acc": 0.35429141716566864,
"acc_stderr": 0.006758076124936785
}
},
"versions": {
"xnli_ar": 0,
"xnli_bg": 0,
"xnli_de": 0,
"xnli_el": 0,
"xnli_en": 0,
"xnli_es": 0,
"xnli_fr": 0,
"xnli_hi": 0,
"xnli_ru": 0,
"xnli_sw": 0,
"xnli_th": 0,
"xnli_tr": 0,
"xnli_ur": 0,
"xnli_vi": 0,
"xnli_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xstory_cloze_ru": {
"acc": 0.5274652547981469,
"acc_stderr": 0.012847698270388222
},
"xstory_cloze_eu": {
"acc": 0.57180675049636,
"acc_stderr": 0.012733742799515155
},
"xstory_cloze_en": {
"acc": 0.7074784910655195,
"acc_stderr": 0.011707038572975033
},
"xstory_cloze_ar": {
"acc": 0.585704831237591,
"acc_stderr": 0.012676689821720669
},
"xstory_cloze_es": {
"acc": 0.6611515552614163,
"acc_stderr": 0.012180490758739058
},
"xstory_cloze_hi": {
"acc": 0.6055592322964924,
"acc_stderr": 0.01257710651393614
},
"xstory_cloze_my": {
"acc": 0.48974189278623426,
"acc_stderr": 0.012864417047980468
},
"xstory_cloze_sw": {
"acc": 0.5393778954334878,
"acc_stderr": 0.012827159238891916
},
"xstory_cloze_zh": {
"acc": 0.6187954996690933,
"acc_stderr": 0.01249867885093408
},
"xstory_cloze_id": {
"acc": 0.6446062210456651,
"acc_stderr": 0.01231724793041837
},
"xstory_cloze_te": {
"acc": 0.5744540039708802,
"acc_stderr": 0.012723670419166324
}
},
"versions": {
"xstory_cloze_ru": 0,
"xstory_cloze_eu": 0,
"xstory_cloze_en": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_es": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_my": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_id": 0,
"xstory_cloze_te": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_fr": {
"acc": 0.7108433734939759,
"acc_stderr": 0.050066428050419186
},
"xwinograd_ru": {
"acc": 0.5682539682539682,
"acc_stderr": 0.027952495861671634
},
"xwinograd_en": {
"acc": 0.821505376344086,
"acc_stderr": 0.00794327709606643
},
"xwinograd_pt": {
"acc": 0.7680608365019012,
"acc_stderr": 0.026075593860304693
},
"xwinograd_jp": {
"acc": 0.5849843587069864,
"acc_stderr": 0.015919213413834392
},
"xwinograd_zh": {
"acc": 0.7440476190476191,
"acc_stderr": 0.019457899684028012
}
},
"versions": {
"xwinograd_fr": 0,
"xwinograd_ru": 0,
"xwinograd_en": 0,
"xwinograd_pt": 0,
"xwinograd_jp": 0,
"xwinograd_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-7b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
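
Each results file above records its exact run settings in its `config` block. As a point of reference, here is a minimal sketch of how a file like the xwinograd one directly above could be regenerated, assuming the pre-refactor (v0.3-style) `lm_eval.evaluator.simple_evaluate` API, whose keyword arguments mirror the `config` keys; the output filename is illustrative.

```python
import json

from lm_eval import evaluator  # pre-refactor (v0.3-style) harness API

# Keyword arguments mirror the "config" block of the results file above.
results = evaluator.simple_evaluate(
    model="hf-causal-experimental",
    model_args="pretrained=bigscience/bloom-7b1",
    tasks=[
        "xwinograd_en", "xwinograd_fr", "xwinograd_jp",
        "xwinograd_pt", "xwinograd_ru", "xwinograd_zh",
    ],
    num_fewshot=0,
    batch_size="auto",
    device="cuda",
    no_cache=True,
    limit=None,
    bootstrap_iters=100000,
)

# The returned dict has the same "results"/"versions"/"config" layout
# as the JSON files in this PR (illustrative output path).
with open("xwinograd_bloom-7b1_0-shot.json", "w") as f:
    json.dump(results, f, indent=2)
```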
# llama-13B
## llama-13B_arithmetic_5-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------------|------:|------|----:|---|-----:|
|arithmetic_1dc| 0|acc | 0|± | 0|
|arithmetic_2da| 0|acc | 0|± | 0|
|arithmetic_2dm| 0|acc | 0|± | 0|
|arithmetic_2ds| 0|acc | 0|± | 0|
|arithmetic_3da| 0|acc | 0|± | 0|
|arithmetic_3ds| 0|acc | 0|± | 0|
|arithmetic_4da| 0|acc | 0|± | 0|
|arithmetic_4ds| 0|acc | 0|± | 0|
|arithmetic_5da| 0|acc | 0|± | 0|
|arithmetic_5ds| 0|acc | 0|± | 0|
## llama-13B_bbh_3-shot.json
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|49.47|± | 3.64|
|bigbench_date_understanding | 0|multiple_choice_grade|63.96|± | 2.50|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|45.74|± | 3.11|
|bigbench_dyck_languages | 0|multiple_choice_grade|20.10|± | 1.27|
|bigbench_formal_fallacies_syllogisms_negation | 0|multiple_choice_grade|51.13|± | 0.42|
|bigbench_geometric_shapes | 0|multiple_choice_grade|23.12|± | 2.23|
| | |exact_str_match | 0.00|± | 0.00|
|bigbench_hyperbaton | 0|multiple_choice_grade|50.38|± | 0.22|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|30.00|± | 2.05|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|22.29|± | 1.57|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|41.67|± | 2.85|
|bigbench_movie_recommendation | 0|multiple_choice_grade|43.60|± | 2.22|
|bigbench_navigate | 0|multiple_choice_grade|51.70|± | 1.58|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|37.05|± | 1.08|
|bigbench_ruin_names | 0|multiple_choice_grade|34.60|± | 2.25|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|19.34|± | 1.25|
|bigbench_snarks | 0|multiple_choice_grade|46.96|± | 3.72|
|bigbench_sports_understanding | 0|multiple_choice_grade|58.11|± | 1.57|
|bigbench_temporal_sequences | 0|multiple_choice_grade|28.00|± | 1.42|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|21.44|± | 1.16|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.46|± | 0.84|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|41.67|± | 2.85|
## llama-13B_blimp_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------------------------------------------------|------:|------|----:|---|-----:|
|blimp_adjunct_island | 0|acc | 33.8|± | 1.50|
|blimp_anaphor_gender_agreement | 0|acc | 57.6|± | 1.56|
|blimp_anaphor_number_agreement | 0|acc | 56.5|± | 1.57|
|blimp_animate_subject_passive | 0|acc | 65.1|± | 1.51|
|blimp_animate_subject_trans | 0|acc | 61.6|± | 1.54|
|blimp_causative | 0|acc | 35.9|± | 1.52|
|blimp_complex_NP_island | 0|acc | 30.3|± | 1.45|
|blimp_coordinate_structure_constraint_complex_left_branch| 0|acc | 34.5|± | 1.50|
|blimp_coordinate_structure_constraint_object_extraction | 0|acc | 27.9|± | 1.42|
|blimp_determiner_noun_agreement_1 | 0|acc | 34.1|± | 1.50|
|blimp_determiner_noun_agreement_2 | 0|acc | 36.1|± | 1.52|
|blimp_determiner_noun_agreement_irregular_1 | 0|acc | 35.6|± | 1.51|
|blimp_determiner_noun_agreement_irregular_2 | 0|acc | 36.9|± | 1.53|
|blimp_determiner_noun_agreement_with_adj_2 | 0|acc | 39.2|± | 1.54|
|blimp_determiner_noun_agreement_with_adj_irregular_1 | 0|acc | 34.2|± | 1.50|
|blimp_determiner_noun_agreement_with_adj_irregular_2 | 0|acc | 39.3|± | 1.55|
|blimp_determiner_noun_agreement_with_adjective_1 | 0|acc | 39.1|± | 1.54|
|blimp_distractor_agreement_relational_noun | 0|acc | 51.4|± | 1.58|
|blimp_distractor_agreement_relative_clause | 0|acc | 42.3|± | 1.56|
|blimp_drop_argument | 0|acc | 70.5|± | 1.44|
|blimp_ellipsis_n_bar_1 | 0|acc | 62.4|± | 1.53|
|blimp_ellipsis_n_bar_2 | 0|acc | 26.4|± | 1.39|
|blimp_existential_there_object_raising | 0|acc | 69.0|± | 1.46|
|blimp_existential_there_quantifiers_1 | 0|acc | 30.8|± | 1.46|
|blimp_existential_there_quantifiers_2 | 0|acc | 78.8|± | 1.29|
|blimp_existential_there_subject_raising | 0|acc | 70.1|± | 1.45|
|blimp_expletive_it_object_raising | 0|acc | 61.9|± | 1.54|
|blimp_inchoative | 0|acc | 47.4|± | 1.58|
|blimp_intransitive | 0|acc | 64.3|± | 1.52|
|blimp_irregular_past_participle_adjectives | 0|acc | 63.6|± | 1.52|
|blimp_irregular_past_participle_verbs | 0|acc | 31.4|± | 1.47|
|blimp_irregular_plural_subject_verb_agreement_1 | 0|acc | 51.8|± | 1.58|
|blimp_irregular_plural_subject_verb_agreement_2 | 0|acc | 50.4|± | 1.58|
|blimp_left_branch_island_echo_question | 0|acc | 49.0|± | 1.58|
|blimp_left_branch_island_simple_question | 0|acc | 41.1|± | 1.56|
|blimp_matrix_question_npi_licensor_present | 0|acc | 54.8|± | 1.57|
|blimp_npi_present_1 | 0|acc | 30.4|± | 1.46|
|blimp_npi_present_2 | 0|acc | 39.0|± | 1.54|
|blimp_only_npi_licensor_present | 0|acc | 73.1|± | 1.40|
|blimp_only_npi_scope | 0|acc | 27.8|± | 1.42|
|blimp_passive_1 | 0|acc | 52.9|± | 1.58|
|blimp_passive_2 | 0|acc | 52.6|± | 1.58|
|blimp_principle_A_c_command | 0|acc | 32.6|± | 1.48|
|blimp_principle_A_case_1 | 0|acc | 2.8|± | 0.52|
|blimp_principle_A_case_2 | 0|acc | 44.3|± | 1.57|
|blimp_principle_A_domain_1 | 0|acc | 32.4|± | 1.48|
|blimp_principle_A_domain_2 | 0|acc | 74.0|± | 1.39|
|blimp_principle_A_domain_3 | 0|acc | 56.3|± | 1.57|
|blimp_principle_A_reconstruction | 0|acc | 79.2|± | 1.28|
|blimp_regular_plural_subject_verb_agreement_1 | 0|acc | 56.0|± | 1.57|
|blimp_regular_plural_subject_verb_agreement_2 | 0|acc | 45.6|± | 1.58|
|blimp_sentential_negation_npi_licensor_present | 0|acc | 39.2|± | 1.54|
|blimp_sentential_negation_npi_scope | 0|acc | 63.8|± | 1.52|
|blimp_sentential_subject_island | 0|acc | 62.1|± | 1.53|
|blimp_superlative_quantifiers_1 | 0|acc | 52.2|± | 1.58|
|blimp_superlative_quantifiers_2 | 0|acc | 71.4|± | 1.43|
|blimp_tough_vs_raising_1 | 0|acc | 36.1|± | 1.52|
|blimp_tough_vs_raising_2 | 0|acc | 64.2|± | 1.52|
|blimp_transitive | 0|acc | 47.3|± | 1.58|
|blimp_wh_island | 0|acc | 50.6|± | 1.58|
|blimp_wh_questions_object_gap | 0|acc | 45.5|± | 1.58|
|blimp_wh_questions_subject_gap | 0|acc | 36.9|± | 1.53|
|blimp_wh_questions_subject_gap_long_distance | 0|acc | 40.8|± | 1.55|
|blimp_wh_vs_that_no_gap | 0|acc | 19.6|± | 1.26|
|blimp_wh_vs_that_no_gap_long_distance | 0|acc | 30.1|± | 1.45|
|blimp_wh_vs_that_with_gap | 0|acc | 84.7|± | 1.14|
|blimp_wh_vs_that_with_gap_long_distance | 0|acc | 69.2|± | 1.46|
## llama-13B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |43.94|± | 1.45|
| | |acc_norm|44.62|± | 1.45|
|arc_easy | 0|acc |74.58|± | 0.89|
| | |acc_norm|59.89|± | 1.01|
|boolq | 1|acc |68.50|± | 0.81|
|copa | 0|acc |90.00|± | 3.02|
|hellaswag | 0|acc |59.10|± | 0.49|
| | |acc_norm|76.24|± | 0.42|
|mc_taco | 0|em |10.96| | |
| | |f1 |47.53| | |
|openbookqa | 0|acc |30.60|± | 2.06|
| | |acc_norm|42.20|± | 2.21|
|piqa | 0|acc |78.84|± | 0.95|
| | |acc_norm|79.11|± | 0.95|
|prost | 0|acc |26.89|± | 0.32|
| | |acc_norm|30.52|± | 0.34|
|swag | 0|acc |56.73|± | 0.35|
| | |acc_norm|69.35|± | 0.33|
|winogrande | 0|acc |70.17|± | 1.29|
|wsc273 | 0|acc |86.08|± | 2.10|
## llama-13B_glue_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|cola | 0|mcc | 0.00|± | 0.00|
|mnli | 0|acc |43.56|± | 0.50|
|mnli_mismatched| 0|acc |45.35|± | 0.50|
|mrpc | 0|acc |68.63|± | 2.30|
| | |f1 |81.34|± | 1.62|
|qnli | 0|acc |49.95|± | 0.68|
|qqp | 0|acc |36.79|± | 0.24|
| | |f1 |53.66|± | 0.26|
|rte | 0|acc |65.34|± | 2.86|
|sst | 0|acc |65.37|± | 1.61|
|wnli | 1|acc |46.48|± | 5.96|
## llama-13B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc |13.57|± | 0.94|
## llama-13B_human_alignment_0-shot.json
| Task |Version| Metric | Value | |Stderr|
|---------------------------------------|------:|---------------------|------:|---|-----:|
|crows_pairs_english_age | 0|likelihood_difference| 771.02|± | 93.66|
| | |pct_stereotype | 56.04|± | 5.23|
|crows_pairs_english_autre | 0|likelihood_difference|1142.61|± |435.33|
| | |pct_stereotype | 36.36|± | 15.21|
|crows_pairs_english_disability | 0|likelihood_difference|1297.88|± |182.88|
| | |pct_stereotype | 35.38|± | 5.98|
|crows_pairs_english_gender | 0|likelihood_difference| 867.58|± | 65.49|
| | |pct_stereotype | 58.44|± | 2.76|
|crows_pairs_english_nationality | 0|likelihood_difference|1184.87|± | 83.43|
| | |pct_stereotype | 38.43|± | 3.32|
|crows_pairs_english_physical_appearance| 0|likelihood_difference| 752.95|± | 87.93|
| | |pct_stereotype | 47.22|± | 5.92|
|crows_pairs_english_race_color | 0|likelihood_difference| 985.84|± | 50.57|
| | |pct_stereotype | 50.20|± | 2.22|
|crows_pairs_english_religion | 0|likelihood_difference|1181.25|± |117.52|
| | |pct_stereotype | 49.55|± | 4.77|
|crows_pairs_english_sexual_orientation | 0|likelihood_difference|1072.24|± |115.61|
| | |pct_stereotype | 54.84|± | 5.19|
|crows_pairs_english_socioeconomic | 0|likelihood_difference|1122.24|± | 78.07|
| | |pct_stereotype | 50.53|± | 3.64|
|crows_pairs_french_age | 0|likelihood_difference|1310.14|± |112.01|
| | |pct_stereotype | 38.89|± | 5.17|
|crows_pairs_french_autre | 0|likelihood_difference| 994.23|± |314.84|
| | |pct_stereotype | 53.85|± | 14.39|
|crows_pairs_french_disability | 0|likelihood_difference|1732.39|± |182.40|
| | |pct_stereotype | 40.91|± | 6.10|
|crows_pairs_french_gender | 0|likelihood_difference|1079.15|± | 67.67|
| | |pct_stereotype | 51.40|± | 2.79|
|crows_pairs_french_nationality | 0|likelihood_difference|1633.10|± | 92.24|
| | |pct_stereotype | 31.62|± | 2.93|
|crows_pairs_french_physical_appearance | 0|likelihood_difference|1257.99|± |157.39|
| | |pct_stereotype | 52.78|± | 5.92|
|crows_pairs_french_race_color | 0|likelihood_difference|1192.74|± | 50.28|
| | |pct_stereotype | 35.00|± | 2.23|
|crows_pairs_french_religion | 0|likelihood_difference|1119.24|± |108.66|
| | |pct_stereotype | 59.13|± | 4.60|
|crows_pairs_french_sexual_orientation | 0|likelihood_difference|1755.49|± |118.03|
| | |pct_stereotype | 78.02|± | 4.36|
|crows_pairs_french_socioeconomic | 0|likelihood_difference|1279.15|± | 93.70|
| | |pct_stereotype | 35.71|± | 3.43|
|ethics_cm | 0|acc | 51.74|± | 0.80|
|ethics_deontology | 0|acc | 50.33|± | 0.83|
| | |em | 0.11| | |
|ethics_justice | 0|acc | 49.93|± | 0.96|
| | |em | 0.15| | |
|ethics_utilitarianism | 0|acc | 52.45|± | 0.72|
|ethics_utilitarianism_original | 0|acc | 98.07|± | 0.20|
|ethics_virtue | 0|acc | 20.32|± | 0.57|
| | |em | 0.00| | |
|toxigen | 0|acc | 42.66|± | 1.61|
| | |acc_norm | 43.19|± | 1.62|
## llama-13B_lambada_0-shot.json
| Task |Version|Metric| Value | | Stderr |
|----------------------|------:|------|---------:|---|--------:|
|lambada_openai | 0|ppl |1279051.05|± | 60995.63|
| | |acc | 0.00|± | 0.00|
|lambada_openai_cloze | 0|ppl | 204515.39|± | 9705.34|
| | |acc | 0.02|± | 0.02|
|lambada_openai_mt_de | 0|ppl |1310285.44|± | 71395.91|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_en | 0|ppl |1279051.05|± | 60995.63|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_es | 0|ppl |1980241.77|± |101614.20|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_fr | 0|ppl |2461448.49|± |128013.99|
| | |acc | 0.00|± | 0.00|
|lambada_openai_mt_it | 0|ppl |4091504.35|± |218020.97|
| | |acc | 0.00|± | 0.00|
|lambada_standard | 0|ppl |1409048.00|± | 47832.88|
| | |acc | 0.00|± | 0.00|
|lambada_standard_cloze| 0|ppl |4235345.03|± |132892.57|
| | |acc | 0.00|± | 0.00|
## llama-13B_mathematical_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 3.88|± | 0.20|
| | |f1 |13.99|± | 0.25|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 1.85|± | 0.39|
|math_asdiv | 0|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 1.48|± | 0.55|
|math_geometry | 1|acc | 1.25|± | 0.51|
|math_intermediate_algebra| 1|acc | 1.22|± | 0.37|
|math_num_theory | 1|acc | 1.48|± | 0.52|
|math_prealgebra | 1|acc | 2.87|± | 0.57|
|math_precalc | 1|acc | 1.10|± | 0.45|
|mathqa | 0|acc |28.44|± | 0.83|
| | |acc_norm|28.68|± | 0.83|
## llama-13B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.71|± | 0.13|
| | |f1 | 2.45|± | 0.14|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |29.98|± | 0.84|
| | |acc_norm|30.35|± | 0.84|
## llama-13B_mmlu_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------------------------------|------:|--------|----:|---|-----:|
|hendrycksTest-abstract_algebra | 0|acc |32.00|± | 4.69|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-anatomy | 0|acc |42.96|± | 4.28|
| | |acc_norm|29.63|± | 3.94|
|hendrycksTest-astronomy | 0|acc |48.03|± | 4.07|
| | |acc_norm|48.03|± | 4.07|
|hendrycksTest-business_ethics | 0|acc |53.00|± | 5.02|
| | |acc_norm|44.00|± | 4.99|
|hendrycksTest-clinical_knowledge | 0|acc |46.04|± | 3.07|
| | |acc_norm|38.49|± | 2.99|
|hendrycksTest-college_biology | 0|acc |45.83|± | 4.17|
| | |acc_norm|32.64|± | 3.92|
|hendrycksTest-college_chemistry | 0|acc |31.00|± | 4.65|
| | |acc_norm|30.00|± | 4.61|
|hendrycksTest-college_computer_science | 0|acc |33.00|± | 4.73|
| | |acc_norm|28.00|± | 4.51|
|hendrycksTest-college_mathematics | 0|acc |29.00|± | 4.56|
| | |acc_norm|34.00|± | 4.76|
|hendrycksTest-college_medicine | 0|acc |42.77|± | 3.77|
| | |acc_norm|30.06|± | 3.50|
|hendrycksTest-college_physics | 0|acc |28.43|± | 4.49|
| | |acc_norm|35.29|± | 4.76|
|hendrycksTest-computer_security | 0|acc |57.00|± | 4.98|
| | |acc_norm|44.00|± | 4.99|
|hendrycksTest-conceptual_physics | 0|acc |42.13|± | 3.23|
| | |acc_norm|24.26|± | 2.80|
|hendrycksTest-econometrics | 0|acc |27.19|± | 4.19|
| | |acc_norm|26.32|± | 4.14|
|hendrycksTest-electrical_engineering | 0|acc |41.38|± | 4.10|
| | |acc_norm|34.48|± | 3.96|
|hendrycksTest-elementary_mathematics | 0|acc |36.77|± | 2.48|
| | |acc_norm|32.80|± | 2.42|
|hendrycksTest-formal_logic | 0|acc |32.54|± | 4.19|
| | |acc_norm|34.13|± | 4.24|
|hendrycksTest-global_facts | 0|acc |34.00|± | 4.76|
| | |acc_norm|29.00|± | 4.56|
|hendrycksTest-high_school_biology | 0|acc |49.68|± | 2.84|
| | |acc_norm|36.13|± | 2.73|
|hendrycksTest-high_school_chemistry | 0|acc |31.03|± | 3.26|
| | |acc_norm|32.02|± | 3.28|
|hendrycksTest-high_school_computer_science | 0|acc |49.00|± | 5.02|
| | |acc_norm|41.00|± | 4.94|
|hendrycksTest-high_school_european_history | 0|acc |52.73|± | 3.90|
| | |acc_norm|49.70|± | 3.90|
|hendrycksTest-high_school_geography | 0|acc |57.58|± | 3.52|
| | |acc_norm|42.42|± | 3.52|
|hendrycksTest-high_school_government_and_politics| 0|acc |58.55|± | 3.56|
| | |acc_norm|38.86|± | 3.52|
|hendrycksTest-high_school_macroeconomics | 0|acc |37.69|± | 2.46|
| | |acc_norm|31.79|± | 2.36|
|hendrycksTest-high_school_mathematics | 0|acc |26.67|± | 2.70|
| | |acc_norm|31.85|± | 2.84|
|hendrycksTest-high_school_microeconomics | 0|acc |42.02|± | 3.21|
| | |acc_norm|40.76|± | 3.19|
|hendrycksTest-high_school_physics | 0|acc |27.15|± | 3.63|
| | |acc_norm|25.17|± | 3.54|
|hendrycksTest-high_school_psychology | 0|acc |60.73|± | 2.09|
| | |acc_norm|36.88|± | 2.07|
|hendrycksTest-high_school_statistics | 0|acc |38.43|± | 3.32|
| | |acc_norm|37.50|± | 3.30|
|hendrycksTest-high_school_us_history | 0|acc |52.45|± | 3.51|
| | |acc_norm|37.25|± | 3.39|
|hendrycksTest-high_school_world_history | 0|acc |49.79|± | 3.25|
| | |acc_norm|42.62|± | 3.22|
|hendrycksTest-human_aging | 0|acc |57.40|± | 3.32|
| | |acc_norm|33.63|± | 3.17|
|hendrycksTest-human_sexuality | 0|acc |54.96|± | 4.36|
| | |acc_norm|39.69|± | 4.29|
|hendrycksTest-international_law | 0|acc |56.20|± | 4.53|
| | |acc_norm|60.33|± | 4.47|
|hendrycksTest-jurisprudence | 0|acc |48.15|± | 4.83|
| | |acc_norm|50.00|± | 4.83|
|hendrycksTest-logical_fallacies | 0|acc |45.40|± | 3.91|
| | |acc_norm|36.81|± | 3.79|
|hendrycksTest-machine_learning | 0|acc |28.57|± | 4.29|
| | |acc_norm|29.46|± | 4.33|
|hendrycksTest-management | 0|acc |64.08|± | 4.75|
| | |acc_norm|41.75|± | 4.88|
|hendrycksTest-marketing | 0|acc |72.65|± | 2.92|
| | |acc_norm|61.54|± | 3.19|
|hendrycksTest-medical_genetics | 0|acc |49.00|± | 5.02|
| | |acc_norm|48.00|± | 5.02|
|hendrycksTest-miscellaneous | 0|acc |69.60|± | 1.64|
| | |acc_norm|48.53|± | 1.79|
|hendrycksTest-moral_disputes | 0|acc |44.80|± | 2.68|
| | |acc_norm|38.15|± | 2.62|
|hendrycksTest-moral_scenarios | 0|acc |28.27|± | 1.51|
| | |acc_norm|27.26|± | 1.49|
|hendrycksTest-nutrition | 0|acc |45.10|± | 2.85|
| | |acc_norm|46.73|± | 2.86|
|hendrycksTest-philosophy | 0|acc |45.98|± | 2.83|
| | |acc_norm|38.59|± | 2.76|
|hendrycksTest-prehistory | 0|acc |49.69|± | 2.78|
| | |acc_norm|34.57|± | 2.65|
|hendrycksTest-professional_accounting | 0|acc |29.79|± | 2.73|
| | |acc_norm|28.01|± | 2.68|
|hendrycksTest-professional_law | 0|acc |30.38|± | 1.17|
| | |acc_norm|30.90|± | 1.18|
|hendrycksTest-professional_medicine | 0|acc |39.34|± | 2.97|
| | |acc_norm|33.09|± | 2.86|
|hendrycksTest-professional_psychology | 0|acc |42.32|± | 2.00|
| | |acc_norm|33.01|± | 1.90|
|hendrycksTest-public_relations | 0|acc |54.55|± | 4.77|
| | |acc_norm|29.09|± | 4.35|
|hendrycksTest-security_studies | 0|acc |45.71|± | 3.19|
| | |acc_norm|37.55|± | 3.10|
|hendrycksTest-sociology | 0|acc |58.21|± | 3.49|
| | |acc_norm|45.77|± | 3.52|
|hendrycksTest-us_foreign_policy | 0|acc |68.00|± | 4.69|
| | |acc_norm|52.00|± | 5.02|
|hendrycksTest-virology | 0|acc |40.96|± | 3.83|
| | |acc_norm|30.12|± | 3.57|
|hendrycksTest-world_religions | 0|acc |74.27|± | 3.35|
| | |acc_norm|64.91|± | 3.66|
## llama-13B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |52.95|± | 1.12|
|pawsx_en| 0|acc |53.70|± | 1.12|
|pawsx_es| 0|acc |52.10|± | 1.12|
|pawsx_fr| 0|acc |54.50|± | 1.11|
|pawsx_ja| 0|acc |45.00|± | 1.11|
|pawsx_ko| 0|acc |47.05|± | 1.12|
|pawsx_zh| 0|acc |45.20|± | 1.11|
## llama-13B_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |34.43|± | 0.91|
| | |acc_norm |38.58|± | 0.93|
|headqa_es | 0|acc |30.56|± | 0.88|
| | |acc_norm |35.16|± | 0.91|
|logiqa | 0|acc |26.42|± | 1.73|
| | |acc_norm |32.10|± | 1.83|
|squad2 | 1|exact |16.44| | |
| | |f1 |24.06| | |
| | |HasAns_exact|21.09| | |
| | |HasAns_f1 |36.35| | |
| | |NoAns_exact |11.81| | |
| | |NoAns_f1 |11.81| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 0.00|± | 0.00|
|truthfulqa_mc| 1|mc1 |25.83|± | 1.53|
| | |mc2 |39.88|± | 1.37|
|webqs | 0|acc | 0.00|± | 0.00|
## llama-13B_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |77.04|± | 1.42|
| | |em |63.70|± | 1.85|
|drop| 1|em | 3.59|± | 0.19|
| | |f1 |13.38|± | 0.24|
|race| 1|acc |39.33|± | 1.51|
## llama-13B_superglue_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|boolq | 1|acc |68.44|± | 0.81|
|cb | 1|acc |48.21|± | 6.74|
| | |f1 |38.82| | |
|copa | 0|acc |90.00|± | 3.02|
|multirc| 1|acc | 1.57|± | 0.40|
|record | 0|f1 |92.32|± | 0.26|
| | |em |91.54|± | 0.28|
|wic | 0|acc |49.84|± | 1.98|
|wsc | 0|acc |35.58|± | 4.72|
## llama-13B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 48.2|± | 2.24|
|xcopa_ht| 0|acc | 52.8|± | 2.23|
|xcopa_id| 0|acc | 57.8|± | 2.21|
|xcopa_it| 0|acc | 67.2|± | 2.10|
|xcopa_qu| 0|acc | 50.2|± | 2.24|
|xcopa_sw| 0|acc | 51.2|± | 2.24|
|xcopa_ta| 0|acc | 54.4|± | 2.23|
|xcopa_th| 0|acc | 54.6|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 53.8|± | 2.23|
|xcopa_zh| 0|acc | 58.4|± | 2.21|
## llama-13B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |34.07|± | 0.67|
|xnli_bg| 0|acc |34.21|± | 0.67|
|xnli_de| 0|acc |35.25|± | 0.68|
|xnli_el| 0|acc |34.69|± | 0.67|
|xnli_en| 0|acc |35.63|± | 0.68|
|xnli_es| 0|acc |33.49|± | 0.67|
|xnli_fr| 0|acc |33.49|± | 0.67|
|xnli_hi| 0|acc |35.59|± | 0.68|
|xnli_ru| 0|acc |33.79|± | 0.67|
|xnli_sw| 0|acc |33.15|± | 0.67|
|xnli_th| 0|acc |34.83|± | 0.67|
|xnli_tr| 0|acc |33.99|± | 0.67|
|xnli_ur| 0|acc |34.21|± | 0.67|
|xnli_vi| 0|acc |34.21|± | 0.67|
|xnli_zh| 0|acc |34.47|± | 0.67|
## llama-13B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |49.70|± | 1.29|
|xstory_cloze_en| 0|acc |77.30|± | 1.08|
|xstory_cloze_es| 0|acc |69.42|± | 1.19|
|xstory_cloze_eu| 0|acc |50.69|± | 1.29|
|xstory_cloze_hi| 0|acc |52.35|± | 1.29|
|xstory_cloze_id| 0|acc |55.26|± | 1.28|
|xstory_cloze_my| 0|acc |47.78|± | 1.29|
|xstory_cloze_ru| 0|acc |63.40|± | 1.24|
|xstory_cloze_sw| 0|acc |49.90|± | 1.29|
|xstory_cloze_te| 0|acc |53.34|± | 1.28|
|xstory_cloze_zh| 0|acc |56.45|± | 1.28|
## llama-13B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |86.75|± | 0.70|
|xwinograd_fr| 0|acc |68.67|± | 5.12|
|xwinograd_jp| 0|acc |59.85|± | 1.58|
|xwinograd_pt| 0|acc |71.48|± | 2.79|
|xwinograd_ru| 0|acc |70.79|± | 2.57|
|xwinograd_zh| 0|acc |70.04|± | 2.04|
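
The summary tables above are a flattened rendering of raw result files like the ones that follow. Below is a rough sketch of that conversion; the helper name and input filename are illustrative and this is not the harness's own table writer. It scales most metrics to percentages, as in the tables above, and leaves perplexities unscaled.

```python
import json


def results_to_markdown(path: str) -> str:
    """Render a results JSON file into a table in the style used above:
    | Task |Version|Metric|Value| |Stderr|"""
    with open(path) as f:
        data = json.load(f)

    lines = [
        "| Task |Version|Metric|Value| |Stderr|",
        "|------|------:|------|----:|---|-----:|",
    ]
    for task in sorted(data["results"]):
        metrics = data["results"][task]
        version = data["versions"][task]
        first = True
        for metric, value in metrics.items():
            if metric.endswith("_stderr"):
                continue  # stderr is emitted alongside its parent metric
            # Tables above show most metrics as percentages; ppl stays raw.
            scale = 1 if metric == "ppl" else 100
            stderr = metrics.get(f"{metric}_stderr")
            cell_task = task if first else ""
            cell_version = str(version) if first else ""
            pm = "±" if stderr is not None else ""
            cell_stderr = f"{scale * stderr:.2f}" if stderr is not None else ""
            lines.append(
                f"|{cell_task}|{cell_version}|{metric}|{scale * value:.2f}|{pm}|{cell_stderr}|"
            )
            first = False
    return "\n".join(lines)


# Example usage (filename illustrative):
# print(results_to_markdown("llama-13B_xwinograd_0-shot.json"))
```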
{
"results": {
"arithmetic_2dm": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_5ds": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_4da": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_3da": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_4ds": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_5da": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_2da": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_3ds": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_1dc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"arithmetic_2ds": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"arithmetic_2dm": 0,
"arithmetic_5ds": 0,
"arithmetic_4da": 0,
"arithmetic_3da": 0,
"arithmetic_4ds": 0,
"arithmetic_5da": 0,
"arithmetic_2da": 0,
"arithmetic_3ds": 0,
"arithmetic_1dc": 0,
"arithmetic_2ds": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"bigbench_sports_understanding": {
"multiple_choice_grade": 0.5811359026369168,
"multiple_choice_grade_stderr": 0.015720172474974117
},
"bigbench_salient_translation_error_detection": {
"multiple_choice_grade": 0.1933867735470942,
"multiple_choice_grade_stderr": 0.012508305339715512
},
"bigbench_date_understanding": {
"multiple_choice_grade": 0.6395663956639567,
"multiple_choice_grade_stderr": 0.025028311208714224
},
"bigbench_navigate": {
"multiple_choice_grade": 0.517,
"multiple_choice_grade_stderr": 0.015810153729833434
},
"bigbench_dyck_languages": {
"multiple_choice_grade": 0.201,
"multiple_choice_grade_stderr": 0.012679107214617324
},
"bigbench_movie_recommendation": {
"multiple_choice_grade": 0.436,
"multiple_choice_grade_stderr": 0.022198954641476802
},
"bigbench_snarks": {
"multiple_choice_grade": 0.4696132596685083,
"multiple_choice_grade_stderr": 0.03719891321680327
},
"bigbench_disambiguation_qa": {
"multiple_choice_grade": 0.4573643410852713,
"multiple_choice_grade_stderr": 0.03107554499047266
},
"bigbench_reasoning_about_colored_objects": {
"multiple_choice_grade": 0.3705,
"multiple_choice_grade_stderr": 0.010801537464907349
},
"bigbench_geometric_shapes": {
"multiple_choice_grade": 0.23119777158774374,
"multiple_choice_grade_stderr": 0.02228217728550543,
"exact_str_match": 0.0,
"exact_str_match_stderr": 0.0
},
"bigbench_tracking_shuffled_objects_five_objects": {
"multiple_choice_grade": 0.2144,
"multiple_choice_grade_stderr": 0.011612665292522431
},
"bigbench_formal_fallacies_syllogisms_negation": {
"multiple_choice_grade": 0.5113380281690141,
"multiple_choice_grade_stderr": 0.004194975590734721
},
"bigbench_tracking_shuffled_objects_three_objects": {
"multiple_choice_grade": 0.4166666666666667,
"multiple_choice_grade_stderr": 0.028511310643917567
},
"bigbench_hyperbaton": {
"multiple_choice_grade": 0.5038,
"multiple_choice_grade_stderr": 0.0022360257592931206
},
"bigbench_temporal_sequences": {
"multiple_choice_grade": 0.28,
"multiple_choice_grade_stderr": 0.014205696104091493
},
"bigbench_logical_deduction_three_objects": {
"multiple_choice_grade": 0.4166666666666667,
"multiple_choice_grade_stderr": 0.028511310643917567
},
"bigbench_causal_judgement": {
"multiple_choice_grade": 0.49473684210526314,
"multiple_choice_grade_stderr": 0.036367633377878836
},
"bigbench_tracking_shuffled_objects_seven_objects": {
"multiple_choice_grade": 0.14457142857142857,
"multiple_choice_grade_stderr": 0.008408881015830339
},
"bigbench_logical_deduction_seven_objects": {
"multiple_choice_grade": 0.22285714285714286,
"multiple_choice_grade_stderr": 0.015740739118727993
},
"bigbench_logical_deduction_five_objects": {
"multiple_choice_grade": 0.3,
"multiple_choice_grade_stderr": 0.020514426225628046
},
"bigbench_ruin_names": {
"multiple_choice_grade": 0.34598214285714285,
"multiple_choice_grade_stderr": 0.02249924183068251
}
},
"versions": {
"bigbench_sports_understanding": 0,
"bigbench_salient_translation_error_detection": 0,
"bigbench_date_understanding": 0,
"bigbench_navigate": 0,
"bigbench_dyck_languages": 0,
"bigbench_movie_recommendation": 0,
"bigbench_snarks": 0,
"bigbench_disambiguation_qa": 0,
"bigbench_reasoning_about_colored_objects": 0,
"bigbench_geometric_shapes": 0,
"bigbench_tracking_shuffled_objects_five_objects": 0,
"bigbench_formal_fallacies_syllogisms_negation": 0,
"bigbench_tracking_shuffled_objects_three_objects": 0,
"bigbench_hyperbaton": 0,
"bigbench_temporal_sequences": 0,
"bigbench_logical_deduction_three_objects": 0,
"bigbench_causal_judgement": 0,
"bigbench_tracking_shuffled_objects_seven_objects": 0,
"bigbench_logical_deduction_seven_objects": 0,
"bigbench_logical_deduction_five_objects": 0,
"bigbench_ruin_names": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 3,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"blimp_anaphor_gender_agreement": {
"acc": 0.576,
"acc_stderr": 0.015635487471405186
},
"blimp_wh_questions_subject_gap_long_distance": {
"acc": 0.408,
"acc_stderr": 0.01554920505292068
},
"blimp_expletive_it_object_raising": {
"acc": 0.619,
"acc_stderr": 0.015364734787007436
},
"blimp_npi_present_2": {
"acc": 0.39,
"acc_stderr": 0.015431725053866606
},
"blimp_sentential_negation_npi_licensor_present": {
"acc": 0.392,
"acc_stderr": 0.015445859463771302
},
"blimp_wh_vs_that_no_gap": {
"acc": 0.196,
"acc_stderr": 0.012559527926707371
},
"blimp_wh_vs_that_with_gap": {
"acc": 0.847,
"acc_stderr": 0.011389500459665546
},
"blimp_passive_2": {
"acc": 0.526,
"acc_stderr": 0.01579789775804277
},
"blimp_drop_argument": {
"acc": 0.705,
"acc_stderr": 0.014428554438445524
},
"blimp_irregular_plural_subject_verb_agreement_2": {
"acc": 0.504,
"acc_stderr": 0.015818793703510893
},
"blimp_adjunct_island": {
"acc": 0.338,
"acc_stderr": 0.014965960710224489
},
"blimp_transitive": {
"acc": 0.473,
"acc_stderr": 0.015796218551302615
},
"blimp_irregular_plural_subject_verb_agreement_1": {
"acc": 0.518,
"acc_stderr": 0.015809045699406728
},
"blimp_animate_subject_passive": {
"acc": 0.651,
"acc_stderr": 0.015080663991563098
},
"blimp_determiner_noun_agreement_1": {
"acc": 0.341,
"acc_stderr": 0.014998131348402706
},
"blimp_wh_island": {
"acc": 0.506,
"acc_stderr": 0.015818160898606715
},
"blimp_intransitive": {
"acc": 0.643,
"acc_stderr": 0.015158521721486769
},
"blimp_left_branch_island_simple_question": {
"acc": 0.411,
"acc_stderr": 0.015566673418599276
},
"blimp_irregular_past_participle_verbs": {
"acc": 0.314,
"acc_stderr": 0.01468399195108796
},
"blimp_principle_A_case_2": {
"acc": 0.443,
"acc_stderr": 0.0157161699532041
},
"blimp_principle_A_domain_3": {
"acc": 0.563,
"acc_stderr": 0.015693223928730377
},
"blimp_sentential_subject_island": {
"acc": 0.621,
"acc_stderr": 0.01534909100222535
},
"blimp_tough_vs_raising_1": {
"acc": 0.361,
"acc_stderr": 0.015195720118175127
},
"blimp_principle_A_c_command": {
"acc": 0.326,
"acc_stderr": 0.014830507204541042
},
"blimp_wh_vs_that_no_gap_long_distance": {
"acc": 0.301,
"acc_stderr": 0.014512395033543147
},
"blimp_irregular_past_participle_adjectives": {
"acc": 0.636,
"acc_stderr": 0.015222868840522019
},
"blimp_complex_NP_island": {
"acc": 0.303,
"acc_stderr": 0.014539683710535264
},
"blimp_only_npi_licensor_present": {
"acc": 0.731,
"acc_stderr": 0.014029819522568198
},
"blimp_wh_questions_subject_gap": {
"acc": 0.369,
"acc_stderr": 0.015266698139154617
},
"blimp_coordinate_structure_constraint_object_extraction": {
"acc": 0.279,
"acc_stderr": 0.014190150117612037
},
"blimp_determiner_noun_agreement_2": {
"acc": 0.361,
"acc_stderr": 0.015195720118175115
},
"blimp_ellipsis_n_bar_2": {
"acc": 0.264,
"acc_stderr": 0.01394627184944048
},
"blimp_only_npi_scope": {
"acc": 0.278,
"acc_stderr": 0.014174516461485247
},
"blimp_determiner_noun_agreement_with_adj_irregular_1": {
"acc": 0.342,
"acc_stderr": 0.015008706182121728
},
"blimp_existential_there_object_raising": {
"acc": 0.69,
"acc_stderr": 0.014632638658632902
},
"blimp_superlative_quantifiers_1": {
"acc": 0.522,
"acc_stderr": 0.015803979428161957
},
"blimp_distractor_agreement_relational_noun": {
"acc": 0.514,
"acc_stderr": 0.015813097547730987
},
"blimp_wh_vs_that_with_gap_long_distance": {
"acc": 0.692,
"acc_stderr": 0.014606483127342761
},
"blimp_determiner_noun_agreement_with_adj_2": {
"acc": 0.392,
"acc_stderr": 0.015445859463771295
},
"blimp_principle_A_domain_1": {
"acc": 0.324,
"acc_stderr": 0.01480686473373886
},
"blimp_distractor_agreement_relative_clause": {
"acc": 0.423,
"acc_stderr": 0.015630589090476345
},
"blimp_inchoative": {
"acc": 0.474,
"acc_stderr": 0.015797897758042766
},
"blimp_superlative_quantifiers_2": {
"acc": 0.714,
"acc_stderr": 0.01429714686251791
},
"blimp_tough_vs_raising_2": {
"acc": 0.642,
"acc_stderr": 0.015167928865407557
},
"blimp_principle_A_domain_2": {
"acc": 0.74,
"acc_stderr": 0.013877773329774166
},
"blimp_determiner_noun_agreement_irregular_2": {
"acc": 0.369,
"acc_stderr": 0.015266698139154614
},
"blimp_animate_subject_trans": {
"acc": 0.616,
"acc_stderr": 0.015387682761897071
},
"blimp_ellipsis_n_bar_1": {
"acc": 0.624,
"acc_stderr": 0.015325105508898134
},
"blimp_existential_there_quantifiers_1": {
"acc": 0.308,
"acc_stderr": 0.014606483127342763
},
"blimp_regular_plural_subject_verb_agreement_1": {
"acc": 0.56,
"acc_stderr": 0.01570498795436179
},
"blimp_wh_questions_object_gap": {
"acc": 0.455,
"acc_stderr": 0.01575510149834709
},
"blimp_determiner_noun_agreement_with_adj_irregular_2": {
"acc": 0.393,
"acc_stderr": 0.015452824654081496
},
"blimp_sentential_negation_npi_scope": {
"acc": 0.638,
"acc_stderr": 0.015204840912919498
},
"blimp_principle_A_case_1": {
"acc": 0.028,
"acc_stderr": 0.005219506034410047
},
"blimp_existential_there_subject_raising": {
"acc": 0.701,
"acc_stderr": 0.014484778521220482
},
"blimp_causative": {
"acc": 0.359,
"acc_stderr": 0.015177264224798597
},
"blimp_determiner_noun_agreement_with_adjective_1": {
"acc": 0.391,
"acc_stderr": 0.015438826294681783
},
"blimp_coordinate_structure_constraint_complex_left_branch": {
"acc": 0.345,
"acc_stderr": 0.015039986742055238
},
"blimp_passive_1": {
"acc": 0.529,
"acc_stderr": 0.015792669451628896
},
"blimp_npi_present_1": {
"acc": 0.304,
"acc_stderr": 0.014553205687950424
},
"blimp_left_branch_island_echo_question": {
"acc": 0.49,
"acc_stderr": 0.015816135752773207
},
"blimp_existential_there_quantifiers_2": {
"acc": 0.788,
"acc_stderr": 0.012931481864938041
},
"blimp_regular_plural_subject_verb_agreement_2": {
"acc": 0.456,
"acc_stderr": 0.01575792855397917
},
"blimp_principle_A_reconstruction": {
"acc": 0.792,
"acc_stderr": 0.012841374572096921
},
"blimp_determiner_noun_agreement_irregular_1": {
"acc": 0.356,
"acc_stderr": 0.015149042659306628
},
"blimp_matrix_question_npi_licensor_present": {
"acc": 0.548,
"acc_stderr": 0.01574623586588068
},
"blimp_anaphor_number_agreement": {
"acc": 0.565,
"acc_stderr": 0.0156850572527172
}
},
"versions": {
"blimp_anaphor_gender_agreement": 0,
"blimp_wh_questions_subject_gap_long_distance": 0,
"blimp_expletive_it_object_raising": 0,
"blimp_npi_present_2": 0,
"blimp_sentential_negation_npi_licensor_present": 0,
"blimp_wh_vs_that_no_gap": 0,
"blimp_wh_vs_that_with_gap": 0,
"blimp_passive_2": 0,
"blimp_drop_argument": 0,
"blimp_irregular_plural_subject_verb_agreement_2": 0,
"blimp_adjunct_island": 0,
"blimp_transitive": 0,
"blimp_irregular_plural_subject_verb_agreement_1": 0,
"blimp_animate_subject_passive": 0,
"blimp_determiner_noun_agreement_1": 0,
"blimp_wh_island": 0,
"blimp_intransitive": 0,
"blimp_left_branch_island_simple_question": 0,
"blimp_irregular_past_participle_verbs": 0,
"blimp_principle_A_case_2": 0,
"blimp_principle_A_domain_3": 0,
"blimp_sentential_subject_island": 0,
"blimp_tough_vs_raising_1": 0,
"blimp_principle_A_c_command": 0,
"blimp_wh_vs_that_no_gap_long_distance": 0,
"blimp_irregular_past_participle_adjectives": 0,
"blimp_complex_NP_island": 0,
"blimp_only_npi_licensor_present": 0,
"blimp_wh_questions_subject_gap": 0,
"blimp_coordinate_structure_constraint_object_extraction": 0,
"blimp_determiner_noun_agreement_2": 0,
"blimp_ellipsis_n_bar_2": 0,
"blimp_only_npi_scope": 0,
"blimp_determiner_noun_agreement_with_adj_irregular_1": 0,
"blimp_existential_there_object_raising": 0,
"blimp_superlative_quantifiers_1": 0,
"blimp_distractor_agreement_relational_noun": 0,
"blimp_wh_vs_that_with_gap_long_distance": 0,
"blimp_determiner_noun_agreement_with_adj_2": 0,
"blimp_principle_A_domain_1": 0,
"blimp_distractor_agreement_relative_clause": 0,
"blimp_inchoative": 0,
"blimp_superlative_quantifiers_2": 0,
"blimp_tough_vs_raising_2": 0,
"blimp_principle_A_domain_2": 0,
"blimp_determiner_noun_agreement_irregular_2": 0,
"blimp_animate_subject_trans": 0,
"blimp_ellipsis_n_bar_1": 0,
"blimp_existential_there_quantifiers_1": 0,
"blimp_regular_plural_subject_verb_agreement_1": 0,
"blimp_wh_questions_object_gap": 0,
"blimp_determiner_noun_agreement_with_adj_irregular_2": 0,
"blimp_sentential_negation_npi_scope": 0,
"blimp_principle_A_case_1": 0,
"blimp_existential_there_subject_raising": 0,
"blimp_causative": 0,
"blimp_determiner_noun_agreement_with_adjective_1": 0,
"blimp_coordinate_structure_constraint_complex_left_branch": 0,
"blimp_passive_1": 0,
"blimp_npi_present_1": 0,
"blimp_left_branch_island_echo_question": 0,
"blimp_existential_there_quantifiers_2": 0,
"blimp_regular_plural_subject_verb_agreement_2": 0,
"blimp_principle_A_reconstruction": 0,
"blimp_determiner_noun_agreement_irregular_1": 0,
"blimp_matrix_question_npi_licensor_present": 0,
"blimp_anaphor_number_agreement": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"arc_challenge": {
"acc": 0.439419795221843,
"acc_stderr": 0.014503747823580122,
"acc_norm": 0.4462457337883959,
"acc_norm_stderr": 0.014526705548539982
},
"prost": {
"acc": 0.2688941076003416,
"acc_stderr": 0.0032393206239968247,
"acc_norm": 0.3052412467976089,
"acc_norm_stderr": 0.003364432149066356
},
"swag": {
"acc": 0.5673298010596821,
"acc_stderr": 0.003502894135944166,
"acc_norm": 0.6934919524142757,
"acc_norm_stderr": 0.0032596605453371346
},
"arc_easy": {
"acc": 0.7457912457912458,
"acc_stderr": 0.008934537681141528,
"acc_norm": 0.5989057239057239,
"acc_norm_stderr": 0.010057051106534378
},
"boolq": {
"acc": 0.6850152905198776,
"acc_stderr": 0.00812432724981665
},
"wsc273": {
"acc": 0.8608058608058609,
"acc_stderr": 0.020988366070851
},
"mc_taco": {
"em": 0.10960960960960961,
"f1": 0.4753174430074593
},
"piqa": {
"acc": 0.7883569096844396,
"acc_stderr": 0.009530351270479397,
"acc_norm": 0.7910772578890098,
"acc_norm_stderr": 0.009485227030105093
},
"hellaswag": {
"acc": 0.5910177255526787,
"acc_stderr": 0.004906411984476791,
"acc_norm": 0.7623979286994622,
"acc_norm_stderr": 0.004247442237702478
},
"winogrande": {
"acc": 0.7016574585635359,
"acc_stderr": 0.012858885010030434
},
"copa": {
"acc": 0.9,
"acc_stderr": 0.030151134457776348
},
"openbookqa": {
"acc": 0.306,
"acc_stderr": 0.020629569998345403,
"acc_norm": 0.422,
"acc_norm_stderr": 0.022109039310618552
}
},
"versions": {
"arc_challenge": 0,
"prost": 0,
"swag": 0,
"arc_easy": 0,
"boolq": 1,
"wsc273": 0,
"mc_taco": 0,
"piqa": 0,
"hellaswag": 0,
"winogrande": 0,
"copa": 0,
"openbookqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"mnli_mismatched": {
"acc": 0.45351912123677784,
"acc_stderr": 0.005020956265665617
},
"wnli": {
"acc": 0.4647887323943662,
"acc_stderr": 0.0596130578497224
},
"sst": {
"acc": 0.6536697247706422,
"acc_stderr": 0.01612186710508361
},
"cola": {
"mcc": 0.0,
"mcc_stderr": 0.0
},
"mnli": {
"acc": 0.43555781966377993,
"acc_stderr": 0.005005063722742048
},
"qnli": {
"acc": 0.4995423759838916,
"acc_stderr": 0.006765407718154766
},
"mrpc": {
"acc": 0.6862745098039216,
"acc_stderr": 0.022999936277943434,
"f1": 0.8134110787172011,
"f1_stderr": 0.01621238238910757
},
"rte": {
"acc": 0.6534296028880866,
"acc_stderr": 0.02864445699455754
},
"qqp": {
"acc": 0.3679198614889933,
"acc_stderr": 0.0023983700314094665,
"f1": 0.5365853658536586,
"f1_stderr": 0.0025607085094365924
}
},
"versions": {
"mnli_mismatched": 0,
"wnli": 1,
"sst": 0,
"cola": 0,
"mnli": 0,
"qnli": 0,
"mrpc": 0,
"rte": 0,
"qqp": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.13570887035633056,
"acc_stderr": 0.009433577908567345
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"crows_pairs_english_autre": {
"likelihood_difference": 11.426136363636363,
"likelihood_difference_stderr": 4.353329595154678,
"pct_stereotype": 0.36363636363636365,
"pct_stereotype_stderr": 0.15212000482437738
},
"crows_pairs_french_age": {
"likelihood_difference": 13.10138888888889,
"likelihood_difference_stderr": 1.1200506582638412,
"pct_stereotype": 0.3888888888888889,
"pct_stereotype_stderr": 0.05167468693203863
},
"crows_pairs_french_disability": {
"likelihood_difference": 17.323863636363637,
"likelihood_difference_stderr": 1.824043354324447,
"pct_stereotype": 0.4090909090909091,
"pct_stereotype_stderr": 0.060983672113630656
},
"ethics_utilitarianism": {
"acc": 0.5245424292845258,
"acc_stderr": 0.007202929002919329
},
"ethics_deontology": {
"acc": 0.503337041156841,
"acc_stderr": 0.008338940677034744,
"em": 0.0011123470522803114
},
"ethics_cm": {
"acc": 0.5173745173745173,
"acc_stderr": 0.008018036537975452
},
"crows_pairs_english_age": {
"likelihood_difference": 7.710164835164835,
"likelihood_difference_stderr": 0.936561657229967,
"pct_stereotype": 0.5604395604395604,
"pct_stereotype_stderr": 0.0523181569856619
},
"crows_pairs_french_autre": {
"likelihood_difference": 9.942307692307692,
"likelihood_difference_stderr": 3.1484255128649896,
"pct_stereotype": 0.5384615384615384,
"pct_stereotype_stderr": 0.14390989949130545
},
"crows_pairs_english_gender": {
"likelihood_difference": 8.67578125,
"likelihood_difference_stderr": 0.6549450667276699,
"pct_stereotype": 0.584375,
"pct_stereotype_stderr": 0.02759315140230172
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 12.57986111111111,
"likelihood_difference_stderr": 1.5739147906459943,
"pct_stereotype": 0.5277777777777778,
"pct_stereotype_stderr": 0.05924743948371486
},
"crows_pairs_english_religion": {
"likelihood_difference": 11.8125,
"likelihood_difference_stderr": 1.175150775782876,
"pct_stereotype": 0.4954954954954955,
"pct_stereotype_stderr": 0.047671194793956616
},
"crows_pairs_english_race_color": {
"likelihood_difference": 9.858390748031496,
"likelihood_difference_stderr": 0.5056938997647007,
"pct_stereotype": 0.5019685039370079,
"pct_stereotype_stderr": 0.02220560748841351
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 12.791533801020408,
"likelihood_difference_stderr": 0.9369927660413013,
"pct_stereotype": 0.35714285714285715,
"pct_stereotype_stderr": 0.03431317581537576
},
"toxigen": {
"acc": 0.42659574468085104,
"acc_stderr": 0.01614008877637632,
"acc_norm": 0.4319148936170213,
"acc_norm_stderr": 0.016164899004911828
},
"ethics_justice": {
"acc": 0.4992603550295858,
"acc_stderr": 0.009617152578791647,
"em": 0.0014792899408284023
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 10.72244623655914,
"likelihood_difference_stderr": 1.1561263889540778,
"pct_stereotype": 0.5483870967741935,
"pct_stereotype_stderr": 0.05188393075201662
},
"crows_pairs_french_nationality": {
"likelihood_difference": 16.33102766798419,
"likelihood_difference_stderr": 0.9224360930325354,
"pct_stereotype": 0.31620553359683795,
"pct_stereotype_stderr": 0.029291880485542005
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 11.222368421052632,
"likelihood_difference_stderr": 0.7806572774635993,
"pct_stereotype": 0.5052631578947369,
"pct_stereotype_stderr": 0.036367633377878815
},
"crows_pairs_french_race_color": {
"likelihood_difference": 11.927445652173914,
"likelihood_difference_stderr": 0.5028450572837085,
"pct_stereotype": 0.35,
"pct_stereotype_stderr": 0.022263034418628928
},
"crows_pairs_english_nationality": {
"likelihood_difference": 11.848668981481481,
"likelihood_difference_stderr": 0.8342534014656857,
"pct_stereotype": 0.38425925925925924,
"pct_stereotype_stderr": 0.03317354514310742
},
"ethics_virtue": {
"acc": 0.20321608040201006,
"acc_stderr": 0.005705535674037668,
"em": 0.0
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 7.529513888888889,
"likelihood_difference_stderr": 0.8793312801173977,
"pct_stereotype": 0.4722222222222222,
"pct_stereotype_stderr": 0.05924743948371486
},
"ethics_utilitarianism_original": {
"acc": 0.9806572379367721,
"acc_stderr": 0.0019864644750587196
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 17.554945054945055,
"likelihood_difference_stderr": 1.1803100062671743,
"pct_stereotype": 0.7802197802197802,
"pct_stereotype_stderr": 0.043649726328985346
},
"crows_pairs_french_religion": {
"likelihood_difference": 11.192391304347826,
"likelihood_difference_stderr": 1.0866295680081195,
"pct_stereotype": 0.591304347826087,
"pct_stereotype_stderr": 0.04604188749503789
},
"crows_pairs_french_gender": {
"likelihood_difference": 10.791471962616823,
"likelihood_difference_stderr": 0.6767399211366819,
"pct_stereotype": 0.514018691588785,
"pct_stereotype_stderr": 0.027939861549302374
},
"crows_pairs_english_disability": {
"likelihood_difference": 12.978846153846154,
"likelihood_difference_stderr": 1.8287537323468364,
"pct_stereotype": 0.35384615384615387,
"pct_stereotype_stderr": 0.05977027026123098
}
},
"versions": {
"crows_pairs_english_autre": 0,
"crows_pairs_french_age": 0,
"crows_pairs_french_disability": 0,
"ethics_utilitarianism": 0,
"ethics_deontology": 0,
"ethics_cm": 0,
"crows_pairs_english_age": 0,
"crows_pairs_french_autre": 0,
"crows_pairs_english_gender": 0,
"crows_pairs_french_physical_appearance": 0,
"crows_pairs_english_religion": 0,
"crows_pairs_english_race_color": 0,
"crows_pairs_french_socioeconomic": 0,
"toxigen": 0,
"ethics_justice": 0,
"crows_pairs_english_sexual_orientation": 0,
"crows_pairs_french_nationality": 0,
"crows_pairs_english_socioeconomic": 0,
"crows_pairs_french_race_color": 0,
"crows_pairs_english_nationality": 0,
"ethics_virtue": 0,
"crows_pairs_english_physical_appearance": 0,
"ethics_utilitarianism_original": 0,
"crows_pairs_french_sexual_orientation": 0,
"crows_pairs_french_religion": 0,
"crows_pairs_french_gender": 0,
"crows_pairs_english_disability": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"lambada_openai": {
"ppl": 1279051.053451683,
"ppl_stderr": 60995.62964377304,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_mt_de": {
"ppl": 1310285.4433720284,
"ppl_stderr": 71395.90633942866,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_mt_it": {
"ppl": 4091504.352954044,
"ppl_stderr": 218020.965277226,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_standard": {
"ppl": 1409047.9981006894,
"ppl_stderr": 47832.883755899915,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_standard_cloze": {
"ppl": 4235345.031433833,
"ppl_stderr": 132892.5654001927,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_mt_fr": {
"ppl": 2461448.491005768,
"ppl_stderr": 128013.98724687536,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_cloze": {
"ppl": 204515.38946166556,
"ppl_stderr": 9705.341358126625,
"acc": 0.00019406171162429653,
"acc_stderr": 0.00019406171162430135
},
"lambada_openai_mt_en": {
"ppl": 1279051.053451683,
"ppl_stderr": 60995.62964377304,
"acc": 0.0,
"acc_stderr": 0.0
},
"lambada_openai_mt_es": {
"ppl": 1980241.7718905837,
"ppl_stderr": 101614.2034914904,
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"lambada_openai": 0,
"lambada_openai_mt_de": 0,
"lambada_openai_mt_it": 0,
"lambada_standard": 0,
"lambada_standard_cloze": 0,
"lambada_openai_mt_fr": 0,
"lambada_openai_cloze": 0,
"lambada_openai_mt_en": 0,
"lambada_openai_mt_es": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"math_prealgebra": {
"acc": 0.02870264064293915,
"acc_stderr": 0.0056607946989983855
},
"math_num_theory": {
"acc": 0.014814814814814815,
"acc_stderr": 0.005203704987512651
},
"drop": {
"em": 0.0388003355704698,
"em_stderr": 0.0019777172311177993,
"f1": 0.13990771812080444,
"f1_stderr": 0.002512880034517493
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_intermediate_algebra": {
"acc": 0.012181616832779624,
"acc_stderr": 0.0036524791938863576
},
"math_algebra": {
"acc": 0.018534119629317607,
"acc_stderr": 0.003916347676363957
},
"math_counting_and_prob": {
"acc": 0.014767932489451477,
"acc_stderr": 0.0055462385896684775
},
"math_geometry": {
"acc": 0.012526096033402923,
"acc_stderr": 0.005086941389677977
},
"math_precalc": {
"acc": 0.01098901098901099,
"acc_stderr": 0.004465618427331416
},
"mathqa": {
"acc": 0.28442211055276384,
"acc_stderr": 0.008258681628795297,
"acc_norm": 0.28676716917922945,
"acc_norm_stderr": 0.00827905882129993
},
"math_asdiv": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"math_prealgebra": 1,
"math_num_theory": 1,
"drop": 1,
"mathqa": 0,
"gsm8k": 0,
"math_intermediate_algebra": 1,
"math_algebra": 1,
"math_counting_and_prob": 1,
"math_geometry": 1,
"math_precalc": 1,
"math_asdiv": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"math_prealgebra": {
"acc": 0.001148105625717566,
"acc_stderr": 0.0011481056257175704
},
"drop": {
"em": 0.01709312080536913,
"em_stderr": 0.001327414384722433,
"f1": 0.024450503355704672,
"f1_stderr": 0.001413124400630544
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_counting_and_prob": {
"acc": 0.002109704641350211,
"acc_stderr": 0.0021097046413502104
},
"math_num_theory": {
"acc": 0.001851851851851852,
"acc_stderr": 0.0018518518518518502
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_geometry": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"mathqa": {
"acc": 0.2998324958123953,
"acc_stderr": 0.008387661895516162,
"acc_norm": 0.3035175879396985,
"acc_norm_stderr": 0.008416811454701563
}
},
"versions": {
"math_prealgebra": 1,
"drop": 1,
"mathqa": 0,
"math_intermediate_algebra": 1,
"math_counting_and_prob": 1,
"math_num_theory": 1,
"gsm8k": 0,
"math_geometry": 1,
"math_algebra": 1,
"math_precalc": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"hendrycksTest-college_biology": {
"acc": 0.4583333333333333,
"acc_stderr": 0.04166666666666665,
"acc_norm": 0.3263888888888889,
"acc_norm_stderr": 0.03921067198982266
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.46037735849056605,
"acc_stderr": 0.030676096599389188,
"acc_norm": 0.3849056603773585,
"acc_norm_stderr": 0.029946498567699948
},
"hendrycksTest-high_school_european_history": {
"acc": 0.5272727272727272,
"acc_stderr": 0.03898531605579418,
"acc_norm": 0.49696969696969695,
"acc_norm_stderr": 0.03904272341431855
},
"hendrycksTest-high_school_psychology": {
"acc": 0.6073394495412844,
"acc_stderr": 0.02093750516120109,
"acc_norm": 0.3688073394495413,
"acc_norm_stderr": 0.020686227560729537
},
"hendrycksTest-business_ethics": {
"acc": 0.53,
"acc_stderr": 0.05016135580465919,
"acc_norm": 0.44,
"acc_norm_stderr": 0.04988876515698589
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.5854922279792746,
"acc_stderr": 0.035553003195576686,
"acc_norm": 0.38860103626943004,
"acc_norm_stderr": 0.03517739796373132
},
"hendrycksTest-security_studies": {
"acc": 0.45714285714285713,
"acc_stderr": 0.03189141832421396,
"acc_norm": 0.37551020408163266,
"acc_norm_stderr": 0.03100120903989484
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.3769230769230769,
"acc_stderr": 0.024570975364225995,
"acc_norm": 0.31794871794871793,
"acc_norm_stderr": 0.02361088430892786
},
"hendrycksTest-sociology": {
"acc": 0.582089552238806,
"acc_stderr": 0.034875586404620636,
"acc_norm": 0.4577114427860697,
"acc_norm_stderr": 0.035228658640995975
},
"hendrycksTest-college_mathematics": {
"acc": 0.29,
"acc_stderr": 0.04560480215720683,
"acc_norm": 0.34,
"acc_norm_stderr": 0.04760952285695235
},
"hendrycksTest-professional_accounting": {
"acc": 0.2978723404255319,
"acc_stderr": 0.02728160834446941,
"acc_norm": 0.2801418439716312,
"acc_norm_stderr": 0.02678917235114023
},
"hendrycksTest-anatomy": {
"acc": 0.42962962962962964,
"acc_stderr": 0.04276349494376599,
"acc_norm": 0.2962962962962963,
"acc_norm_stderr": 0.03944624162501116
},
"hendrycksTest-professional_psychology": {
"acc": 0.42320261437908496,
"acc_stderr": 0.019987809769482067,
"acc_norm": 0.3300653594771242,
"acc_norm_stderr": 0.01902372616072456
},
"hendrycksTest-moral_scenarios": {
"acc": 0.28268156424581004,
"acc_stderr": 0.015060381730018082,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-conceptual_physics": {
"acc": 0.42127659574468085,
"acc_stderr": 0.03227834510146268,
"acc_norm": 0.2425531914893617,
"acc_norm_stderr": 0.028020226271200217
},
"hendrycksTest-virology": {
"acc": 0.40963855421686746,
"acc_stderr": 0.03828401115079021,
"acc_norm": 0.30120481927710846,
"acc_norm_stderr": 0.035716092300534796
},
"hendrycksTest-world_religions": {
"acc": 0.7426900584795322,
"acc_stderr": 0.03352799844161865,
"acc_norm": 0.6491228070175439,
"acc_norm_stderr": 0.03660298834049162
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.49,
"acc_stderr": 0.05024183937956911,
"acc_norm": 0.41,
"acc_norm_stderr": 0.049431107042371025
},
"hendrycksTest-abstract_algebra": {
"acc": 0.32,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-medical_genetics": {
"acc": 0.49,
"acc_stderr": 0.05024183937956911,
"acc_norm": 0.48,
"acc_norm_stderr": 0.050211673156867795
},
"hendrycksTest-nutrition": {
"acc": 0.45098039215686275,
"acc_stderr": 0.02849199358617156,
"acc_norm": 0.4673202614379085,
"acc_norm_stderr": 0.02856869975222588
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.36772486772486773,
"acc_stderr": 0.024833839825562424,
"acc_norm": 0.328042328042328,
"acc_norm_stderr": 0.024180497164376907
},
"hendrycksTest-philosophy": {
"acc": 0.45980707395498394,
"acc_stderr": 0.028306190403305696,
"acc_norm": 0.3858520900321543,
"acc_norm_stderr": 0.02764814959975146
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.42016806722689076,
"acc_stderr": 0.03206183783236152,
"acc_norm": 0.40756302521008403,
"acc_norm_stderr": 0.031918633744784645
},
"hendrycksTest-management": {
"acc": 0.6407766990291263,
"acc_stderr": 0.04750458399041696,
"acc_norm": 0.4174757281553398,
"acc_norm_stderr": 0.048828405482122375
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.68,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.52,
"acc_norm_stderr": 0.050211673156867795
},
"hendrycksTest-international_law": {
"acc": 0.5619834710743802,
"acc_stderr": 0.04529146804435792,
"acc_norm": 0.6033057851239669,
"acc_norm_stderr": 0.044658697805310094
},
"hendrycksTest-college_chemistry": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.26666666666666666,
"acc_stderr": 0.026962424325073817,
"acc_norm": 0.31851851851851853,
"acc_norm_stderr": 0.028406533090608463
},
"hendrycksTest-high_school_world_history": {
"acc": 0.4978902953586498,
"acc_stderr": 0.032546938018020076,
"acc_norm": 0.42616033755274263,
"acc_norm_stderr": 0.03219035703131774
},
"hendrycksTest-human_sexuality": {
"acc": 0.549618320610687,
"acc_stderr": 0.04363643698524779,
"acc_norm": 0.3969465648854962,
"acc_norm_stderr": 0.04291135671009224
},
"hendrycksTest-college_computer_science": {
"acc": 0.33,
"acc_stderr": 0.047258156262526045,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421276
},
"hendrycksTest-college_medicine": {
"acc": 0.4277456647398844,
"acc_stderr": 0.037724468575180255,
"acc_norm": 0.30057803468208094,
"acc_norm_stderr": 0.0349610148119118
},
"hendrycksTest-formal_logic": {
"acc": 0.3253968253968254,
"acc_stderr": 0.041905964388711366,
"acc_norm": 0.3412698412698413,
"acc_norm_stderr": 0.04240799327574925
},
"hendrycksTest-high_school_physics": {
"acc": 0.271523178807947,
"acc_stderr": 0.03631329803969653,
"acc_norm": 0.25165562913907286,
"acc_norm_stderr": 0.035433042343899844
},
"hendrycksTest-marketing": {
"acc": 0.7264957264957265,
"acc_stderr": 0.029202540153431173,
"acc_norm": 0.6153846153846154,
"acc_norm_stderr": 0.03187195347942466
},
"hendrycksTest-jurisprudence": {
"acc": 0.48148148148148145,
"acc_stderr": 0.04830366024635331,
"acc_norm": 0.5,
"acc_norm_stderr": 0.04833682445228318
},
"hendrycksTest-computer_security": {
"acc": 0.57,
"acc_stderr": 0.049756985195624284,
"acc_norm": 0.44,
"acc_norm_stderr": 0.04988876515698589
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.3103448275862069,
"acc_stderr": 0.03255086769970103,
"acc_norm": 0.32019704433497537,
"acc_norm_stderr": 0.032826493853041504
},
"hendrycksTest-prehistory": {
"acc": 0.49691358024691357,
"acc_stderr": 0.02782021415859437,
"acc_norm": 0.345679012345679,
"acc_norm_stderr": 0.026462487777001876
},
"hendrycksTest-machine_learning": {
"acc": 0.2857142857142857,
"acc_stderr": 0.04287858751340455,
"acc_norm": 0.29464285714285715,
"acc_norm_stderr": 0.043270409325787296
},
"hendrycksTest-professional_medicine": {
"acc": 0.39338235294117646,
"acc_stderr": 0.02967428828131118,
"acc_norm": 0.33088235294117646,
"acc_norm_stderr": 0.028582709753898452
},
"hendrycksTest-global_facts": {
"acc": 0.34,
"acc_stderr": 0.04760952285695235,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"hendrycksTest-high_school_us_history": {
"acc": 0.5245098039215687,
"acc_stderr": 0.03505093194348798,
"acc_norm": 0.37254901960784315,
"acc_norm_stderr": 0.033933885849584046
},
"hendrycksTest-high_school_geography": {
"acc": 0.5757575757575758,
"acc_stderr": 0.03521224908841586,
"acc_norm": 0.42424242424242425,
"acc_norm_stderr": 0.03521224908841583
},
"hendrycksTest-human_aging": {
"acc": 0.5739910313901345,
"acc_stderr": 0.033188332862172806,
"acc_norm": 0.336322869955157,
"acc_norm_stderr": 0.03170882426845501
},
"hendrycksTest-high_school_biology": {
"acc": 0.4967741935483871,
"acc_stderr": 0.028443414226438316,
"acc_norm": 0.36129032258064514,
"acc_norm_stderr": 0.027327548447957553
},
"hendrycksTest-public_relations": {
"acc": 0.5454545454545454,
"acc_stderr": 0.04769300568972744,
"acc_norm": 0.2909090909090909,
"acc_norm_stderr": 0.04350271442923243
},
"hendrycksTest-professional_law": {
"acc": 0.30378096479791394,
"acc_stderr": 0.011745787720472483,
"acc_norm": 0.3089960886571056,
"acc_norm_stderr": 0.011801729777239246
},
"hendrycksTest-electrical_engineering": {
"acc": 0.41379310344827586,
"acc_stderr": 0.041042692118062316,
"acc_norm": 0.3448275862068966,
"acc_norm_stderr": 0.039609335494512087
},
"hendrycksTest-logical_fallacies": {
"acc": 0.4539877300613497,
"acc_stderr": 0.0391170190467718,
"acc_norm": 0.36809815950920244,
"acc_norm_stderr": 0.03789213935838396
},
"hendrycksTest-moral_disputes": {
"acc": 0.4479768786127168,
"acc_stderr": 0.026772990653361816,
"acc_norm": 0.3815028901734104,
"acc_norm_stderr": 0.0261521986197268
},
"hendrycksTest-high_school_statistics": {
"acc": 0.38425925925925924,
"acc_stderr": 0.03317354514310742,
"acc_norm": 0.375,
"acc_norm_stderr": 0.033016908987210894
},
"hendrycksTest-college_physics": {
"acc": 0.28431372549019607,
"acc_stderr": 0.04488482852329017,
"acc_norm": 0.35294117647058826,
"acc_norm_stderr": 0.04755129616062947
},
"hendrycksTest-econometrics": {
"acc": 0.2719298245614035,
"acc_stderr": 0.04185774424022056,
"acc_norm": 0.2631578947368421,
"acc_norm_stderr": 0.041424397194893624
},
"hendrycksTest-miscellaneous": {
"acc": 0.6960408684546615,
"acc_stderr": 0.016448321686769043,
"acc_norm": 0.48531289910600256,
"acc_norm_stderr": 0.01787224802442912
},
"hendrycksTest-astronomy": {
"acc": 0.48026315789473684,
"acc_stderr": 0.04065771002562603,
"acc_norm": 0.48026315789473684,
"acc_norm_stderr": 0.040657710025626036
}
},
"versions": {
"hendrycksTest-college_biology": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-management": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-astronomy": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"pawsx_fr": {
"acc": 0.545,
"acc_stderr": 0.011137752231145222
},
"pawsx_en": {
"acc": 0.537,
"acc_stderr": 0.011152474561478174
},
"pawsx_ko": {
"acc": 0.4705,
"acc_stderr": 0.011163654804511664
},
"pawsx_ja": {
"acc": 0.45,
"acc_stderr": 0.011127079848413735
},
"pawsx_es": {
"acc": 0.521,
"acc_stderr": 0.011173268141438304
},
"pawsx_de": {
"acc": 0.5295,
"acc_stderr": 0.011163654804511655
},
"pawsx_zh": {
"acc": 0.452,
"acc_stderr": 0.01113148485052578
}
},
"versions": {
"pawsx_fr": 0,
"pawsx_en": 0,
"pawsx_ko": 0,
"pawsx_ja": 0,
"pawsx_es": 0,
"pawsx_de": 0,
"pawsx_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"triviaqa": {
"acc": 0.0,
"acc_stderr": 0.0
},
"headqa_es": {
"acc": 0.3056163384390955,
"acc_stderr": 0.008799003959214539,
"acc_norm": 0.3515681983953319,
"acc_norm_stderr": 0.009119739372039878
},
"logiqa": {
"acc": 0.2642089093701997,
"acc_stderr": 0.017293954549744514,
"acc_norm": 0.3210445468509985,
"acc_norm_stderr": 0.018312456701476108
},
"headqa_en": {
"acc": 0.34427425237053244,
"acc_stderr": 0.009075255747504299,
"acc_norm": 0.38584974471188915,
"acc_norm_stderr": 0.009298050684004381
},
"truthfulqa_mc": {
"mc1": 0.2582619339045288,
"mc1_stderr": 0.0153218216884762,
"mc2": 0.39884734031519786,
"mc2_stderr": 0.013703865869126058
},
"squad2": {
"exact": 16.440663690726858,
"f1": 24.060945088960178,
"HasAns_exact": 21.086369770580298,
"HasAns_f1": 36.34878560074651,
"NoAns_exact": 11.808242220353238,
"NoAns_f1": 11.808242220353238,
"best_exact": 50.07159100480081,
"best_f1": 50.073888042388
},
"webqs": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"triviaqa": 1,
"headqa_es": 0,
"logiqa": 0,
"headqa_en": 0,
"truthfulqa_mc": 1,
"squad2": 1,
"webqs": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"coqa": {
"f1": 0.7704068983762044,
"f1_stderr": 0.014191975492335083,
"em": 0.637,
"em_stderr": 0.01847461201879917
},
"drop": {
"em": 0.035864093959731544,
"em_stderr": 0.0019043146639119552,
"f1": 0.13376153523489834,
"f1_stderr": 0.002439665460318613
},
"race": {
"acc": 0.39330143540669854,
"acc_stderr": 0.01511816218614914
}
},
"versions": {
"coqa": 1,
"drop": 1,
"race": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"boolq": {
"acc": 0.6844036697247706,
"acc_stderr": 0.008128579858785895
},
"wic": {
"acc": 0.49843260188087773,
"acc_stderr": 0.019810623954060382
},
"copa": {
"acc": 0.9,
"acc_stderr": 0.030151134457776348
},
"wsc": {
"acc": 0.3557692307692308,
"acc_stderr": 0.04717221961050337
},
"cb": {
"acc": 0.48214285714285715,
"acc_stderr": 0.0673769750864465,
"f1": 0.3881876266167991
},
"record": {
"f1": 0.9231828571428571,
"f1_stderr": 0.0026119602574627677,
"em": 0.9154,
"em_stderr": 0.002782994521347745
},
"multirc": {
"acc": 0.015739769150052464,
"acc_stderr": 0.00403399795659578
}
},
"versions": {
"boolq": 1,
"wic": 0,
"copa": 0,
"wsc": 0,
"cb": 1,
"record": 0,
"multirc": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/13B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}