Commit 0e849758 authored by Julen Etxaniz
Browse files

update task table with missing tasks

parent 8a707701
...@@ -17,6 +17,27 @@ ...@@ -17,6 +17,27 @@
|arithmetic_4ds | |✓ | | 2000|acc | |arithmetic_4ds | |✓ | | 2000|acc |
|arithmetic_5da | |✓ | | 2000|acc | |arithmetic_5da | |✓ | | 2000|acc |
|arithmetic_5ds | |✓ | | 2000|acc | |arithmetic_5ds | |✓ | | 2000|acc |
|bigbench_causal_judgement | | |✓ | 190|multiple_choice_grade, exact_str_match |
|bigbench_date_understanding | | |✓ | 369|multiple_choice_grade, exact_str_match |
|bigbench_disambiguation_qa | | |✓ | 258|multiple_choice_grade, exact_str_match |
|bigbench_dyck_languages | | |✓ | 1000|multiple_choice_grade, exact_str_match |
|bigbench_formal_fallacies_syllogisms_negation | | |✓ | 14200|multiple_choice_grade, exact_str_match |
|bigbench_geometric_shapes | | |✓ | 359|multiple_choice_grade, exact_str_match |
|bigbench_hyperbaton | | |✓ | 50000|multiple_choice_grade, exact_str_match |
|bigbench_logical_deduction_five_objects | | |✓ | 500|multiple_choice_grade, exact_str_match |
|bigbench_logical_deduction_seven_objects | | |✓ | 700|multiple_choice_grade, exact_str_match |
|bigbench_logical_deduction_three_objects | | |✓ | 300|multiple_choice_grade, exact_str_match |
|bigbench_movie_recommendation | | |✓ | 500|multiple_choice_grade, exact_str_match |
|bigbench_navigate | | |✓ | 1000|multiple_choice_grade, exact_str_match |
|bigbench_reasoning_about_colored_objects | | |✓ | 2000|multiple_choice_grade, exact_str_match |
|bigbench_ruin_names | | |✓ | 448|multiple_choice_grade, exact_str_match |
|bigbench_salient_translation_error_detection | | |✓ | 998|multiple_choice_grade, exact_str_match |
|bigbench_snarks | | |✓ | 181|multiple_choice_grade, exact_str_match |
|bigbench_sports_understanding | | |✓ | 986|multiple_choice_grade, exact_str_match |
|bigbench_temporal_sequences | | |✓ | 1000|multiple_choice_grade, exact_str_match |
|bigbench_tracking_shuffled_objects_five_objects | | |✓ | 1250|multiple_choice_grade, exact_str_match |
|bigbench_tracking_shuffled_objects_seven_objects | | |✓ | 1750|multiple_choice_grade, exact_str_match |
|bigbench_tracking_shuffled_objects_three_objects | | |✓ | 300|multiple_choice_grade, exact_str_match |
|blimp_adjunct_island | |✓ | | 1000|acc | |blimp_adjunct_island | |✓ | | 1000|acc |
|blimp_anaphor_gender_agreement | |✓ | | 1000|acc | |blimp_anaphor_gender_agreement | |✓ | | 1000|acc |
|blimp_anaphor_number_agreement | |✓ | | 1000|acc | |blimp_anaphor_number_agreement | |✓ | | 1000|acc |
...@@ -89,6 +110,28 @@ ...@@ -89,6 +110,28 @@
|cola |✓ |✓ | | 1043|mcc | |cola |✓ |✓ | | 1043|mcc |
|copa |✓ |✓ | | 100|acc | |copa |✓ |✓ | | 100|acc |
|coqa |✓ |✓ | | 500|f1, em | |coqa |✓ |✓ | | 500|f1, em |
|crows_pairs_english | |✓ | | 1677|likelihood_difference, pct_stereotype |
|crows_pairs_english_age | |✓ | | 91|likelihood_difference, pct_stereotype |
|crows_pairs_english_autre | |✓ | | 11|likelihood_difference, pct_stereotype |
|crows_pairs_english_disability | |✓ | | 65|likelihood_difference, pct_stereotype |
|crows_pairs_english_gender | |✓ | | 320|likelihood_difference, pct_stereotype |
|crows_pairs_english_nationality | |✓ | | 216|likelihood_difference, pct_stereotype |
|crows_pairs_english_physical_appearance | |✓ | | 72|likelihood_difference, pct_stereotype |
|crows_pairs_english_race_color | |✓ | | 508|likelihood_difference, pct_stereotype |
|crows_pairs_english_religion | |✓ | | 111|likelihood_difference, pct_stereotype |
|crows_pairs_english_sexual_orientation | |✓ | | 93|likelihood_difference, pct_stereotype |
|crows_pairs_english_socioeconomic | |✓ | | 190|likelihood_difference, pct_stereotype |
|crows_pairs_french | |✓ | | 1677|likelihood_difference, pct_stereotype |
|crows_pairs_french_age | |✓ | | 90|likelihood_difference, pct_stereotype |
|crows_pairs_french_autre | |✓ | | 13|likelihood_difference, pct_stereotype |
|crows_pairs_french_disability | |✓ | | 66|likelihood_difference, pct_stereotype |
|crows_pairs_french_gender | |✓ | | 321|likelihood_difference, pct_stereotype |
|crows_pairs_french_nationality | |✓ | | 253|likelihood_difference, pct_stereotype |
|crows_pairs_french_physical_appearance | |✓ | | 72|likelihood_difference, pct_stereotype |
|crows_pairs_french_race_color | |✓ | | 460|likelihood_difference, pct_stereotype |
|crows_pairs_french_religion | |✓ | | 115|likelihood_difference, pct_stereotype |
|crows_pairs_french_sexual_orientation | |✓ | | 91|likelihood_difference, pct_stereotype |
|crows_pairs_french_socioeconomic | |✓ | | 196|likelihood_difference, pct_stereotype |
|cycle_letters | |✓ | | 10000|acc | |cycle_letters | |✓ | | 10000|acc |
|drop |✓ |✓ | | 9536|em, f1 | |drop |✓ |✓ | | 9536|em, f1 |
|ethics_cm |✓ | |✓ | 3885|acc | |ethics_cm |✓ | |✓ | 3885|acc |
...@@ -161,13 +204,13 @@ ...@@ -161,13 +204,13 @@
|hendrycksTest-world_religions | |✓ |✓ | 171|acc, acc_norm | |hendrycksTest-world_religions | |✓ |✓ | 171|acc, acc_norm |
|iwslt17-ar-en | | |✓ | 1460|bleu, chrf, ter | |iwslt17-ar-en | | |✓ | 1460|bleu, chrf, ter |
|iwslt17-en-ar | | |✓ | 1460|bleu, chrf, ter | |iwslt17-en-ar | | |✓ | 1460|bleu, chrf, ter |
|lambada_openai | | | | 5153|ppl, acc | |lambada_openai | | | | 5153|ppl, acc |
|lambada_openai_cloze | | | | 5153|ppl, acc | |lambada_openai_cloze | | | | 5153|ppl, acc |
|lambada_openai_mt_de | | | | 5153|ppl, acc | |lambada_openai_mt_de | | | | 5153|ppl, acc |
|lambada_openai_mt_en | | | | 5153|ppl, acc | |lambada_openai_mt_en | | | | 5153|ppl, acc |
|lambada_openai_mt_es | | | | 5153|ppl, acc | |lambada_openai_mt_es | | | | 5153|ppl, acc |
|lambada_openai_mt_fr | | | | 5153|ppl, acc | |lambada_openai_mt_fr | | | | 5153|ppl, acc |
|lambada_openai_mt_it | | | | 5153|ppl, acc | |lambada_openai_mt_it | | | | 5153|ppl, acc |
|lambada_standard | |✓ |✓ | 5153|ppl, acc | |lambada_standard | |✓ |✓ | 5153|ppl, acc |
|lambada_standard_cloze | |✓ |✓ | 5153|ppl, acc | |lambada_standard_cloze | |✓ |✓ | 5153|ppl, acc |
|logiqa |✓ |✓ |✓ | 651|acc, acc_norm | |logiqa |✓ |✓ |✓ | 651|acc, acc_norm |
...@@ -228,6 +271,7 @@ ...@@ -228,6 +271,7 @@
|squad2 |✓ |✓ | | 11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1 | |squad2 |✓ |✓ | | 11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1 |
|sst |✓ |✓ | | 872|acc | |sst |✓ |✓ | | 872|acc |
|swag |✓ |✓ | | 20006|acc, acc_norm | |swag |✓ |✓ | | 20006|acc, acc_norm |
|toxigen |✓ | |✓ | 940|acc, acc_norm |
|triviaqa |✓ |✓ | | 11313|acc | |triviaqa |✓ |✓ | | 11313|acc |
|truthfulqa_gen | |✓ | | 817|bleurt_max, bleurt_acc, bleurt_diff, bleu_max, bleu_acc, bleu_diff, rouge1_max, rouge1_acc, rouge1_diff, rouge2_max, rouge2_acc, rouge2_diff, rougeL_max, rougeL_acc, rougeL_diff| |truthfulqa_gen | |✓ | | 817|bleurt_max, bleurt_acc, bleurt_diff, bleu_max, bleu_acc, bleu_diff, rouge1_max, rouge1_acc, rouge1_diff, rouge2_max, rouge2_acc, rouge2_diff, rougeL_max, rougeL_acc, rougeL_diff|
|truthfulqa_mc | |✓ | | 817|mc1, mc2 | |truthfulqa_mc | |✓ | | 817|mc1, mc2 |
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment