Commit 3c0fa5a6 authored by researcher2's avatar researcher2
Browse files

Merge branch 'researcher2' of https://github.com/EleutherAI/lm-evaluation-harness into researcher2

parents f495bfb4 57569bbb
......@@ -9,7 +9,7 @@ This project provides a unified framework to test autoregressive language models
Features:
- 200+ tasks implemented
- Support for GPT-2, GPT-3, GPT-Neo, GPT-NeoX, and GPT-J, with flexible tokenization-agnostic interface
- Task versioning to ensure reproducibility
......@@ -51,6 +51,15 @@ python main.py \
--tasks lambada,hellaswag
```
And if you want to verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:
```bash
python main.py \
--model gpt3 \
--model_args engine=davinci \
--tasks lambada,hellaswag \
--check_integrity
```
To evaluate mesh-transformer-jax models that are not available on HF, please invoke eval harness through [this script](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py).
## Implementing new tasks
......@@ -90,257 +99,269 @@ To implement a new task in eval harness, see [this guide](./docs/task_guide.md).
### Full Task List
| Task Name |Train|Val|Test|Val/Test Docs| Metrics |
|---------------------------------------------------------|-----|---|----|------------:|------------------------------------------------------------------------------|
|cola |✓ |✓ | | 1043|mcc |
|mnli |✓ |✓ | | 9815|acc |
|mnli_mismatched |✓ |✓ | | 9832|acc |
|mrpc |✓ |✓ | | 408|acc, f1 |
|rte |✓ |✓ | | 277|acc |
|qnli |✓ |✓ | | 5463|acc |
|qqp |✓ |✓ | | 40430|acc, f1 |
|sst |✓ |✓ | | 872|acc |
|wnli |✓ |✓ | | 71|acc |
|boolq |✓ |✓ | | 3270|acc |
|cb |✓ |✓ | | 56|acc, f1 |
|copa |✓ |✓ | | 100|acc |
|multirc |✓ |✓ | | 4848|acc |
|record |✓ |✓ | | 10000|f1, em |
|wic |✓ |✓ | | 638|acc |
|wsc |✓ |✓ | | 104|acc |
|coqa |✓ |✓ | | 500|f1, em |
|drop |✓ |✓ | | 9536|em, f1 |
|lambada | |✓ | | 5153|ppl, acc |
|lambada_cloze | |✓ | | 5153|ppl, acc |
|wikitext | |✓ |✓ | 62|word_perplexity, byte_perplexity, bits_per_byte |
|piqa |✓ |✓ | | 1838|acc, acc_norm |
|prost | | |✓ | 18736|acc, acc_norm |
|pubmedqa | | |✓ | 1000|acc |
|sciq |✓ |✓ |✓ | 1000|acc, acc_norm |
|qa4mre_2011 | | |✓ | 120|acc, acc_norm |
|qa4mre_2012 | | |✓ | 160|acc, acc_norm |
|qa4mre_2013 | | |✓ | 284|acc, acc_norm |
|triviaqa |✓ |✓ | | 11313|acc |
|arc_easy |✓ |✓ |✓ | 2376|acc, acc_norm |
|arc_challenge |✓ |✓ |✓ | 1172|acc, acc_norm |
|logiqa |✓ |✓ |✓ | 651|acc, acc_norm |
|hellaswag |✓ |✓ | | 10042|acc, acc_norm |
|openbookqa |✓ |✓ |✓ | 500|acc, acc_norm |
|squad2 |✓ |✓ | | 11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1|
|race |✓ |✓ |✓ | 1045|acc |
|mathqa |✓ |✓ |✓ | 2985|acc, acc_norm |
|headqa_es |✓ |✓ |✓ | 2742|acc, acc_norm |
|headqa_en |✓ |✓ |✓ | 2742|acc, acc_norm |
|webqs |✓ | |✓ | 2032|acc |
|wsc273 | | |✓ | 273|acc |
|winogrande |✓ |✓ | | 1267|acc |
|anli_r1 |✓ |✓ |✓ | 1000|acc |
|anli_r2 |✓ |✓ |✓ | 1000|acc |
|anli_r3 |✓ |✓ |✓ | 1200|acc |
|ethics_cm |✓ | |✓ | 3885|acc |
|ethics_deontology |✓ | |✓ | 3596|acc, em |
|ethics_justice |✓ | |✓ | 2704|acc, em |
|ethics_utilitarianism_original | | |✓ | 4808|acc |
|ethics_utilitarianism |✓ | |✓ | 4808|acc |
|ethics_virtue |✓ | |✓ | 4975|acc, em |
|math_algebra |✓ | |✓ | 1187|acc |
|math_counting_and_prob |✓ | |✓ | 474|acc |
|math_geometry |✓ | |✓ | 479|acc |
|math_intermediate_algebra |✓ | |✓ | 903|acc |
|math_num_theory |✓ | |✓ | 540|acc |
|math_prealgebra |✓ | |✓ | 871|acc |
|math_precalc |✓ | |✓ | 546|acc |
|arithmetic_2da | |✓ | | 2000|acc |
|arithmetic_2ds | |✓ | | 2000|acc |
|arithmetic_3da | |✓ | | 2000|acc |
|arithmetic_3ds | |✓ | | 2000|acc |
|arithmetic_4da | |✓ | | 2000|acc |
|arithmetic_4ds | |✓ | | 2000|acc |
|arithmetic_5da | |✓ | | 2000|acc |
|arithmetic_5ds | |✓ | | 2000|acc |
|arithmetic_2dm | |✓ | | 2000|acc |
|arithmetic_1dc | |✓ | | 2000|acc |
|hendrycksTest-abstract_algebra |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-anatomy |✓ |✓ |✓ | 135|acc, acc_norm |
|hendrycksTest-astronomy |✓ |✓ |✓ | 152|acc, acc_norm |
|hendrycksTest-business_ethics |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-clinical_knowledge |✓ |✓ |✓ | 265|acc, acc_norm |
|hendrycksTest-college_biology |✓ |✓ |✓ | 144|acc, acc_norm |
|hendrycksTest-college_chemistry |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-college_computer_science |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-college_mathematics |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-college_medicine |✓ |✓ |✓ | 173|acc, acc_norm |
|hendrycksTest-college_physics |✓ |✓ |✓ | 102|acc, acc_norm |
|hendrycksTest-computer_security |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-conceptual_physics |✓ |✓ |✓ | 235|acc, acc_norm |
|hendrycksTest-econometrics |✓ |✓ |✓ | 114|acc, acc_norm |
|hendrycksTest-electrical_engineering |✓ |✓ |✓ | 145|acc, acc_norm |
|hendrycksTest-elementary_mathematics |✓ |✓ |✓ | 378|acc, acc_norm |
|hendrycksTest-formal_logic |✓ |✓ |✓ | 126|acc, acc_norm |
|hendrycksTest-global_facts |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-high_school_biology |✓ |✓ |✓ | 310|acc, acc_norm |
|hendrycksTest-high_school_chemistry |✓ |✓ |✓ | 203|acc, acc_norm |
|hendrycksTest-high_school_computer_science |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-high_school_european_history |✓ |✓ |✓ | 165|acc, acc_norm |
|hendrycksTest-high_school_geography |✓ |✓ |✓ | 198|acc, acc_norm |
|hendrycksTest-high_school_government_and_politics |✓ |✓ |✓ | 193|acc, acc_norm |
|hendrycksTest-high_school_macroeconomics |✓ |✓ |✓ | 390|acc, acc_norm |
|hendrycksTest-high_school_mathematics |✓ |✓ |✓ | 270|acc, acc_norm |
|hendrycksTest-high_school_microeconomics |✓ |✓ |✓ | 238|acc, acc_norm |
|hendrycksTest-high_school_physics |✓ |✓ |✓ | 151|acc, acc_norm |
|hendrycksTest-high_school_psychology |✓ |✓ |✓ | 545|acc, acc_norm |
|hendrycksTest-high_school_statistics |✓ |✓ |✓ | 216|acc, acc_norm |
|hendrycksTest-high_school_us_history |✓ |✓ |✓ | 204|acc, acc_norm |
|hendrycksTest-high_school_world_history |✓ |✓ |✓ | 237|acc, acc_norm |
|hendrycksTest-human_aging |✓ |✓ |✓ | 223|acc, acc_norm |
|hendrycksTest-human_sexuality |✓ |✓ |✓ | 131|acc, acc_norm |
|hendrycksTest-international_law |✓ |✓ |✓ | 121|acc, acc_norm |
|hendrycksTest-jurisprudence |✓ |✓ |✓ | 108|acc, acc_norm |
|hendrycksTest-logical_fallacies |✓ |✓ |✓ | 163|acc, acc_norm |
|hendrycksTest-machine_learning |✓ |✓ |✓ | 112|acc, acc_norm |
|hendrycksTest-management |✓ |✓ |✓ | 103|acc, acc_norm |
|hendrycksTest-marketing |✓ |✓ |✓ | 234|acc, acc_norm |
|hendrycksTest-medical_genetics |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-miscellaneous |✓ |✓ |✓ | 783|acc, acc_norm |
|hendrycksTest-moral_disputes |✓ |✓ |✓ | 346|acc, acc_norm |
|hendrycksTest-moral_scenarios |✓ |✓ |✓ | 895|acc, acc_norm |
|hendrycksTest-nutrition |✓ |✓ |✓ | 306|acc, acc_norm |
|hendrycksTest-philosophy |✓ |✓ |✓ | 311|acc, acc_norm |
|hendrycksTest-prehistory |✓ |✓ |✓ | 324|acc, acc_norm |
|hendrycksTest-professional_accounting |✓ |✓ |✓ | 282|acc, acc_norm |
|hendrycksTest-professional_law |✓ |✓ |✓ | 1534|acc, acc_norm |
|hendrycksTest-professional_medicine |✓ |✓ |✓ | 272|acc, acc_norm |
|hendrycksTest-professional_psychology |✓ |✓ |✓ | 612|acc, acc_norm |
|hendrycksTest-public_relations |✓ |✓ |✓ | 110|acc, acc_norm |
|hendrycksTest-security_studies |✓ |✓ |✓ | 245|acc, acc_norm |
|hendrycksTest-sociology |✓ |✓ |✓ | 201|acc, acc_norm |
|hendrycksTest-us_foreign_policy |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-virology |✓ |✓ |✓ | 166|acc, acc_norm |
|hendrycksTest-world_religions |✓ |✓ |✓ | 171|acc, acc_norm |
|wmt14-en-fr | | |✓ | 3003|bleu, chrf, ter |
|wmt14-fr-en | | |✓ | 3003|bleu, chrf, ter |
|wmt16-en-ro | | |✓ | 1999|bleu, chrf, ter |
|wmt16-ro-en | | |✓ | 1999|bleu, chrf, ter |
|wmt16-de-en | | |✓ | 2999|bleu, chrf, ter |
|wmt16-en-de | | |✓ | 2999|bleu, chrf, ter |
|wmt20-cs-en | | |✓ | 664|bleu, chrf, ter |
|wmt20-de-en | | |✓ | 785|bleu, chrf, ter |
|wmt20-de-fr | | |✓ | 1619|bleu, chrf, ter |
|wmt20-en-cs | | |✓ | 1418|bleu, chrf, ter |
|wmt20-en-de | | |✓ | 1418|bleu, chrf, ter |
|wmt20-en-iu | | |✓ | 2971|bleu, chrf, ter |
|wmt20-en-ja | | |✓ | 1000|bleu, chrf, ter |
|wmt20-en-km | | |✓ | 2320|bleu, chrf, ter |
|wmt20-en-pl | | |✓ | 1000|bleu, chrf, ter |
|wmt20-en-ps | | |✓ | 2719|bleu, chrf, ter |
|wmt20-en-ru | | |✓ | 2002|bleu, chrf, ter |
|wmt20-en-ta | | |✓ | 1000|bleu, chrf, ter |
|wmt20-en-zh | | |✓ | 1418|bleu, chrf, ter |
|wmt20-fr-de | | |✓ | 1619|bleu, chrf, ter |
|wmt20-iu-en | | |✓ | 2971|bleu, chrf, ter |
|wmt20-ja-en | | |✓ | 993|bleu, chrf, ter |
|wmt20-km-en | | |✓ | 2320|bleu, chrf, ter |
|wmt20-pl-en | | |✓ | 1001|bleu, chrf, ter |
|wmt20-ps-en | | |✓ | 2719|bleu, chrf, ter |
|wmt20-ru-en | | |✓ | 991|bleu, chrf, ter |
|wmt20-ta-en | | |✓ | 997|bleu, chrf, ter |
|wmt20-zh-en | | |✓ | 2000|bleu, chrf, ter |
|iwslt17-en-ar | | |✓ | 1460|bleu, chrf, ter |
|iwslt17-ar-en | | |✓ | 1460|bleu, chrf, ter |
|anagrams1 | |✓ | | 10000|acc |
|anagrams2 | |✓ | | 10000|acc |
|cycle_letters | |✓ | | 10000|acc |
|random_insertion | |✓ | | 10000|acc |
|reversed_words | |✓ | | 10000|acc |
|pile_arxiv | |✓ |✓ | 2407|word_perplexity, byte_perplexity, bits_per_byte |
|pile_books3 | |✓ |✓ | 269|word_perplexity, byte_perplexity, bits_per_byte |
|pile_bookcorpus2 | |✓ |✓ | 28|word_perplexity, byte_perplexity, bits_per_byte |
|pile_dm-mathematics | |✓ |✓ | 1922|word_perplexity, byte_perplexity, bits_per_byte |
|pile_enron | |✓ |✓ | 1010|word_perplexity, byte_perplexity, bits_per_byte |
|pile_europarl | |✓ |✓ | 157|word_perplexity, byte_perplexity, bits_per_byte |
|pile_freelaw | |✓ |✓ | 5101|word_perplexity, byte_perplexity, bits_per_byte |
|pile_github | |✓ |✓ | 18195|word_perplexity, byte_perplexity, bits_per_byte |
|pile_gutenberg | |✓ |✓ | 80|word_perplexity, byte_perplexity, bits_per_byte |
|pile_hackernews | |✓ |✓ | 1632|word_perplexity, byte_perplexity, bits_per_byte |
|pile_nih-exporter | |✓ |✓ | 1884|word_perplexity, byte_perplexity, bits_per_byte |
|pile_opensubtitles | |✓ |✓ | 642|word_perplexity, byte_perplexity, bits_per_byte |
|pile_openwebtext2 | |✓ |✓ | 32925|word_perplexity, byte_perplexity, bits_per_byte |
|pile_philpapers | |✓ |✓ | 68|word_perplexity, byte_perplexity, bits_per_byte |
|pile_pile-cc | |✓ |✓ | 52790|word_perplexity, byte_perplexity, bits_per_byte |
|pile_pubmed-abstracts | |✓ |✓ | 29895|word_perplexity, byte_perplexity, bits_per_byte |
|pile_pubmed-central | |✓ |✓ | 5911|word_perplexity, byte_perplexity, bits_per_byte |
|pile_stackexchange | |✓ |✓ | 30378|word_perplexity, byte_perplexity, bits_per_byte |
|pile_uspto | |✓ |✓ | 11415|word_perplexity, byte_perplexity, bits_per_byte |
|pile_ubuntu-irc | |✓ |✓ | 22|word_perplexity, byte_perplexity, bits_per_byte |
|pile_wikipedia | |✓ |✓ | 17511|word_perplexity, byte_perplexity, bits_per_byte |
|pile_youtubesubtitles                                    | |✓ |✓ |          342|word_perplexity, byte_perplexity, bits_per_byte                                |
|blimp_adjunct_island | |✓ | | 1000|acc
|blimp_anaphor_gender_agreement | |✓ | | 1000|acc
|blimp_anaphor_number_agreement | |✓ | | 1000|acc
|blimp_animate_subject_passive | |✓ | | 1000|acc
|blimp_animate_subject_trans | |✓ | | 1000|acc
|blimp_causative | |✓ | | 1000|acc
|blimp_complex_NP_island | |✓ | | 1000|acc
|blimp_coordinate_structure_constraint_complex_left_branch| |✓ | | 1000|acc
|blimp_coordinate_structure_constraint_object_extraction | |✓ | | 1000|acc
|blimp_determiner_noun_agreement_1 | |✓ | | 1000|acc
|blimp_determiner_noun_agreement_2 | |✓ | | 1000|acc
|blimp_determiner_noun_agreement_irregular_1 | |✓ | | 1000|acc
|blimp_determiner_noun_agreement_irregular_2 | |✓ | | 1000|acc
|blimp_determiner_noun_agreement_with_adj_2 | |✓ | | 1000|acc
|blimp_determiner_noun_agreement_with_adj_irregular_1 | |✓ | | 1000|acc
|blimp_determiner_noun_agreement_with_adj_irregular_2 | |✓ | | 1000|acc
|blimp_determiner_noun_agreement_with_adjective_1 | |✓ | | 1000|acc
|blimp_distractor_agreement_relational_noun | |✓ | | 1000|acc
|blimp_distractor_agreement_relative_clause | |✓ | | 1000|acc
|blimp_drop_argument | |✓ | | 1000|acc
|blimp_ellipsis_n_bar_1 | |✓ | | 1000|acc
|blimp_ellipsis_n_bar_2 | |✓ | | 1000|acc
|blimp_existential_there_object_raising | |✓ | | 1000|acc
|blimp_existential_there_quantifiers_1 | |✓ | | 1000|acc
|blimp_existential_there_quantifiers_2 | |✓ | | 1000|acc
|blimp_existential_there_subject_raising | |✓ | | 1000|acc
|blimp_expletive_it_object_raising | |✓ | | 1000|acc
|blimp_inchoative | |✓ | | 1000|acc
|blimp_intransitive | |✓ | | 1000|acc
|blimp_irregular_past_participle_adjectives | |✓ | | 1000|acc
|blimp_irregular_past_participle_verbs | |✓ | | 1000|acc
|blimp_irregular_plural_subject_verb_agreement_1 | |✓ | | 1000|acc
|blimp_irregular_plural_subject_verb_agreement_2 | |✓ | | 1000|acc
|blimp_left_branch_island_echo_question | |✓ | | 1000|acc
|blimp_left_branch_island_simple_question | |✓ | | 1000|acc
|blimp_matrix_question_npi_licensor_present | |✓ | | 1000|acc
|blimp_npi_present_1 | |✓ | | 1000|acc
|blimp_npi_present_2 | |✓ | | 1000|acc
|blimp_only_npi_licensor_present | |✓ | | 1000|acc
|blimp_only_npi_scope | |✓ | | 1000|acc
|blimp_passive_1 | |✓ | | 1000|acc
|blimp_passive_2 | |✓ | | 1000|acc
|blimp_principle_A_c_command | |✓ | | 1000|acc
|blimp_principle_A_case_1 | |✓ | | 1000|acc
|blimp_principle_A_case_2 | |✓ | | 1000|acc
|blimp_principle_A_domain_1 | |✓ | | 1000|acc
|blimp_principle_A_domain_2 | |✓ | | 1000|acc
|blimp_principle_A_domain_3 | |✓ | | 1000|acc
|blimp_principle_A_reconstruction | |✓ | | 1000|acc
|blimp_regular_plural_subject_verb_agreement_1 | |✓ | | 1000|acc
|blimp_regular_plural_subject_verb_agreement_2 | |✓ | | 1000|acc
|blimp_sentential_negation_npi_licensor_present | |✓ | | 1000|acc
|blimp_sentential_negation_npi_scope | |✓ | | 1000|acc
|blimp_sentential_subject_island | |✓ | | 1000|acc
|blimp_superlative_quantifiers_1 | |✓ | | 1000|acc
|blimp_superlative_quantifiers_2 | |✓ | | 1000|acc
|blimp_tough_vs_raising_1 | |✓ | | 1000|acc
|blimp_tough_vs_raising_2 | |✓ | | 1000|acc
|blimp_transitive | |✓ | | 1000|acc
|blimp_wh_island | |✓ | | 1000|acc
|blimp_wh_questions_object_gap | |✓ | | 1000|acc
|blimp_wh_questions_subject_gap | |✓ | | 1000|acc
|blimp_wh_questions_subject_gap_long_distance | |✓ | | 1000|acc
|blimp_wh_vs_that_no_gap | |✓ | | 1000|acc
|blimp_wh_vs_that_no_gap_long_distance | |✓ | | 1000|acc
|blimp_wh_vs_that_with_gap | |✓ | | 1000|acc
|blimp_wh_vs_that_with_gap_long_distance | |✓ | | 1000|acc
| Task Name |Train|Val|Test|Val/Test Docs| Metrics |
|---------------------------------------------------------|-----|---|----|------------:|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|cola |✓ |✓ | | 1043|mcc |
|mnli |✓ |✓ | | 9815|acc |
|mnli_mismatched |✓ |✓ | | 9832|acc |
|mrpc |✓ |✓ | | 408|acc, f1 |
|rte |✓ |✓ | | 277|acc |
|qnli |✓ |✓ | | 5463|acc |
|qqp |✓ |✓ | | 40430|acc, f1 |
|sst |✓ |✓ | | 872|acc |
|wnli |✓ |✓ | | 71|acc |
|boolq |✓ |✓ | | 3270|acc |
|cb |✓ |✓ | | 56|acc, f1 |
|copa |✓ |✓ | | 100|acc |
|multirc |✓ |✓ | | 4848|acc |
|record |✓ |✓ | | 10000|f1, em |
|wic |✓ |✓ | | 638|acc |
|wsc |✓ |✓ | | 104|acc |
|coqa |✓ |✓ | | 500|f1, em |
|drop |✓ |✓ | | 9536|em, f1 |
|lambada | |✓ | | 5153|ppl, acc |
|lambada_cloze | |✓ | | 5153|ppl, acc |
|lambada_mt_en | |✓ | | 5153|ppl, acc |
|lambada_mt_fr | |✓ | | 5153|ppl, acc |
|lambada_mt_de | |✓ | | 5153|ppl, acc |
|lambada_mt_it | |✓ | | 5153|ppl, acc |
|lambada_mt_es | |✓ | | 5153|ppl, acc |
|wikitext | |✓ |✓ | 62|word_perplexity, byte_perplexity, bits_per_byte |
|piqa |✓ |✓ | | 1838|acc, acc_norm |
|prost | | |✓ | 18736|acc, acc_norm |
|mc_taco | |✓ |✓ | 9442|f1, em |
|pubmedqa | | |✓ | 1000|acc |
|sciq |✓ |✓ |✓ | 1000|acc, acc_norm |
|qa4mre_2011 | | |✓ | 120|acc, acc_norm |
|qa4mre_2012 | | |✓ | 160|acc, acc_norm |
|qa4mre_2013 | | |✓ | 284|acc, acc_norm |
|triviaqa |✓ |✓ | | 11313|acc |
|arc_easy |✓ |✓ |✓ | 2376|acc, acc_norm |
|arc_challenge |✓ |✓ |✓ | 1172|acc, acc_norm |
|logiqa |✓ |✓ |✓ | 651|acc, acc_norm |
|hellaswag |✓ |✓ | | 10042|acc, acc_norm |
|openbookqa |✓ |✓ |✓ | 500|acc, acc_norm |
|squad2 |✓ |✓ | | 11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1 |
|race |✓ |✓ |✓ | 1045|acc |
|headqa |✓ |✓ |✓ | 2742|acc, acc_norm |
|headqa_es |✓ |✓ |✓ | 2742|acc, acc_norm |
|headqa_en |✓ |✓ |✓ | 2742|acc, acc_norm |
|mathqa |✓ |✓ |✓ | 2985|acc, acc_norm |
|webqs |✓ | |✓ | 2032|acc |
|wsc273 | | |✓ | 273|acc |
|winogrande |✓ |✓ | | 1267|acc |
|anli_r1 |✓ |✓ |✓ | 1000|acc |
|anli_r2 |✓ |✓ |✓ | 1000|acc |
|anli_r3 |✓ |✓ |✓ | 1200|acc |
|ethics_cm |✓ | |✓ | 3885|acc |
|ethics_deontology |✓ | |✓ | 3596|acc, em |
|ethics_justice |✓ | |✓ | 2704|acc, em |
|ethics_utilitarianism_original | | |✓ | 4808|acc |
|ethics_utilitarianism |✓ | |✓ | 4808|acc |
|ethics_virtue |✓ | |✓ | 4975|acc, em |
|truthfulqa_mc | |✓ | | 817|mc1, mc2 |
|truthfulqa_gen | |✓ | | 817|bleurt_max, bleurt_acc, bleurt_diff, bleu_max, bleu_acc, bleu_diff, rouge1_max, rouge1_acc, rouge1_diff, rouge2_max, rouge2_acc, rouge2_diff, rougeL_max, rougeL_acc, rougeL_diff|
|mutual |✓ |✓ | | 886|r@1, r@2, mrr |
|mutual_plus |✓ |✓ | | 886|r@1, r@2, mrr |
|math_algebra |✓ | |✓ | 1187|acc |
|math_counting_and_prob |✓ | |✓ | 474|acc |
|math_geometry |✓ | |✓ | 479|acc |
|math_intermediate_algebra |✓ | |✓ | 903|acc |
|math_num_theory |✓ | |✓ | 540|acc |
|math_prealgebra |✓ | |✓ | 871|acc |
|math_precalc |✓ | |✓ | 546|acc |
|math_asdiv | |✓ | | 2305|acc |
|arithmetic_2da | |✓ | | 2000|acc |
|arithmetic_2ds | |✓ | | 2000|acc |
|arithmetic_3da | |✓ | | 2000|acc |
|arithmetic_3ds | |✓ | | 2000|acc |
|arithmetic_4da | |✓ | | 2000|acc |
|arithmetic_4ds | |✓ | | 2000|acc |
|arithmetic_5da | |✓ | | 2000|acc |
|arithmetic_5ds | |✓ | | 2000|acc |
|arithmetic_2dm | |✓ | | 2000|acc |
|arithmetic_1dc | |✓ | | 2000|acc |
|hendrycksTest-abstract_algebra |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-anatomy |✓ |✓ |✓ | 135|acc, acc_norm |
|hendrycksTest-astronomy |✓ |✓ |✓ | 152|acc, acc_norm |
|hendrycksTest-business_ethics |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-clinical_knowledge |✓ |✓ |✓ | 265|acc, acc_norm |
|hendrycksTest-college_biology |✓ |✓ |✓ | 144|acc, acc_norm |
|hendrycksTest-college_chemistry |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-college_computer_science |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-college_mathematics |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-college_medicine |✓ |✓ |✓ | 173|acc, acc_norm |
|hendrycksTest-college_physics |✓ |✓ |✓ | 102|acc, acc_norm |
|hendrycksTest-computer_security |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-conceptual_physics |✓ |✓ |✓ | 235|acc, acc_norm |
|hendrycksTest-econometrics |✓ |✓ |✓ | 114|acc, acc_norm |
|hendrycksTest-electrical_engineering |✓ |✓ |✓ | 145|acc, acc_norm |
|hendrycksTest-elementary_mathematics |✓ |✓ |✓ | 378|acc, acc_norm |
|hendrycksTest-formal_logic |✓ |✓ |✓ | 126|acc, acc_norm |
|hendrycksTest-global_facts |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-high_school_biology |✓ |✓ |✓ | 310|acc, acc_norm |
|hendrycksTest-high_school_chemistry |✓ |✓ |✓ | 203|acc, acc_norm |
|hendrycksTest-high_school_computer_science |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-high_school_european_history |✓ |✓ |✓ | 165|acc, acc_norm |
|hendrycksTest-high_school_geography |✓ |✓ |✓ | 198|acc, acc_norm |
|hendrycksTest-high_school_government_and_politics |✓ |✓ |✓ | 193|acc, acc_norm |
|hendrycksTest-high_school_macroeconomics |✓ |✓ |✓ | 390|acc, acc_norm |
|hendrycksTest-high_school_mathematics |✓ |✓ |✓ | 270|acc, acc_norm |
|hendrycksTest-high_school_microeconomics |✓ |✓ |✓ | 238|acc, acc_norm |
|hendrycksTest-high_school_physics |✓ |✓ |✓ | 151|acc, acc_norm |
|hendrycksTest-high_school_psychology |✓ |✓ |✓ | 545|acc, acc_norm |
|hendrycksTest-high_school_statistics |✓ |✓ |✓ | 216|acc, acc_norm |
|hendrycksTest-high_school_us_history |✓ |✓ |✓ | 204|acc, acc_norm |
|hendrycksTest-high_school_world_history |✓ |✓ |✓ | 237|acc, acc_norm |
|hendrycksTest-human_aging |✓ |✓ |✓ | 223|acc, acc_norm |
|hendrycksTest-human_sexuality |✓ |✓ |✓ | 131|acc, acc_norm |
|hendrycksTest-international_law |✓ |✓ |✓ | 121|acc, acc_norm |
|hendrycksTest-jurisprudence |✓ |✓ |✓ | 108|acc, acc_norm |
|hendrycksTest-logical_fallacies |✓ |✓ |✓ | 163|acc, acc_norm |
|hendrycksTest-machine_learning |✓ |✓ |✓ | 112|acc, acc_norm |
|hendrycksTest-management |✓ |✓ |✓ | 103|acc, acc_norm |
|hendrycksTest-marketing |✓ |✓ |✓ | 234|acc, acc_norm |
|hendrycksTest-medical_genetics |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-miscellaneous |✓ |✓ |✓ | 783|acc, acc_norm |
|hendrycksTest-moral_disputes |✓ |✓ |✓ | 346|acc, acc_norm |
|hendrycksTest-moral_scenarios |✓ |✓ |✓ | 895|acc, acc_norm |
|hendrycksTest-nutrition |✓ |✓ |✓ | 306|acc, acc_norm |
|hendrycksTest-philosophy |✓ |✓ |✓ | 311|acc, acc_norm |
|hendrycksTest-prehistory |✓ |✓ |✓ | 324|acc, acc_norm |
|hendrycksTest-professional_accounting |✓ |✓ |✓ | 282|acc, acc_norm |
|hendrycksTest-professional_law |✓ |✓ |✓ | 1534|acc, acc_norm |
|hendrycksTest-professional_medicine |✓ |✓ |✓ | 272|acc, acc_norm |
|hendrycksTest-professional_psychology |✓ |✓ |✓ | 612|acc, acc_norm |
|hendrycksTest-public_relations |✓ |✓ |✓ | 110|acc, acc_norm |
|hendrycksTest-security_studies |✓ |✓ |✓ | 245|acc, acc_norm |
|hendrycksTest-sociology |✓ |✓ |✓ | 201|acc, acc_norm |
|hendrycksTest-us_foreign_policy |✓ |✓ |✓ | 100|acc, acc_norm |
|hendrycksTest-virology |✓ |✓ |✓ | 166|acc, acc_norm |
|hendrycksTest-world_religions |✓ |✓ |✓ | 171|acc, acc_norm |
|wmt14-en-fr | | |✓ | 3003|bleu, chrf, ter |
|wmt14-fr-en | | |✓ | 3003|bleu, chrf, ter |
|wmt16-en-ro | | |✓ | 1999|bleu, chrf, ter |
|wmt16-ro-en | | |✓ | 1999|bleu, chrf, ter |
|wmt16-de-en | | |✓ | 2999|bleu, chrf, ter |
|wmt16-en-de | | |✓ | 2999|bleu, chrf, ter |
|wmt20-cs-en | | |✓ | 664|bleu, chrf, ter |
|wmt20-de-en | | |✓ | 785|bleu, chrf, ter |
|wmt20-de-fr | | |✓ | 1619|bleu, chrf, ter |
|wmt20-en-cs | | |✓ | 1418|bleu, chrf, ter |
|wmt20-en-de | | |✓ | 1418|bleu, chrf, ter |
|wmt20-en-iu | | |✓ | 2971|bleu, chrf, ter |
|wmt20-en-ja | | |✓ | 1000|bleu, chrf, ter |
|wmt20-en-km | | |✓ | 2320|bleu, chrf, ter |
|wmt20-en-pl | | |✓ | 1000|bleu, chrf, ter |
|wmt20-en-ps | | |✓ | 2719|bleu, chrf, ter |
|wmt20-en-ru | | |✓ | 2002|bleu, chrf, ter |
|wmt20-en-ta | | |✓ | 1000|bleu, chrf, ter |
|wmt20-en-zh | | |✓ | 1418|bleu, chrf, ter |
|wmt20-fr-de | | |✓ | 1619|bleu, chrf, ter |
|wmt20-iu-en | | |✓ | 2971|bleu, chrf, ter |
|wmt20-ja-en | | |✓ | 993|bleu, chrf, ter |
|wmt20-km-en | | |✓ | 2320|bleu, chrf, ter |
|wmt20-pl-en | | |✓ | 1001|bleu, chrf, ter |
|wmt20-ps-en | | |✓ | 2719|bleu, chrf, ter |
|wmt20-ru-en | | |✓ | 991|bleu, chrf, ter |
|wmt20-ta-en | | |✓ | 997|bleu, chrf, ter |
|wmt20-zh-en | | |✓ | 2000|bleu, chrf, ter |
|iwslt17-en-ar | | |✓ | 1460|bleu, chrf, ter |
|iwslt17-ar-en | | |✓ | 1460|bleu, chrf, ter |
|anagrams1 | |✓ | | 10000|acc |
|anagrams2 | |✓ | | 10000|acc |
|cycle_letters | |✓ | | 10000|acc |
|random_insertion | |✓ | | 10000|acc |
|reversed_words | |✓ | | 10000|acc |
|pile_arxiv | |✓ |✓ | 2407|word_perplexity, byte_perplexity, bits_per_byte |
|pile_books3 | |✓ |✓ | 269|word_perplexity, byte_perplexity, bits_per_byte |
|pile_bookcorpus2 | |✓ |✓ | 28|word_perplexity, byte_perplexity, bits_per_byte |
|pile_dm-mathematics | |✓ |✓ | 1922|word_perplexity, byte_perplexity, bits_per_byte |
|pile_enron | |✓ |✓ | 1010|word_perplexity, byte_perplexity, bits_per_byte |
|pile_europarl | |✓ |✓ | 157|word_perplexity, byte_perplexity, bits_per_byte |
|pile_freelaw | |✓ |✓ | 5101|word_perplexity, byte_perplexity, bits_per_byte |
|pile_github | |✓ |✓ | 18195|word_perplexity, byte_perplexity, bits_per_byte |
|pile_gutenberg | |✓ |✓ | 80|word_perplexity, byte_perplexity, bits_per_byte |
|pile_hackernews | |✓ |✓ | 1632|word_perplexity, byte_perplexity, bits_per_byte |
|pile_nih-exporter | |✓ |✓ | 1884|word_perplexity, byte_perplexity, bits_per_byte |
|pile_opensubtitles | |✓ |✓ | 642|word_perplexity, byte_perplexity, bits_per_byte |
|pile_openwebtext2 | |✓ |✓ | 32925|word_perplexity, byte_perplexity, bits_per_byte |
|pile_philpapers | |✓ |✓ | 68|word_perplexity, byte_perplexity, bits_per_byte |
|pile_pile-cc | |✓ |✓ | 52790|word_perplexity, byte_perplexity, bits_per_byte |
|pile_pubmed-abstracts | |✓ |✓ | 29895|word_perplexity, byte_perplexity, bits_per_byte |
|pile_pubmed-central | |✓ |✓ | 5911|word_perplexity, byte_perplexity, bits_per_byte |
|pile_stackexchange | |✓ |✓ | 30378|word_perplexity, byte_perplexity, bits_per_byte |
|pile_uspto | |✓ |✓ | 11415|word_perplexity, byte_perplexity, bits_per_byte |
|pile_ubuntu-irc | |✓ |✓ | 22|word_perplexity, byte_perplexity, bits_per_byte |
|pile_wikipedia | |✓ |✓ | 17511|word_perplexity, byte_perplexity, bits_per_byte |
|pile_youtubesubtitles | |✓ |✓ | 342|word_perplexity, byte_perplexity, bits_per_byte |
|blimp_adjunct_island | |✓ | | 1000|acc |
|blimp_anaphor_gender_agreement | |✓ | | 1000|acc |
|blimp_anaphor_number_agreement | |✓ | | 1000|acc |
|blimp_animate_subject_passive | |✓ | | 1000|acc |
|blimp_animate_subject_trans | |✓ | | 1000|acc |
|blimp_causative | |✓ | | 1000|acc |
|blimp_complex_NP_island | |✓ | | 1000|acc |
|blimp_coordinate_structure_constraint_complex_left_branch| |✓ | | 1000|acc |
|blimp_coordinate_structure_constraint_object_extraction | |✓ | | 1000|acc |
|blimp_determiner_noun_agreement_1 | |✓ | | 1000|acc |
|blimp_determiner_noun_agreement_2 | |✓ | | 1000|acc |
|blimp_determiner_noun_agreement_irregular_1 | |✓ | | 1000|acc |
|blimp_determiner_noun_agreement_irregular_2 | |✓ | | 1000|acc |
|blimp_determiner_noun_agreement_with_adj_2 | |✓ | | 1000|acc |
|blimp_determiner_noun_agreement_with_adj_irregular_1 | |✓ | | 1000|acc |
|blimp_determiner_noun_agreement_with_adj_irregular_2 | |✓ | | 1000|acc |
|blimp_determiner_noun_agreement_with_adjective_1 | |✓ | | 1000|acc |
|blimp_distractor_agreement_relational_noun | |✓ | | 1000|acc |
|blimp_distractor_agreement_relative_clause | |✓ | | 1000|acc |
|blimp_drop_argument | |✓ | | 1000|acc |
|blimp_ellipsis_n_bar_1 | |✓ | | 1000|acc |
|blimp_ellipsis_n_bar_2 | |✓ | | 1000|acc |
|blimp_existential_there_object_raising | |✓ | | 1000|acc |
|blimp_existential_there_quantifiers_1 | |✓ | | 1000|acc |
|blimp_existential_there_quantifiers_2 | |✓ | | 1000|acc |
|blimp_existential_there_subject_raising | |✓ | | 1000|acc |
|blimp_expletive_it_object_raising | |✓ | | 1000|acc |
|blimp_inchoative | |✓ | | 1000|acc |
|blimp_intransitive | |✓ | | 1000|acc |
|blimp_irregular_past_participle_adjectives | |✓ | | 1000|acc |
|blimp_irregular_past_participle_verbs | |✓ | | 1000|acc |
|blimp_irregular_plural_subject_verb_agreement_1 | |✓ | | 1000|acc |
|blimp_irregular_plural_subject_verb_agreement_2 | |✓ | | 1000|acc |
|blimp_left_branch_island_echo_question | |✓ | | 1000|acc |
|blimp_left_branch_island_simple_question | |✓ | | 1000|acc |
|blimp_matrix_question_npi_licensor_present | |✓ | | 1000|acc |
|blimp_npi_present_1 | |✓ | | 1000|acc |
|blimp_npi_present_2 | |✓ | | 1000|acc |
|blimp_only_npi_licensor_present | |✓ | | 1000|acc |
|blimp_only_npi_scope | |✓ | | 1000|acc |
|blimp_passive_1 | |✓ | | 1000|acc |
|blimp_passive_2 | |✓ | | 1000|acc |
|blimp_principle_A_c_command | |✓ | | 1000|acc |
|blimp_principle_A_case_1 | |✓ | | 1000|acc |
|blimp_principle_A_case_2 | |✓ | | 1000|acc |
|blimp_principle_A_domain_1 | |✓ | | 1000|acc |
|blimp_principle_A_domain_2 | |✓ | | 1000|acc |
|blimp_principle_A_domain_3 | |✓ | | 1000|acc |
|blimp_principle_A_reconstruction | |✓ | | 1000|acc |
|blimp_regular_plural_subject_verb_agreement_1 | |✓ | | 1000|acc |
|blimp_regular_plural_subject_verb_agreement_2 | |✓ | | 1000|acc |
|blimp_sentential_negation_npi_licensor_present | |✓ | | 1000|acc |
|blimp_sentential_negation_npi_scope | |✓ | | 1000|acc |
|blimp_sentential_subject_island | |✓ | | 1000|acc |
|blimp_superlative_quantifiers_1 | |✓ | | 1000|acc |
|blimp_superlative_quantifiers_2 | |✓ | | 1000|acc |
|blimp_tough_vs_raising_1 | |✓ | | 1000|acc |
|blimp_tough_vs_raising_2 | |✓ | | 1000|acc |
|blimp_transitive | |✓ | | 1000|acc |
|blimp_wh_island | |✓ | | 1000|acc |
|blimp_wh_questions_object_gap | |✓ | | 1000|acc |
|blimp_wh_questions_subject_gap | |✓ | | 1000|acc |
|blimp_wh_questions_subject_gap_long_distance | |✓ | | 1000|acc |
|blimp_wh_vs_that_no_gap | |✓ | | 1000|acc |
|blimp_wh_vs_that_no_gap_long_distance | |✓ | | 1000|acc |
|blimp_wh_vs_that_with_gap | |✓ | | 1000|acc |
|blimp_wh_vs_that_with_gap_long_distance | |✓ | | 1000|acc |
## Usage
......
import collections
import itertools
import pathlib
import random
import lm_eval.metrics
import lm_eval.models
......@@ -7,14 +8,16 @@ import lm_eval.tasks
import lm_eval.base
import lm_eval.decontamination
import numpy as np
from lm_eval.utils import positional_deprecated
from lm_eval.utils import positional_deprecated, run_task_tests
from lm_eval.decontamination.decontaminate import get_train_overlap
@positional_deprecated
def simple_evaluate(model, model_args=None, tasks=[],
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000,
description_dict=None, decontamination_ngrams_path=None):
description_dict=None, check_integrity=False,
decontamination_ngrams_path=None):
"""Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM]
......@@ -38,6 +41,8 @@ def simple_evaluate(model, model_args=None, tasks=[],
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:return
Dictionary of results
"""
......@@ -62,6 +67,9 @@ def simple_evaluate(model, model_args=None, tasks=[],
task_dict = lm_eval.tasks.get_task_dict(tasks)
if check_integrity:
run_task_tests(task_list=tasks)
results = evaluate(
lm=lm,
task_dict=task_dict,
......
......@@ -7,30 +7,30 @@ class DummyLM(LM):
pass
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
    """Construct a DummyLM from a CLI-style argument string.

    Both parameters exist only to satisfy the LM factory interface; the
    dummy model has no configuration, so they are ignored.

    :param arg_string: str, ignored.
    :param additional_config: optional dict, ignored (default None).
    :return: a fresh DummyLM instance.
    """
    # The diff residue had two conflicting `def` lines (old and new
    # signatures); only the newer, backward-compatible one is kept.
    return cls()
def loglikelihood(self, requests):
    """Return one (log-likelihood, is_greedy) pair per request.

    The log-likelihood is a random value in (-1, 0]; is_greedy is
    always False. Suitable only for exercising the evaluation plumbing.
    """
    return [(-random.random(), False) for _ in requests]
def greedy_until(self, requests):
    """Return the fixed completion "lol" for each (context, until) request.

    :param requests: iterable of (context, until) pairs; `until` is unused.
    :return: list with one "lol" string per request.
    :raises AssertionError: if any context is empty after stripping,
        which indicates a malformed request.
    """
    # The diff residue duplicated the assert line (old '' vs new "");
    # a single check is kept, and it runs before the output is appended
    # so a bad request never contributes a result.
    res = []
    for ctx, _ in requests:
        assert ctx.strip() != ""
        res.append("lol")
    return res
def loglikelihood_rolling(self, requests):
    """Return one random rolling log-likelihood per request.

    Each value lies in (-1, 0]; useful only as a stand-in for testing.

    :param requests: iterable of rolling-loglikelihood requests.
    :return: list of floats, one per request.
    """
    # The diff residue left a duplicated `return res` and a literal
    # "\ No newline at end of file" marker, which is invalid Python;
    # this is the single clean implementation.
    return [-random.random() for _ in requests]
......@@ -46,6 +46,8 @@ def oa_completion(**kwargs):
try:
return openai.Completion.create(**kwargs)
except openai.error.OpenAIError:
import traceback
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
......
......@@ -29,6 +29,7 @@ from . import triviaqa
from . import pubmedqa
from . import sciq
from . import webqs
from . import qasper
from . import qa4mre
from . import translation
from . import headqa
......@@ -48,6 +49,7 @@ from . import mutual
from . import truthfulqa
from . import blimp
from . import asdiv
from . import gsm8k
########################################
# Translation tasks
......@@ -121,6 +123,8 @@ TASK_REGISTRY = {
"pubmedqa" : pubmedqa.Pubmed_QA,
"sciq" : sciq.SciQ,
"qasper": qasper.QASPER,
"qa4mre_2011" : qa4mre.QA4MRE_2011,
"qa4mre_2012" : qa4mre.QA4MRE_2012,
"qa4mre_2013" : qa4mre.QA4MRE_2013,
......@@ -170,6 +174,7 @@ TASK_REGISTRY = {
"math_prealgebra": hendrycks_math.MathPrealgebra,
"math_precalc": hendrycks_math.MathPrecalculus,
"math_asdiv": asdiv.Asdiv,
"gsm8k": gsm8k.GradeSchoolMath8K,
# arithmetic
"arithmetic_2da": arithmetic.Arithmetic2DPlus,
......
"""
"Training Verifiers to Solve Math Word Problems"
https://arxiv.org/abs/2110.14168
@misc{cobbe2021training,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
year={2021},
eprint={2110.14168},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
NOTE: See the official implementation of the task:
https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py
for how to make use of the dataset's calculator annotations in your language
model's sample/generation function.
"""
import json
import re
from best_download import download_file
from pathlib import Path
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"
class GradeSchoolMath8K(Task):
    """GSM8K (Cobbe et al., 2021): grade-school math word problems.

    A greedy completion is scored by exact match on the final
    "#### <number>" answer marker.
    """
    VERSION = 0
    DATASET_PATH = Path('data/gsm8k')

    def download(self):
        """Fetch the checksum-verified train/test JSONL splits.

        No-op if the dataset directory already exists.
        """
        if self.DATASET_PATH.exists():
            return
        # Call mkdir on the instance path rather than the unbound
        # `Path.mkdir(self.DATASET_PATH, ...)` form.
        self.DATASET_PATH.mkdir(parents=True)
        base_url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data"
        splits = [
            {"name": "train", "checksum": "17f347dc51477c50d4efb83959dbb7c56297aba886e5544ee2aaed3024813465"},
            {"name": "test", "checksum": "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"},
        ]
        for split in splits:
            file = self.DATASET_PATH / f"{split['name']}.jsonl"
            download_file(f"{base_url}/{split['name']}.jsonl", str(file), split["checksum"])

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def _load_docs(self, file):
        # Fix: the original opened the file without ever closing it. Read
        # inside a context manager and skip blank lines (e.g. a trailing
        # newline), which would otherwise crash json.loads.
        with open(file, encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]

    def training_docs(self):
        return self._load_docs(self.DATASET_PATH / "train.jsonl")

    def validation_docs(self):
        # GSM8K ships no validation split (see has_validation_docs).
        raise NotImplementedError

    def test_docs(self):
        return self._load_docs(self.DATASET_PATH / "test.jsonl")

    def doc_to_text(self, doc):
        return "Question: " + doc['question'] + '\nAnswer:'

    def doc_to_target(self, doc):
        return " " + doc['answer']

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # NOTE: The paper implements "verifiers" that assign a score to multiple
        # solutions and output the highest ranked solution.
        completion = rf.greedy_until(ctx, ['\n'])
        return completion

    def _extract_answer(self, completion):
        # Pull the "#### <number>" marker out of a completion; thousands
        # separators are stripped so "1,234" compares equal to "1234".
        match = ANS_RE.search(completion)
        if match:
            return match.group(1).strip().replace(",", "")
        return INVALID_ANS

    def _is_correct(self, completion, answer):
        # The gold answer must carry a "#### ..." marker; a missing marker is
        # a data error, not a model error.
        gold = self._extract_answer(answer)
        assert gold != INVALID_ANS, "No ground truth answer found in the document."
        return self._extract_answer(completion) == gold

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        completion = results[0]
        answer = doc["answer"]
        return {
            "acc": self._is_correct(completion, answer)
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "acc": mean
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "acc": True
        }
......@@ -18,7 +18,7 @@ class Math(Task):
def download(self):
if not (self.DATASET_PATH / 'test').exists() or not (self.DATASET_PATH / 'done').exists():
sh(f"mkdir -p {self.DATASET_PATH}")
download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", local_file=f"{self.DATASET_PATH}.tar", expected_checksum="01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da")
download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", local_file=f"{self.DATASET_PATH}.tar", expected_checksum="0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac")
sh(f"""
tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'}
rm {self.DATASET_PATH}.tar
......@@ -291,42 +291,42 @@ class Math(Task):
class MathAlgebra(Math):
    # Fix: the dead `VERSION = 0` assignment (immediately overwritten) is
    # removed; the effective version after the checksum bump is 1.
    VERSION = 1

    def get_file_info(self):
        return 'algebra'
class MathCountingAndProbability(Math):
    # Fix: the dead `VERSION = 0` assignment (immediately overwritten) is
    # removed; the effective version after the checksum bump is 1.
    VERSION = 1

    def get_file_info(self):
        return 'counting_and_probability'
class MathGeometry(Math):
    # Fix: the dead `VERSION = 0` assignment (immediately overwritten) is
    # removed; the effective version after the checksum bump is 1.
    VERSION = 1

    def get_file_info(self):
        return 'geometry'
class MathIntermediateAlgebra(Math):
    # Fix: the dead `VERSION = 0` assignment (immediately overwritten) is
    # removed; the effective version after the checksum bump is 1.
    VERSION = 1

    def get_file_info(self):
        return 'intermediate_algebra'
class MathNumberTheory(Math):
    # Fix: the dead `VERSION = 0` assignment (immediately overwritten) is
    # removed; the effective version after the checksum bump is 1.
    VERSION = 1

    def get_file_info(self):
        return 'number_theory'
class MathPrealgebra(Math):
    # Fix: the dead `VERSION = 0` assignment (immediately overwritten) is
    # removed; the effective version after the checksum bump is 1.
    VERSION = 1

    def get_file_info(self):
        return 'prealgebra'
class MathPrecalculus(Math):
    # Fix: the dead `VERSION = 0` assignment (immediately overwritten) is
    # removed; the effective version after the checksum bump is 1.
    VERSION = 1

    def get_file_info(self):
        return 'precalculus'
"""
A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
https://arxiv.org/abs/2105.03011
@article{DBLP:journals/corr/abs-2105-03011,
author = {Pradeep Dasigi and
Kyle Lo and
Iz Beltagy and
Arman Cohan and
Noah A. Smith and
Matt Gardner},
title = {A Dataset of Information-Seeking Questions and Answers Anchored in
Research Papers},
journal = {CoRR},
volume = {abs/2105.03011},
year = {2021},
url = {https://arxiv.org/abs/2105.03011},
eprinttype = {arXiv},
eprint = {2105.03011},
timestamp = {Fri, 14 May 2021 12:13:30 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
from collections import Counter
from math import exp
import random
import re
import string
from lm_eval.base import rf
from lm_eval.metrics import f1_score, mean
from .common import HFTask
def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """
    # Same pipeline as the reference script: lower-case, strip punctuation,
    # drop articles, then collapse runs of whitespace.
    punctuation = set(string.punctuation)
    lowered = s.lower()
    no_punct = "".join(ch for ch in lowered if ch not in punctuation)
    no_articles = re.sub(r"\b(a|an|the)\b", " ", no_punct)
    return " ".join(no_articles.split())
def categorise_answer(answer_blob):
    """Map a QASPER answer blob to an ``(answer, answer_type)`` pair.

    Branch order matters: unanswerable wins, then an explicit "yes", then
    free-form and extractive answers, and only then an explicit "no"
    (``yes_no is False``).

    :param answer_blob: dict with keys ``unanswerable``, ``yes_no``,
        ``free_form_answer`` and ``extractive_spans``.
    :returns: ``(answer, answer_type)``, or ``None`` when no branch matches.
    """
    if answer_blob["unanswerable"]:
        return "unanswerable", "unanswerable"
    elif answer_blob["yes_no"]:
        return "yes", "bool"
    elif answer_blob["free_form_answer"]:
        return answer_blob["free_form_answer"], "free form answer"
    elif answer_blob["extractive_spans"]:
        return answer_blob["extractive_spans"], "extractive_spans"
    elif answer_blob["yes_no"] is False:
        return "no", "bool"
    # Fall-through (yes_no is None with no answer text): the original
    # implicitly returned None here; make that explicit so callers that
    # tuple-unpack the result can see why they crash.
    return None
def token_f1_score(prediction, ground_truth):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    # Token-level overlap counted with multiplicity via Counter intersection.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)
class QASPER(HFTask):
    """QASPER: information-seeking QA over NLP papers (Dasigi et al., 2021).

    Each flattened doc carries a single question/answer pair plus the paper
    title and abstract used as context.
    """
    VERSION = 0
    DATASET_PATH = "qasper"
    DATASET_NAME = None

    def doc_to_text(self, doc):
        return (
            "TITLE: "
            + doc["title"]
            + "\n"
            + "ABSTRACT: "
            + doc["abstract"]
            + "\n\n"
            + "Q: "
            + doc["question"]
            + "\n\n"
            + "A:"
        )

    def doc_to_target(self, doc):
        answer = doc["answer"]
        # Extractive answers arrive as a list of spans; join them for display.
        if isinstance(answer, list):
            answer = ", ".join(answer)
        return " " + answer

    def training_docs(self):
        for doc in self.data["train"]:
            yield from self.process_doc(doc)

    def validation_docs(self):
        # NOTE(review): this iterates the "train" split, identical to
        # training_docs — presumably it should use "validation"; confirm
        # against the HF dataset splits before changing.
        for doc in self.data["train"]:
            yield from self.process_doc(doc)

    def process_doc(self, doc):
        """Given a `doc`, flatten it out so that each JSON blob
        contains exactly one question and one answer. Logic taken from
        the reference implementation available at
        https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
        """
        obs_list = []
        for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
            for answer_blob in answer_list["answer"]:
                answer, answer_type = categorise_answer(answer_blob)
                obs_list.append(
                    {
                        "title": doc["title"],
                        "abstract": doc["abstract"],
                        "question": question,
                        "answer": answer,
                        "answer_type": answer_type,
                    }
                )
        return obs_list

    def process_results(self, doc, results):
        """Score one document's results: yes/no questions arrive as two
        loglikelihoods (yes, no); free-form questions as a single greedy
        completion."""
        # TODO: Calculate a score for extractive spans once a request type for generating
        # extractive spans is available
        if not results:
            return {}
        elif len(results) == 1:
            [res] = results
        elif len(results) == 2:
            [ll_yes, ll_no] = results
        # TODO: Handle unanswerability first
        # unanswerable_gold = doc["answer_type"] == "unanswerable"
        # unanswerable_pred = exp(logprob_unanswerable)
        # res_dict["f1_unanswerable"] = (unanswerable_gold, unanswerable_pred)
        res_dict = {}
        # Handle yes/no questions
        if doc["answer_type"] == "bool":
            gold = 1 if doc["answer"] == "yes" else 0
            pred = ll_yes > ll_no
            res_dict["f1_yesno"] = (gold, pred)
        # Handle completions
        if doc["answer_type"] == "free form answer":
            res_dict["f1_abstractive"] = token_f1_score(res, doc["answer"])
        # TODO: Handle extraction
        # if doc["answer_type"] == "extractive_spans":
        #     res_dict["f1_extractive"] = 0
        return res_dict

    def aggregation(self):
        return {
            "f1_yesno": f1_score,
            "f1_abstractive": mean,
        }

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
        # Fix: the original tested `in ("free form answer")` / `in ("bool")`.
        # Without a trailing comma those parentheses are not tuples, so the
        # checks were substring tests against a bare string; exact equality
        # is what was intended.
        if doc["answer_type"] == "free form answer":
            return [rf.greedy_until(ctx, ["\n"])]
        elif doc["answer_type"] == "bool":
            ll_yes, _ = rf.loglikelihood(ctx, " yes")
            ll_no, _ = rf.loglikelihood(ctx, " no")
            return [ll_yes, ll_no]
        else:
            return []

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "f1_yesno": True,
            "f1_abstractive": True,
        }
import os
import pathlib
import re
import collections
import functools
import inspect
import sys
import pytest
from typing import List
class ExitCodeError(Exception):
......@@ -155,3 +159,32 @@ def positional_deprecated(fn):
"lm-evaluation-harness!")
return fn(*args, **kwargs)
return _wrapper
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
    """
    Search upward in the directory tree to a maximum of three layers
    to find and return the package root (containing the 'tests' folder).

    :param start_path: directory (or file path) to begin the search from.
    :raises FileNotFoundError: if no ancestor within ``max_layers`` levels
        contains ``tests/test_version_stable.py``.
    """
    cur_path = start_path.resolve()
    max_layers = 3
    for _ in range(max_layers):
        if (cur_path / 'tests' / 'test_version_stable.py').exists():
            return cur_path
        cur_path = cur_path.parent.resolve()
    # Fix: the original concatenated "...upwards" + "of..." with no space,
    # producing "upwardsof" in the error message.
    raise FileNotFoundError(
        f"Unable to find package root within {max_layers} upwards of {start_path}"
    )
@positional_deprecated
def run_task_tests(task_list: List[str]):
    """
    Locate the package root and run the version-stability tests for the
    given tasks via pytest, raising if any test fails.
    """
    package_root = find_test_root(start_path=pathlib.Path(__file__))
    # pytest's -k expression: match any of the requested task names.
    task_filter = ' or '.join(task_list)
    pytest_args = [
        f'{package_root}/tests/test_version_stable.py',
        f'--rootdir={package_root}',
        '-k',
        task_filter,
    ]
    sys.path.append(str(package_root))
    pytest_return_val = pytest.main(pytest_args)
    if pytest_return_val:
        raise ValueError(f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}")
\ No newline at end of file
......@@ -36,7 +36,8 @@ def parse_args():
parser.add_argument('--limit', type=int, default=None)
parser.add_argument('--no_cache', action="store_true")
parser.add_argument('--decontamination_ngrams_path', default=None)
parser.add_argument('--description_dict_path', default=None)
parser.add_argument('--description_dict_path', default=None)
parser.add_argument('--check_integrity', action="store_true")
return parser.parse_args()
......@@ -79,7 +80,8 @@ def main():
no_cache=args.no_cache,
limit=args.limit,
description_dict=description_dict,
decontamination_ngrams_path=args.decontamination_ngrams_path
decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity
)
dumped = json.dumps(results, indent=2)
......
......@@ -21,7 +21,7 @@ setuptools.setup(
python_requires='>=3.6',
install_requires=[
"black",
"best_download>=0.0.6",
"best_download==0.0.9",
"datasets==1.15.1",
"click>=7.1",
"scikit-learn>=0.24.1",
......
e7292dbdd7fd8419ba954f2e0701e04c8d0e8842fe053dbf2fe47d926630e35e
\ No newline at end of file
{"results": {"gsm8k": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"gsm8k": 0}}
\ No newline at end of file
f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4
\ No newline at end of file
{"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_algebra": 1}}
\ No newline at end of file
2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad
\ No newline at end of file
{"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 1}}
\ No newline at end of file
46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42
\ No newline at end of file
{"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 1}}
\ No newline at end of file
d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment