[Refactor] [WIP] New YAML advanced docs (#567)

* add wip gsm8k yaml
* cleanup tasks dir
* push gsm8k yaml changes
* rename gpt2.py
* add updated gsm8k, triviaqa baseline
* add new cot yaml
* allow for multiple filter pipelines, new filter types
* updated gsm8k + sampling gen configs
* cleanup self-consistency yaml
* push outline for advanced docs
* push docs checklist
* switch to inheritance for many tasks
* acc_norm and acc_mutual_info fixed
* fix missing newline in error msg
* remove many .py tasks
* updated GSM8k
* added more doc
* Update advanced_task_guide.md Added list of parameters
* Update advanced_task_guide.md
* Added details on listing metrics
* Update advanced_task_guide.md
* Added more explanation
* modify current default filter name
* add new tags to tasks
* remove a lingering print()
* add rest of param docs, cleanup deprecated fields
* push docs update
* move ALL_TASKS definition location
* confirm write_out.py works if no description dict passed

---------

Co-authored-by: lintangsutawika <lintang@sutawika.com>

[Refactor] [WIP] New YAML advanced docs (#567)
* add wip gsm8k yaml
* cleanup tasks dir
* push gsm8k yaml changes
* rename gpt2.py
* add updated gsm8k, triviaqa baseline
* add new cot yaml
* allow for multiple filter pipelines, new filter types
* updated gsm8k + sampling gen configs
* cleanup self-consistency yaml
* push outline for advanced docs
* push docs checklist
* switch to inheritance for many tasks
* acc_norm and acc_mutual_info fixed
* fix missing newline in error msg
* remove many .py tasks
* updated GSM8k
* added more doc
* Update advanced_task_guide.md Added list of parameters
* Update advanced_task_guide.md
* Added details on listing metrics
* Update advanced_task_guide.md
* Added more explanation
* modify current default filter name
* add new tags to tasks
* remove a lingering print()
* add rest of param docs, cleanup deprecated fields
* push docs update
* move ALL_TASKS definition location
* confirm write_out.py works if no description dict passed

---------

Co-authored-by: lintangsutawika <lintang@sutawika.com>
79b972d6 · Hailey Schoelkopf · GitHub · 761f0087 · 79b972d6 · 79b972d6
Unverified Commit 79b972d6 authored Jun 12, 2023 by Hailey Schoelkopf Committed by GitHub Jun 12, 2023
20 changed files
--- a/lm_eval/tasks/pile/pile_bookcorpus2.yaml
+++ b/lm_eval/tasks/pile/pile_bookcorpus2.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_bookcorpus2
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_bookcorpus2
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_books3.yaml
+++ b/lm_eval/tasks/pile/pile_books3.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_books3
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_books3
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_dm-mathematics.yaml
+++ b/lm_eval/tasks/pile/pile_dm-mathematics.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_dm-mathematics
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_dm-mathematics
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_enron.yaml
+++ b/lm_eval/tasks/pile/pile_enron.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_enron
-dataset_path: EleutherAI/the_pile
-dataset_name: enron_emails
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+dataset_name: pile_enron
--- a/lm_eval/tasks/pile/pile_europarl.yaml
+++ b/lm_eval/tasks/pile/pile_europarl.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_europarl
-dataset_path: EleutherAI/the_pile
-dataset_name: pile_europarl
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+dataset_name: pile_europarl
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_freelaw.yaml
+++ b/lm_eval/tasks/pile/pile_freelaw.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_freelaw
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_freelaw
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_github.yaml
+++ b/lm_eval/tasks/pile/pile_github.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_github
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_github
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_gutenberg.yaml
+++ b/lm_eval/tasks/pile/pile_gutenberg.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_gutenberg
-dataset_path: EleutherAI/the_pile
-dataset_name: pile_gutenberg
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+dataset_name: pile_gutenberg
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_hackernews.yaml
+++ b/lm_eval/tasks/pile/pile_hackernews.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_hackernews
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_hackernews
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_nih-exporter.yaml
+++ b/lm_eval/tasks/pile/pile_nih-exporter.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_nih-exporter
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_nih-exporter
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_opensubtitles.yaml
+++ b/lm_eval/tasks/pile/pile_opensubtitles.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_opensubtitles
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_opensubtitles
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_openwebtext2.yaml
+++ b/lm_eval/tasks/pile/pile_openwebtext2.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_openwebtext2
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_openwebtext2
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_philpapers.yaml
+++ b/lm_eval/tasks/pile/pile_philpapers.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_philpapers
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_philpapers
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_pile-cc.yaml
+++ b/lm_eval/tasks/pile/pile_pile-cc.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_pile-cc
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_pile-cc
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
--- a/lm_eval/tasks/pile/pile_pubmed-abstracts.yaml
+++ b/lm_eval/tasks/pile/pile_pubmed-abstracts.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_pubmed-abstracts
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_pubmed-abstracts
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+
--- a/lm_eval/tasks/pile/pile_pubmed-central.yaml
+++ b/lm_eval/tasks/pile/pile_pubmed-central.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_pubmed-central
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_pubmed-central
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+
--- a/lm_eval/tasks/pile/pile_stackexchange.yaml
+++ b/lm_eval/tasks/pile/pile_stackexchange.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_stackexchange
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_stackexchange
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+
--- a/lm_eval/tasks/pile/pile_ubuntu-irc.yaml
+++ b/lm_eval/tasks/pile/pile_ubuntu-irc.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_ubuntu-irc
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_ubuntu-irc
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+
--- a/lm_eval/tasks/pile/pile_uspto.yaml
+++ b/lm_eval/tasks/pile/pile_uspto.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_uspto
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_uspto
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+
--- a/lm_eval/tasks/pile/pile_wikipedia.yaml
+++ b/lm_eval/tasks/pile/pile_wikipedia.yaml
-group:
-  - pile
+include: pile_arxiv.yaml
 task: pile_wikipedia
-dataset_path: EleutherAI/the_pile
 dataset_name: pile_wikipedia
-output_type: loglikelihood_rolling
-test_split: train
-template_aliases: ""
-doc_to_text: ""
-doc_to_target: "{{text}}"
-should_decontaminate: true
-doc_to_decontamination_query: "{{text}}"
-metric_list:
-  - metric: word_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: byte_perplexity
-    aggregation: weighted_perplexity
-    higher_is_better: false
-  - metric: bits_per_byte
-    aggregation: bits_per_byte
-    higher_is_better: false
+