push many yaml seed tasks

44eec73b · haileyschoelkopf · 82401c96 · 44eec73b · 44eec73b · 44eec73b
Commit 44eec73b authored Jun 02, 2023 by haileyschoelkopf
13 changed files
--- a/lm_eval/tasks/pile/pile_pile-cc.yaml
+++ b/lm_eval/tasks/pile/pile_pile-cc.yaml
+# Task config for the `pile_pile-cc` subset of EleutherAI/the_pile.
+group:
+  - pile
+task: pile_pile-cc
+dataset_path: EleutherAI/the_pile
+dataset_name: pile_pile-cc
+output_type: loglikelihood_rolling
+# NOTE(review): the only split configured here is `test_split`, and it
+# points at `train` — confirm this is intended.
+test_split: train
+template_aliases: ''
+# No prompt text: the document's whole `text` field is the target.
+doc_to_text: ''
+doc_to_target: '{{text}}'
+# The decontamination query is the same `text` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{text}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_pubmed-abstracts.yaml
+++ b/lm_eval/tasks/pile/pile_pubmed-abstracts.yaml
+# Task config for the `pile_pubmed-abstracts` subset of EleutherAI/the_pile.
+group:
+  - pile
+task: pile_pubmed-abstracts
+dataset_path: EleutherAI/the_pile
+dataset_name: pile_pubmed-abstracts
+output_type: loglikelihood_rolling
+# NOTE(review): the only split configured here is `test_split`, and it
+# points at `train` — confirm this is intended.
+test_split: train
+template_aliases: ''
+# No prompt text: the document's whole `text` field is the target.
+doc_to_text: ''
+doc_to_target: '{{text}}'
+# The decontamination query is the same `text` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{text}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_pubmed-central.yaml
+++ b/lm_eval/tasks/pile/pile_pubmed-central.yaml
+# Task config for the `pile_pubmed-central` subset of EleutherAI/the_pile.
+group:
+  - pile
+task: pile_pubmed-central
+dataset_path: EleutherAI/the_pile
+dataset_name: pile_pubmed-central
+output_type: loglikelihood_rolling
+# NOTE(review): the only split configured here is `test_split`, and it
+# points at `train` — confirm this is intended.
+test_split: train
+template_aliases: ''
+# No prompt text: the document's whole `text` field is the target.
+doc_to_text: ''
+doc_to_target: '{{text}}'
+# The decontamination query is the same `text` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{text}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_stackexchange.yaml
+++ b/lm_eval/tasks/pile/pile_stackexchange.yaml
+# Task config for the `pile_stackexchange` subset of EleutherAI/the_pile.
+group:
+  - pile
+task: pile_stackexchange
+dataset_path: EleutherAI/the_pile
+dataset_name: pile_stackexchange
+output_type: loglikelihood_rolling
+# NOTE(review): the only split configured here is `test_split`, and it
+# points at `train` — confirm this is intended.
+test_split: train
+template_aliases: ''
+# No prompt text: the document's whole `text` field is the target.
+doc_to_text: ''
+doc_to_target: '{{text}}'
+# The decontamination query is the same `text` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{text}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_ubuntu-irc.yaml
+++ b/lm_eval/tasks/pile/pile_ubuntu-irc.yaml
+# Task config for the `pile_ubuntu-irc` subset of EleutherAI/the_pile.
+group:
+  - pile
+task: pile_ubuntu-irc
+dataset_path: EleutherAI/the_pile
+dataset_name: pile_ubuntu-irc
+output_type: loglikelihood_rolling
+# NOTE(review): the only split configured here is `test_split`, and it
+# points at `train` — confirm this is intended.
+test_split: train
+template_aliases: ''
+# No prompt text: the document's whole `text` field is the target.
+doc_to_text: ''
+doc_to_target: '{{text}}'
+# The decontamination query is the same `text` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{text}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_uspto.yaml
+++ b/lm_eval/tasks/pile/pile_uspto.yaml
+# Task config for the `pile_uspto` subset of EleutherAI/the_pile.
+group:
+  - pile
+task: pile_uspto
+dataset_path: EleutherAI/the_pile
+dataset_name: pile_uspto
+output_type: loglikelihood_rolling
+# NOTE(review): the only split configured here is `test_split`, and it
+# points at `train` — confirm this is intended.
+test_split: train
+template_aliases: ''
+# No prompt text: the document's whole `text` field is the target.
+doc_to_text: ''
+doc_to_target: '{{text}}'
+# The decontamination query is the same `text` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{text}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_wikipedia.yaml
+++ b/lm_eval/tasks/pile/pile_wikipedia.yaml
+# Task config for the `pile_wikipedia` subset of EleutherAI/the_pile.
+group:
+  - pile
+task: pile_wikipedia
+dataset_path: EleutherAI/the_pile
+dataset_name: pile_wikipedia
+output_type: loglikelihood_rolling
+# NOTE(review): the only split configured here is `test_split`, and it
+# points at `train` — confirm this is intended.
+test_split: train
+template_aliases: ''
+# No prompt text: the document's whole `text` field is the target.
+doc_to_text: ''
+doc_to_target: '{{text}}'
+# The decontamination query is the same `text` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{text}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/pile/pile_youtubesubtitles.yaml
+++ b/lm_eval/tasks/pile/pile_youtubesubtitles.yaml
+# Task config for the `pile_youtubesubtitles` subset of EleutherAI/the_pile.
+group:
+  - pile
+task: pile_youtubesubtitles
+dataset_path: EleutherAI/the_pile
+dataset_name: pile_youtubesubtitles
+output_type: loglikelihood_rolling
+# NOTE(review): the only split configured here is `test_split`, and it
+# points at `train` — confirm this is intended.
+test_split: train
+template_aliases: ''
+# No prompt text: the document's whole `text` field is the target.
+doc_to_text: ''
+doc_to_target: '{{text}}'
+# The decontamination query is the same `text` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{text}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/piqa/piqa.yaml
+++ b/lm_eval/tasks/piqa/piqa.yaml
+# Multiple-choice task config for the `piqa` dataset.
+group:
+  - piqa_yaml_grp
+task: piqa_yaml
+dataset_path: piqa
+dataset_name: null
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+test_split: null
+# Set the list of possible answer choices, and set what this doc's gold
+# label idx is.
+template_aliases: '{% set question = goal %}{% set answer_choices = [sol1, sol2] %}{% set gold = label %}'
+# Double-quoted so that `\n` is an actual newline in the prompt.
+doc_to_text: "Question: {{question}}\nAnswer:"
+# This will be cast to an int.
+doc_to_target: '{{gold}}'
+# All three accuracy variants are mean-aggregated, higher-is-better.
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_mutual_info
+    aggregation: mean
+    higher_is_better: true
--- a/lm_eval/tasks/sciq/sciq.yaml
+++ b/lm_eval/tasks/sciq/sciq.yaml
+# Multiple-choice task config for the `sciq` dataset.
+group:
+  - sciq_yaml_grp
+task: sciq_yaml
+dataset_path: sciq
+dataset_name: null
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+test_split: test
+# TODO: we should see how shuffling answer choices affects perf.
+# Set the list of possible answer choices, and set what this doc's gold
+# label idx is. The correct answer is always placed last, hence gold = 3.
+template_aliases: '{% set answer_choices = [distractor1, distractor2, distractor3, correct_answer] %}{% set gold = 3 %}'
+# Double-quoted so that each `\n` is an actual newline in the prompt.
+doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
+# This will be cast to an int.
+doc_to_target: '{{gold}}'
+# All three accuracy variants are mean-aggregated, higher-is-better.
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_mutual_info
+    aggregation: mean
+    higher_is_better: true
\ No newline at end of file
--- a/lm_eval/tasks/wikitext/README.md
+++ b/lm_eval/tasks/wikitext/README.md
+# Wikitext
+### Paper
+Pointer Sentinel Mixture Models
+https://arxiv.org/pdf/1609.07843.pdf
+The WikiText language modeling dataset is a collection of over 100 million tokens
+extracted from the set of verified Good and Featured articles on Wikipedia.
+NOTE: This `Task` is based on WikiText-2.
+Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
+### Citation
+```
+@misc{merity2016pointer,
+    title={Pointer Sentinel Mixture Models},
+    author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
+    year={2016},
+    eprint={1609.07843},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+### Subtasks
+### Checklist
+- [x] Is in Eval-harness v1.0?
+- [x] Has been checked for regression from v1.0?
+- [ ] Has been checked for equivalence with original paper methodology?
+- [ ] "Main" checked variant clearly denoted?
\ No newline at end of file
--- a/lm_eval/tasks/wikitext/preprocess_wikitext.py
+++ b/lm_eval/tasks/wikitext/preprocess_wikitext.py
+import re
+def wikitext_detokenizer(doc):
+    """Clean up tokenization spacing artifacts in ``doc["page"]``.
+
+    Args:
+        doc: mapping with a ``"page"`` entry holding the raw text.
+
+    Returns:
+        The text after all replacements below have been applied. They run
+        strictly in the order written, each on the previous step's output.
+    """
+    string = doc["page"]
+    # contractions
+    string = string.replace("s '", "s'")
+    # NOTE(review): in Python's `re` the slashes are literal characters, so
+    # this only matches text shaped like "/' 5/" and rewrites it to the
+    # literal "/'[0-9]/". It looks like JS-style /regex/ delimiters were
+    # carried over; the intent was presumably r"' ([0-9])" -> r"'\1".
+    # Left unchanged here because fixing it would alter the scored text and
+    # therefore the perplexity numbers — confirm before changing.
+    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
+    # number separators
+    string = string.replace(" @-@ ", "-")
+    string = string.replace(" @,@ ", ",")
+    string = string.replace(" @.@ ", ".")
+    # punctuation: drop the space before the mark, keep the one after it
+    string = string.replace(" : ", ": ")
+    string = string.replace(" ; ", "; ")
+    string = string.replace(" . ", ". ")
+    string = string.replace(" ! ", "! ")
+    string = string.replace(" ? ", "? ")
+    string = string.replace(" , ", ", ")
+    # double brackets
+    # Each pattern trims whitespace just inside a matched delimiter pair:
+    # (), [], {}, "" and '' respectively.
+    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
+    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
+    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
+    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
+    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
+    # miscellaneous
+    # Collapse spaced-out "=" runs, longest first so that a four-"=" run is
+    # not partially consumed by the shorter patterns.
+    string = string.replace("= = = =", "====")
+    string = string.replace("= = =", "===")
+    string = string.replace("= =", "==")
+    # chr(176) is the degree sign; remove the spaces on both sides of it.
+    string = string.replace(" " + chr(176) + " ", chr(176))
+    # Remove a single space directly before or after a newline.
+    string = string.replace(" \n", "\n")
+    string = string.replace("\n ", "\n")
+    # NOTE(review): presumably " N " is a numeric placeholder token in the
+    # source data — confirm against the dataset.
+    string = string.replace(" N ", " 1 ")
+    string = string.replace(" 's", "'s")
+    return string
--- a/lm_eval/tasks/wikitext/wikitext.yaml
+++ b/lm_eval/tasks/wikitext/wikitext.yaml
+# Task config for `wikitext-2-raw-v1` from
+# EleutherAI/wikitext_document_level.
+group:
+  - wikitext_group
+task: wikitext_yaml
+dataset_path: EleutherAI/wikitext_document_level
+dataset_name: wikitext-2-raw-v1
+output_type: loglikelihood_rolling
+training_split: train
+validation_split: validation
+test_split: test
+template_aliases: ''
+# No prompt text; the target is produced by the `wikitext_detokenizer`
+# function in `preprocess_wikitext`.
+doc_to_text: ''
+doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
+# The decontamination query is the `page` field.
+should_decontaminate: true
+doc_to_decontamination_query: '{{page}}'
+# All three metrics are declared lower-is-better.
+metric_list:
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
\ No newline at end of file