Unverified commit 79b972d6, authored by Hailey Schoelkopf, committed by GitHub
Browse files

[Refactor] [WIP] New YAML advanced docs (#567)



* add wip gsm8k yaml

* cleanup tasks dir

* push gsm8k yaml changes

* rename gpt2.py

* add updated gsm8k, triviaqa baseline

* add new cot yaml

* allow for multiple filter pipelines, new filter types

* updated gsm8k + sampling gen configs

* cleanup self-consistency yaml

* push outline for advanced docs

* push docs checklist

* switch to inheritance for many tasks

* acc_norm and acc_mutual_info fixed

* fix missing newline in error msg

* remove many .py tasks

* updated GSM8k

* added more doc

* Update advanced_task_guide.md

Added list of parameters

* Update advanced_task_guide.md

* Added details on listing metrics

* Update advanced_task_guide.md

* Added more explanation

* modify current default filter name

* add new tags to tasks

* remove a lingering print()

* add rest of param docs, cleanup deprecated fields

* push docs update

* move ALL_TASKS definition location

* confirm write_out.py works if no description dict passed

---------
Co-authored-by: lintangsutawika <lintang@sutawika.com>
parent 761f0087
# Task config: perplexity evaluation on the YouTube Subtitles subset of the Pile.
# NOTE(review): list indentation appears flattened by the page scrape — in the
# original YAML the `- pile` item and the metric sub-keys are likely nested
# under their parent keys; confirm against the repository file.
group:
- pile
# Inherit shared settings from the arxiv subset config; keys below override it.
include: pile_arxiv.yaml
task: pile_youtubesubtitles
dataset_path: EleutherAI/the_pile
dataset_name: pile_youtubesubtitles
# Rolling loglikelihood: score each whole document as a language-modeling target.
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
# Empty prompt — the entire document text is the scoring target.
doc_to_text: ""
doc_to_target: "{{text}}"
# Use the raw document text when checking for train-set contamination.
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
# All three metrics are lower-is-better perplexity-style measures.
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
group:
- piqa_yaml_grp
task: piqa_yaml
- multiple_choice
task: piqa
dataset_path: piqa
dataset_name: null
output_type: multiple_choice
......
group:
- sciq_yaml_grp
task: sciq_yaml
- multiple_choice
task: sciq
dataset_path: sciq
dataset_name: null
output_type: multiple_choice
......
"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf
The WikiText language modeling dataset is a collection of over 100 million tokens
extracted from the set of verified Good and Featured articles on Wikipedia.
NOTE: This `Task` is based on WikiText-2.
Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re
from lm_eval.api.task import PerplexityTask
from lm_eval.api.registry import register_task, register_group
_CITATION = """
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
def wikitext_detokenizer(string):
    """Undo WikiText-style tokenization artifacts in *string*.

    Collapses the corpus' special ``@-@`` / ``@,@`` / ``@.@`` number
    separators, re-attaches punctuation and contractions, strips padding
    inside paired delimiters, and restores heading markers, returning text
    that reads like the original article. Replacement order matters (e.g.
    ``= = = =`` must be handled before ``= =``), so it is preserved exactly.
    """
    # contractions
    text = string.replace("s '", "s'")
    text = re.sub(r"/' [0-9]/", r"/'[0-9]/", text)

    # number separators: " @X@ " -> "X"
    for marker, sep in ((" @-@ ", "-"), (" @,@ ", ","), (" @.@ ", ".")):
        text = text.replace(marker, sep)

    # punctuation: drop the space *before* each mark, keep the one after
    for mark in (":", ";", ".", "!", "?", ","):
        text = text.replace(f" {mark} ", f"{mark} ")

    # paired delimiters: strip whitespace padding just inside the pair
    for pattern, repl in (
        (r"\(\s*([^\)]*?)\s*\)", r"(\1)"),
        (r"\[\s*([^\]]*?)\s*\]", r"[\1]"),
        (r"{\s*([^}]*?)\s*}", r"{\1}"),
        (r"\"\s*([^\"]*?)\s*\"", r'"\1"'),
        (r"'\s*([^']*?)\s*'", r"'\1'"),
    ):
        text = re.sub(pattern, repl, text)

    # miscellaneous: headings (longest first), degree sign, line-edge
    # spaces, the numeric placeholder token, and possessive 's
    for old, new in (
        ("= = = =", "===="),
        ("= = =", "==="),
        ("= =", "=="),
        (" " + chr(176) + " ", chr(176)),
        (" \n", "\n"),
        ("\n ", "\n"),
        (" N ", " 1 "),
        (" 's", "'s"),
    ):
        text = text.replace(old, new)

    return text
@register_task("wikitext")
class WikiText(PerplexityTask):
VERSION = "2.0"
DATASET_PATH = "EleutherAI/wikitext_document_level"
DATASET_NAME = "wikitext-2-raw-v1"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
return map(self._process_doc, self.dataset["train"])
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
return doc["page"]
def doc_to_target(self, doc):
return wikitext_detokenizer(doc)
def should_decontaminate(self):
return True
def count_words(self, doc):
# count number of words in *original doc before detokenization*
return len(re.split(r"\s+", doc))
# YAML counterpart of the WikiText perplexity task.
# NOTE(review): structure appears flattened by the page scrape — the list
# items below `group:` and the second `task:` entry were likely nested in
# the original file; confirm against the repository YAML.
group:
- wikitext_group
task: default
- perplexity
- loglikelihood_rolling
task: wikitext
dataset_path: EleutherAI/wikitext_document_level
dataset_name: wikitext-2-raw-v1
# Rolling loglikelihood: score each whole document as a language-modeling target.
output_type: loglikelihood_rolling
......
......@@ -174,7 +174,6 @@ def make_table(result_dict):
version = result_dict["versions"][k]
for (mf), v in dic.items():
m, _, f = mf.partition(",")
print(m, f)
if m.endswith("_stderr"):
continue
......
......@@ -4,11 +4,11 @@ import fnmatch
import argparse
from lm_eval import evaluator, utils
from lm_eval.tasks import ALL_TASKS
from lm_eval.api.registry import GROUP_REGISTRY, TASK_REGISTRY
from lm_eval.logger import eval_logger
os.environ["TOKENIZERS_PARALLELISM"] = "false"
ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
class MultiChoice:
def __init__(self, choices):
......@@ -20,9 +20,9 @@ class MultiChoice:
if len(fnmatch.filter(self.choices, value)) == 0:
eval_logger.warning("{} is not in task list.".format(value))
eval_logger.info(f"Available tasks to choose:")
for choice in self.choices:
eval_logger.info(f" {choice}")
# for choice in self.choices:
# eval_logger.info(f" {choice}")
eval_logger.info(ALL_TASKS)
return True
def __iter__(self):
......
......@@ -72,7 +72,7 @@ def main():
doc=doc,
num_fewshot=args.num_fewshot,
rnd=rnd,
description=description,
# description=description,
)
f.write(ctx + "\n")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment