[Refactor] Non-greedy generation ; WIP GSM8k yaml (#559)

* add wip gsm8k yaml
* cleanup tasks dir
* push gsm8k yaml changes
* rename gpt2.py
* add updated gsm8k, triviaqa baseline
* add new cot yaml
* allow for multiple filter pipelines, new filter types
* updated gsm8k + sampling gen configs
* cleanup self-consistency yaml

[Refactor] Non-greedy generation ; WIP GSM8k yaml (#559)
* add wip gsm8k yaml
* cleanup tasks dir
* push gsm8k yaml changes
* rename gpt2.py
* add updated gsm8k, triviaqa baseline
* add new cot yaml
* allow for multiple filter pipelines, new filter types
* updated gsm8k + sampling gen configs
* cleanup self-consistency yaml
232632c6 · Hailey Schoelkopf · GitHub · 4e9412d5 · 232632c6 · 4e9412d5
Unverified Commit 232632c6 authored Jun 07, 2023 by Hailey Schoelkopf Committed by GitHub Jun 07, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 122 additions and 106 deletions

lm_eval/tasks/triviaqa.py lm_eval/tasks/triviaqa.py +113 -0

lm_eval/tasks/wikitext.py lm_eval/tasks/wikitext.py +0 -99

lm_eval/utils.py lm_eval/utils.py +9 -7

No files found.
--- a/lm_eval/tasks/triviaqa.py
+++ b/lm_eval/tasks/triviaqa.py
+"""
+TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension
+https://arxiv.org/pdf/1705.03551.pdf
+
+TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence
+triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts
+and independently gathered evidence documents, six per question on average, that provide
+high quality distant supervision for answering the questions.
+
+Homepage: https://nlp.cs.washington.edu/triviaqa/
+"""
+import inspect
+# import lm_eval.datasets.triviaqa.triviaqa
+import string
+from lm_eval.api.task import Task
+from lm_eval.api.instance import Instance
+from lm_eval.api.register import register_task
+from lm_eval.api.metrics import mean
+
+_CITATION = """
+@InProceedings{JoshiTriviaQA2017,
+    author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
+    title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
+    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
+    month = {July},
+    year = {2017},
+    address = {Vancouver, Canada},
+    publisher = {Association for Computational Linguistics},
+}
+"""
+
+@register_task("triviaqa")
+class TriviaQA(Task):
+    """TriviaQA task: generate a short answer and score it by exact match.
+
+    The prompt is produced by ``doc_to_text``. The model's generation is
+    normalized (stripped, lower-cased, punctuation removed) and compared
+    against every normalized entry of ``doc["answer"]["aliases"]``.
+    """
+
+    VERSION = 1
+    DATASET_PATH = "trivia_qa" #inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
+    DATASET_NAME = "unfiltered.nocontext"
+
+    OUTPUT_TYPE = "greedy_until"
+
+    # Translation table that deletes every ASCII punctuation character. Built
+    # once at class creation instead of once per alias during scoring.
+    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
+
+    def has_training_docs(self):
+        """Return True; ``training_docs`` serves the "train" split."""
+        return True
+
+    def has_validation_docs(self):
+        """Return True; ``validation_docs`` serves the "validation" split."""
+        return True
+
+    def has_test_docs(self):
+        """Return False; ``test_docs`` is not implemented for this task."""
+        return False
+
+    def training_docs(self):
+        """Return the "train" split of the loaded dataset."""
+        return self.dataset["train"]
+
+    def validation_docs(self):
+        """Return the "validation" split of the loaded dataset."""
+        return self.dataset["validation"]
+
+    def test_docs(self):
+        """Always raise NotImplementedError (see ``has_test_docs``)."""
+        raise NotImplementedError()
+
+    def doc_to_text(self, doc):
+        """Render the question of ``doc`` as a ``Q: ... / A:`` prompt."""
+        return f"Q: {doc['question']}\nA:"
+
+    def should_decontaminate(self):
+        """Return True; the query is given by ``doc_to_decontamination_query``."""
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        """Return the raw question text of ``doc``."""
+        return doc["question"]
+
+    def doc_to_target(self, doc):
+        """Return the reference answer value, prefixed with a single space."""
+        return " " + doc["answer"]["value"]
+
+    def _remove_prefixes(self, aliases):
+        """Return the sorted aliases without those that extend a kept alias.
+
+        Dropping an alias that starts with another alias is only sound for
+        prefix-based scoring (once the prefix is accepted there is no need to
+        look at its extensions). It must not be applied before an exact-match
+        comparison, which is why ``process_results`` does not call it.
+
+        :param aliases: list of str
+                Candidate answers. The list is not modified.
+        :return: list of str
+                A new sorted list; empty when ``aliases`` is empty.
+        """
+        # sorted() copies, so the caller's document is not reordered in place.
+        ordered = sorted(aliases)
+        if not ordered:
+            return []
+        ret = [ordered[0]]
+        for alias in ordered[1:]:
+            if not alias.startswith(ret[-1]):
+                ret.append(alias)
+        return ret
+
+    def _normalize(self, text):
+        """Strip, lower-case and remove ASCII punctuation from ``text``."""
+        return text.strip().lower().translate(self._PUNCTUATION_TABLE)
+
+    def construct_requests(self, doc, ctx, **kwargs):
+        """Build the single generation request for ``doc``.
+
+        :param doc:
+                The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+                The context string, generated by fewshot_context. This includes the natural
+                language description, as well as the few shot examples, and the question
+                part of the document for `doc`.
+        :param kwargs:
+                Extra keyword arguments forwarded unchanged to ``Instance``.
+        :return:
+                One ``Instance`` (not a list) of type ``OUTPUT_TYPE``.
+        """
+        # NOTE(review): stopping at "." and "," cuts off answers that contain
+        # those characters before they can be compared -- confirm this is the
+        # intended trade-off.
+        generation_kwargs = {
+            "until": ["\n", ".", ","],
+            "do_sample": False,
+        }
+        return Instance(
+            request_type=self.OUTPUT_TYPE,
+            doc=doc,
+            arguments=(ctx, generation_kwargs),
+            idx=0,
+            **kwargs,
+        )
+
+    def process_results(self, doc, results):
+        """Score one generation by exact match against all answer aliases.
+
+        :param doc:
+                The document the request was built from; its
+                ``doc["answer"]["aliases"]`` entries are the accepted answers.
+        :param results:
+                Model outputs for this document; only ``results[0]`` is read.
+        :return: dict
+                ``{"em": 1.0}`` if the normalized generation equals any
+                normalized alias, otherwise ``{"em": 0.0}``.
+        """
+        continuation = self._normalize(results[0])
+        # Every alias is kept: pruning aliases that share a prefix would make
+        # a correct longer answer (e.g. "New York City" next to "New York")
+        # impossible to match exactly.
+        candidates = {self._normalize(alias) for alias in doc["answer"]["aliases"]}
+        return {"em": float(continuation in candidates)}
+
+    def aggregation(self):
+        """Return the aggregation function for each metric (mean for "em")."""
+        return {
+            "em": mean,
+        }
+
+    def higher_is_better(self):
+        """Return, per metric, whether larger values are better."""
+        return {"em": True}
\ No newline at end of file
--- a/lm_eval/tasks/wikitext.py
+++ b/lm_eval/tasks/wikitext.py
-"""
-Pointer Sentinel Mixture Models
-https://arxiv.org/pdf/1609.07843.pdf
-
-The WikiText language modeling dataset is a collection of over 100 million tokens
-extracted from the set of verified Good and Featured articles on Wikipedia.
-
-NOTE: This `Task` is based on WikiText-2.
-
-Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
-"""
-import re
-
-from lm_eval.api.task import PerplexityTask
-
-from lm_eval.api.register import register_task, register_group
-
-_CITATION = """
-@misc{merity2016pointer,
-    title={Pointer Sentinel Mixture Models},
-    author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
-    year={2016},
-    eprint={1609.07843},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-
-def wikitext_detokenizer(string):
-    # contractions
-    string = string.replace("s '", "s'")
-    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
-    # number separators
-    string = string.replace(" @-@ ", "-")
-    string = string.replace(" @,@ ", ",")
-    string = string.replace(" @.@ ", ".")
-    # punctuation
-    string = string.replace(" : ", ": ")
-    string = string.replace(" ; ", "; ")
-    string = string.replace(" . ", ". ")
-    string = string.replace(" ! ", "! ")
-    string = string.replace(" ? ", "? ")
-    string = string.replace(" , ", ", ")
-    # double brackets
-    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
-    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
-    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
-    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
-    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
-    # miscellaneous
-    string = string.replace("= = = =", "====")
-    string = string.replace("= = =", "===")
-    string = string.replace("= =", "==")
-    string = string.replace(" " + chr(176) + " ", chr(176))
-    string = string.replace(" \n", "\n")
-    string = string.replace("\n ", "\n")
-    string = string.replace(" N ", " 1 ")
-    string = string.replace(" 's", "'s")
-
-    return string
-
-
-@register_task("wikitext")
-class WikiText(PerplexityTask):
-    VERSION = "2.0"
-    DATASET_PATH = "EleutherAI/wikitext_document_level"
-    DATASET_NAME = "wikitext-2-raw-v1"
-
-    def has_training_docs(self):
-        return True
-
-    def has_validation_docs(self):
-        return True
-
-    def has_test_docs(self):
-        return True
-
-    def training_docs(self):
-        return map(self._process_doc, self.dataset["train"])
-
-    def validation_docs(self):
-        return map(self._process_doc, self.dataset["validation"])
-
-    def test_docs(self):
-        return map(self._process_doc, self.dataset["test"])
-
-    def _process_doc(self, doc):
-        return doc["page"]
-
-    def doc_to_target(self, doc):
-        return wikitext_detokenizer(doc)
-
-    def should_decontaminate(self):
-        return True
-
-    def count_words(self, doc):
-        # count number of words in *original doc before detokenization*
-        return len(re.split(r"\s+", doc))
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -157,22 +157,24 @@ def make_table(result_dict):

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
-    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
-    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
+    md_writer.headers = ["Task", "Version", "Filter", "Metric", "Value", "", "Stderr"]
+    latex_writer.headers = ["Task", "Version", "Filter", "Metric", "Value", "", "Stderr"]

    values = []

    for k, dic in result_dict["results"].items():
        version = result_dict["versions"][k]
-        for m, v in dic.items():
+        for (mf), v in dic.items():
+            m, _, f = mf.partition(",")
+            print(m,f)
            if m.endswith("_stderr"):
                continue

-            if m + "_stderr" in dic:
-                se = dic[m + "_stderr"]
-                values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se])
+            if m + "_stderr" + "," + f in dic:
+                se = dic[m + "_stderr" + "," + f]
+                values.append([k, version, f, m, "%.4f" % v, "±", "%.4f" % se])
            else:
-                values.append([k, version, m, "%.4f" % v, "", ""])
+                values.append([k, version, f, m, "%.4f" % v, "", ""])
            k = ""
            version = ""
    md_writer.value_matrix = values