Unverified Commit 761f0087 authored by Lintang Sutawika, committed by GitHub

Merge pull request #560 from EleutherAI/dataset-metric-log

Dataset metric log [WIP]
parents 232632c6 ae4d9ed2
@@ -39,4 +39,4 @@ filter_list:
   filter:
     - function: "regex"
       regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
-    - function: "take_first"
\ No newline at end of file
+    - function: "take_first"
@@ -32,4 +32,4 @@ num_fewshot: 5
 # filter:
 #   - function: "regex"
 #     regex_pattern: "### (\\-?[0-9\\.\\,]+)"
-#   - function: "take_first"
\ No newline at end of file
+#   - function: "take_first"
"""
The LAMBADA dataset: Word prediction requiring a broad discourse context
https://arxiv.org/pdf/1606.06031.pdf
LAMBADA is a dataset to evaluate the capabilities of computational models for text
understanding by means of a word prediction task. LAMBADA is a collection of narrative
passages sharing the characteristic that human subjects are able to guess their last
word if they are exposed to the whole passage, but not if they only see the last
sentence preceding the target word. To succeed on LAMBADA, computational models
cannot simply rely on local context, but must be able to keep track of information
in the broader discourse.
Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
"""
from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.metrics import mean, perplexity
from lm_eval.api.registry import register_task, register_group


_CITATION = """
@misc{paperno2016lambada,
    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
    title={The LAMBADA dataset},
    DOI={10.5281/zenodo.2630551},
    publisher={Zenodo},
    year={2016},
    month={Aug}
}
"""

class LambadaBase(Task):
    VERSION = None
    OUTPUT_TYPE = "loglikelihood"

    def training_docs(self):
        if self.has_training_docs():
            return self.dataset["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.dataset["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset["test"]

    def doc_to_text(self, doc):
        return doc["text"].rsplit(" ", 1)[0]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["text"]

    def doc_to_target(self, doc):
        return " " + doc["text"].rsplit(" ", 1)[1]

    def construct_requests(self, doc, ctx, **kwargs):
        return Instance(
            request_type=self.OUTPUT_TYPE,
            doc=doc,
            arguments=(ctx, self.doc_to_target(doc)),
            **kwargs,
        )

    def process_results(self, doc, results):
        # TODO: this is a hack. Filters should make it so that we only have
        # one response per request to score.
        # TODO: recheck this; currently a list of [(ll, is_greedy)] is passed in.
        results = results[0]
        ll, is_greedy = results
        return {"ppl": ll, "acc": int(is_greedy)}

    def aggregation(self):
        return {"ppl": perplexity, "acc": mean}

    def higher_is_better(self):
        return {"ppl": False, "acc": True}
@register_task("lambada_standard")
class LambadaStandard(LambadaBase):
"""The LAMBADA task using the standard original LAMBADA dataset."""
VERSION = "2.0"
DATASET_PATH = "lambada"
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
@register_task("lambada_openai")
class LambadaOpenAI(LambadaBase):
"""The LAMBADA task using the LAMBADA OpenAI dataset, a modified version of the
original LAMBADA dataset created by OpenAI for evaluating their GPT-2 model.
Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199
"""
VERSION = "2.0"
DATASET_PATH = "EleutherAI/lambada_openai"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
# LAMBADA

### Paper

The LAMBADA dataset: Word prediction requiring a broad discourse context
https://arxiv.org/pdf/1606.06031.pdf
LAMBADA is a dataset to evaluate the capabilities of computational models for text
@@ -23,4 +23,4 @@ Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
     publisher={Zenodo},
     year={2016},
     month={Aug}
-}
\ No newline at end of file
+}
The Pile: An 800GB Dataset of Diverse Text for Language Modeling
https://arxiv.org/pdf/2101.00027.pdf
The Pile is an 825 GiB diverse, open-source language modelling dataset that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.
Homepage: https://pile.eleuther.ai/
"""
from lm_eval.api.task import PerplexityTask
from lm_eval.api.registry import register_task, register_group


_CITATION = """
@article{pile,
    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
    journal={arXiv preprint arXiv:2101.00027},
    year={2020}
}
"""

class PilePerplexityTask(PerplexityTask):
    VERSION = "2.0"
    DATASET_PATH = "EleutherAI/the_pile"
    DATASET_NAME = None

    def has_training_docs(self):
        return False

    def test_docs(self):
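        # NOTE (assumption): the WIP dataset config appears to expose only a
        # "train" split, so the first 100 documents stand in as the test set.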
for doc in self.dataset["train"].select(range(100)):
yield doc
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def doc_to_target(self, doc):
return doc["text"]
    # def validation_docs(self):
    #     for doc in self.dataset["validation"]:
    #         yield doc["text"]

    # def test_docs(self):
    #     for doc in self.dataset["test"]:
    #         yield doc["text"]

class PileArxiv(PilePerplexityTask):
    DATASET_NAME = "pile_arxiv"

class PileBooks3(PilePerplexityTask):
    DATASET_NAME = "pile_books3"

class PileBookCorpus2(PilePerplexityTask):
    DATASET_NAME = "pile_bookcorpus2"

class PileDmMathematics(PilePerplexityTask):
    DATASET_NAME = "pile_dm-mathematics"

@register_task("pile_enron")
class PileEnron(PilePerplexityTask):
    DATASET_NAME = "enron_emails"

class PileEuroparl(PilePerplexityTask):
    DATASET_NAME = "pile_europarl"

class PileFreeLaw(PilePerplexityTask):
    DATASET_NAME = "pile_freelaw"

class PileGithub(PilePerplexityTask):
    DATASET_NAME = "pile_github"

class PileGutenberg(PilePerplexityTask):
    DATASET_NAME = "pile_gutenberg"

class PileHackernews(PilePerplexityTask):
    DATASET_NAME = "pile_hackernews"

class PileNIHExporter(PilePerplexityTask):
    DATASET_NAME = "pile_nih-exporter"

class PileOpenSubtitles(PilePerplexityTask):
    DATASET_NAME = "pile_opensubtitles"

class PileOpenWebText2(PilePerplexityTask):
    DATASET_NAME = "pile_openwebtext2"

class PilePhilPapers(PilePerplexityTask):
    DATASET_NAME = "pile_philpapers"

class PilePileCc(PilePerplexityTask):
    DATASET_NAME = "pile_pile-cc"

class PilePubmedAbstracts(PilePerplexityTask):
    DATASET_NAME = "pile_pubmed-abstracts"

class PilePubmedCentral(PilePerplexityTask):
    DATASET_NAME = "pile_pubmed-central"

class PileStackExchange(PilePerplexityTask):
    DATASET_NAME = "pile_stackexchange"

class PileUspto(PilePerplexityTask):
    DATASET_NAME = "pile_uspto"

class PileUbuntuIrc(PilePerplexityTask):
    DATASET_NAME = "pile_ubuntu-irc"

class PileWikipedia(PilePerplexityTask):
    DATASET_NAME = "pile_wikipedia"

class PileYoutubeSubtitles(PilePerplexityTask):
    DATASET_NAME = "pile_youtubesubtitles"
@@ -20,4 +20,4 @@ Homepage: https://pile.eleuther.ai/
     journal={arXiv preprint arXiv:2101.00027},
     year={2020}
 }
-```
\ No newline at end of file
+```
@@ -19,4 +19,4 @@ metric_list:
     higher_is_better: false
   - metric: bits_per_byte
     aggregation: bits_per_byte
-    higher_is_better: false
\ No newline at end of file
+    higher_is_better: false
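To make the YAML above concrete, here is a hypothetical sketch of how a `metric_list` entry could be resolved against an aggregation registry; the dictionary and function names are illustrative, not the harness's actual API:

```python
import math
from typing import Callable, Dict, List, Tuple

# Illustrative registry keyed by the YAML "aggregation" field.
AGGREGATIONS: Dict[str, Callable] = {}

def register_aggregation(name: str):
    def decorate(fn: Callable) -> Callable:
        AGGREGATIONS[name] = fn
        return fn
    return decorate

@register_aggregation("bits_per_byte")
def bits_per_byte(items: List[Tuple[float, int]]) -> float:
    # items: one (loglikelihood_nats, num_bytes) pair per document.
    total_ll = sum(ll for ll, _ in items)
    total_bytes = sum(nbytes for _, nbytes in items)
    return -total_ll / (total_bytes * math.log(2))

entry = {"metric": "bits_per_byte", "aggregation": "bits_per_byte",
         "higher_is_better": False}
aggregate = AGGREGATIONS[entry["aggregation"]]
print(aggregate([(-693.147, 1000)]))  # -> ~1.0 bit per byte
```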