fix for merge from master

baa8b0d3 · bzantium · a956bc63 · baa8b0d3 · baa8b0d3 · baa8b0d3
Commit baa8b0d3 authored May 18, 2023 by bzantium
20 changed files
--- a/lm_eval/datasets/logiqa/logiqa.py
+++ b/lm_eval/datasets/logiqa/logiqa.py
@@ -19,7 +19,7 @@ import datasets

 _CITATION = """\
 @misc{liu2020logiqa,
-    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, 
+    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
    author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
    year={2020},
    eprint={2007.08124},
@@ -54,7 +54,9 @@ class Logiqa(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="logiqa", version=VERSION, description="The LogiQA dataset."),
+        datasets.BuilderConfig(
+            name="logiqa", version=VERSION, description="The LogiQA dataset."
+        ),
    ]

    def _info(self):
@@ -63,9 +65,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
                "label": datasets.Value("string"),
                "context": datasets.Value("string"),
                "question": datasets.Value("string"),
-                "options": datasets.features.Sequence(
-                    datasets.Value("string")
-                ),
+                "options": datasets.features.Sequence(datasets.Value("string")),
            }
        )
        return datasets.DatasetInfo(
@@ -77,7 +77,11 @@ class Logiqa(datasets.GeneratorBasedBuilder):
        )

    def _split_generators(self, dl_manager):
-        urls = {"train": _URLS["train"], "test": _URLS["test"], "validation": _URLS["validation"]}
+        urls = {
+            "train": _URLS["train"],
+            "test": _URLS["test"],
+            "validation": _URLS["validation"],
+        }
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
@@ -91,10 +95,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["test"],
-                    "split": "test"
-                },
+                gen_kwargs={"filepath": data_dir["test"], "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
@@ -110,6 +111,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
    def _generate_examples(self, filepath, split):
        def normalize(text):
            return text.replace(".", ". ").strip()
+
        with open(filepath, encoding="utf-8") as f:
            data = f.read().strip().split("\n\n")
            for key, row in enumerate(data):

--- a/lm_eval/datasets/mutual/__init__.py
+++ b/lm_eval/datasets/mutual/__init__.py
--- a/lm_eval/datasets/mutual/dataset_infos.json
+++ b/lm_eval/datasets/mutual/dataset_infos.json
-{"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n    title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n    author = \"Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n    booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n    year = \"2020\",\n    publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n    title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n    author = \"Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n    booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n    year = \"2020\",\n    publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}}
\ No newline at end of file
+{"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n    title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n    author = \"Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n    booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n    year = \"2020\",\n    publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n    title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n    author = \"Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n    booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n    year = \"2020\",\n    publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}}
--- a/lm_eval/datasets/mutual/mutual.py
+++ b/lm_eval/datasets/mutual/mutual.py
@@ -50,8 +50,14 @@ class Mutual(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="mutual", version=VERSION, description="The MuTual dataset."),
-        datasets.BuilderConfig(name="mutual_plus", version=VERSION, description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses."),
+        datasets.BuilderConfig(
+            name="mutual", version=VERSION, description="The MuTual dataset."
+        ),
+        datasets.BuilderConfig(
+            name="mutual_plus",
+            version=VERSION,
+            description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.",
+        ),
    ]

    def _info(self):
@@ -79,7 +85,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "train"),
+                    "basepath": os.path.join(
+                        data_dir, "MuTual-master", "data", self.config.name, "train"
+                    ),
                    "split": "train",
                },
            ),
@@ -87,7 +95,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "test"),
+                    "basepath": os.path.join(
+                        data_dir, "MuTual-master", "data", self.config.name, "test"
+                    ),
                    "split": "test",
                },
            ),
@@ -95,7 +105,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "dev"),
+                    "basepath": os.path.join(
+                        data_dir, "MuTual-master", "data", self.config.name, "dev"
+                    ),
                    "split": "dev",
                },
            ),
@@ -109,7 +121,7 @@ class Mutual(datasets.GeneratorBasedBuilder):
        for file in sorted(Path(basepath).iterdir()):
            if file.suffix != ".txt":
                continue
-            with open(file, "r", encoding='utf-8') as f:
+            with open(file, "r", encoding="utf-8") as f:
                data_str = f.read()
                # Ignore the occasional empty file.
                if not data_str:

--- a/lm_eval/datasets/pile/__init__.py
+++ b/lm_eval/datasets/pile/__init__.py
--- a/lm_eval/datasets/pile/dataset_infos.json
+++ b/lm_eval/datasets/pile/dataset_infos.json
-{"pile_arxiv": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_arxiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 113218251, "num_examples": 2407, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 115653720, "num_examples": 2434, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 228871971, "size_in_bytes": 1160030307}, "pile_books3": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_books3", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 150095743, "num_examples": 269, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 177359876, "num_examples": 301, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 327455619, "size_in_bytes": 1258613955}, "pile_bookcorpus2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_bookcorpus2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 9680652, "num_examples": 28, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9776271, "num_examples": 26, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 19456923, "size_in_bytes": 950615259}, "pile_dm-mathematics": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_dm-mathematics", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 15756556, "num_examples": 1922, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 16453386, "num_examples": 2007, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 32209942, "size_in_bytes": 963368278}, "pile_enron": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_enron", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 1638859, "num_examples": 1010, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 1556487, "num_examples": 947, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 3195346, "size_in_bytes": 934353682}, "pile_europarl": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_europarl", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8789652, "num_examples": 157, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9111791, "num_examples": 133, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17901443, "size_in_bytes": 949059779}, "pile_freelaw": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_freelaw", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 80808693, "num_examples": 5101, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 80363814, "num_examples": 5094, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 161172507, "size_in_bytes": 1092330843}, "pile_github": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_github", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 95654706, "num_examples": 18195, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 97179576, "num_examples": 18337, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 192834282, "size_in_bytes": 1123992618}, "pile_gutenberg": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_gutenberg", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 30243176, "num_examples": 80, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 24685980, "num_examples": 60, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 54929156, "size_in_bytes": 986087492}, "pile_hackernews": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_hackernews", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8124255, "num_examples": 1632, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9803822, "num_examples": 1619, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17928077, "size_in_bytes": 949086413}, "pile_nih-exporter": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_nih-exporter", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 3928804, "num_examples": 1884, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 3927967, "num_examples": 1825, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 7856771, "size_in_bytes": 939015107}, "pile_opensubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_opensubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 21008996, "num_examples": 642, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 19622904, "num_examples": 621, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 40631900, "size_in_bytes": 971790236}, "pile_openwebtext2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_openwebtext2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 128624303, "num_examples": 32925, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 131554302, "num_examples": 33400, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 260178605, "size_in_bytes": 1191336941}, "pile_philpapers": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_philpapers", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5090158, "num_examples": 68, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 6499078, "num_examples": 64, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 11589236, "size_in_bytes": 942747572}, "pile_pile-cc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pile-cc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 235004043, "num_examples": 52790, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 233535650, "num_examples": 52792, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 468539693, "size_in_bytes": 1399698029}, "pile_pubmed-abstracts": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-abstracts", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 39908950, "num_examples": 29895, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 40008336, "num_examples": 29871, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 79917286, "size_in_bytes": 1011075622}, "pile_pubmed-central": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-central", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 187251519, "num_examples": 5911, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 184791818, "num_examples": 5977, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 372043337, "size_in_bytes": 1303201673}, "pile_stackexchange": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_stackexchange", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 66441557, "num_examples": 30378, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 66011397, "num_examples": 29950, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 132452954, "size_in_bytes": 1063611290}, "pile_upsto": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_upsto", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 47345405, "num_examples": 11415, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 48122320, "num_examples": 11387, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 95467725, "size_in_bytes": 1026626061}, "pile_ubuntu-irc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_ubuntu-irc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5694218, "num_examples": 22, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 7410104, "num_examples": 21, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 13104322, "size_in_bytes": 944262658}, "pile_wikipedia": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_wikipedia", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 52166968, "num_examples": 17511, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 53186137, "num_examples": 17478, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 105353105, "size_in_bytes": 1036511441}, "pile_youtubesubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_youtubesubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 7377448, "num_examples": 342, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 8937546, "num_examples": 326, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 16314994, "size_in_bytes": 947473330}}
\ No newline at end of file
+{"pile_arxiv": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_arxiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 113218251, "num_examples": 2407, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 115653720, "num_examples": 2434, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 228871971, "size_in_bytes": 1160030307}, "pile_books3": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_books3", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 150095743, "num_examples": 269, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 177359876, "num_examples": 301, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 327455619, "size_in_bytes": 1258613955}, "pile_bookcorpus2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_bookcorpus2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 9680652, "num_examples": 28, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9776271, "num_examples": 26, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 19456923, "size_in_bytes": 950615259}, "pile_dm-mathematics": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_dm-mathematics", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 15756556, "num_examples": 1922, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 16453386, "num_examples": 2007, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 32209942, "size_in_bytes": 963368278}, "pile_enron": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_enron", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 1638859, "num_examples": 1010, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 1556487, "num_examples": 947, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 3195346, "size_in_bytes": 934353682}, "pile_europarl": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_europarl", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8789652, "num_examples": 157, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9111791, "num_examples": 133, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17901443, "size_in_bytes": 949059779}, "pile_freelaw": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_freelaw", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 80808693, "num_examples": 5101, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 80363814, "num_examples": 5094, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 161172507, "size_in_bytes": 1092330843}, "pile_github": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_github", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 95654706, "num_examples": 18195, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 97179576, "num_examples": 18337, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 192834282, "size_in_bytes": 1123992618}, "pile_gutenberg": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_gutenberg", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 30243176, "num_examples": 80, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 24685980, "num_examples": 60, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 54929156, "size_in_bytes": 986087492}, "pile_hackernews": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_hackernews", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8124255, "num_examples": 1632, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9803822, "num_examples": 1619, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17928077, "size_in_bytes": 949086413}, "pile_nih-exporter": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_nih-exporter", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 3928804, "num_examples": 1884, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 3927967, "num_examples": 1825, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 7856771, "size_in_bytes": 939015107}, "pile_opensubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_opensubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 21008996, "num_examples": 642, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 19622904, "num_examples": 621, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 40631900, "size_in_bytes": 971790236}, "pile_openwebtext2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_openwebtext2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 128624303, "num_examples": 32925, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 131554302, "num_examples": 33400, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 260178605, "size_in_bytes": 1191336941}, "pile_philpapers": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_philpapers", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5090158, "num_examples": 68, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 6499078, "num_examples": 64, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 11589236, "size_in_bytes": 942747572}, "pile_pile-cc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pile-cc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 235004043, "num_examples": 52790, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 233535650, "num_examples": 52792, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 468539693, "size_in_bytes": 1399698029}, "pile_pubmed-abstracts": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-abstracts", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 39908950, "num_examples": 29895, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 40008336, "num_examples": 29871, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 79917286, "size_in_bytes": 1011075622}, "pile_pubmed-central": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-central", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 187251519, "num_examples": 5911, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 184791818, "num_examples": 5977, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 372043337, "size_in_bytes": 1303201673}, "pile_stackexchange": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_stackexchange", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 66441557, "num_examples": 30378, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 66011397, "num_examples": 29950, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 132452954, "size_in_bytes": 1063611290}, "pile_upsto": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_upsto", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 47345405, "num_examples": 11415, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 48122320, "num_examples": 11387, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 95467725, "size_in_bytes": 1026626061}, "pile_ubuntu-irc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_ubuntu-irc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5694218, "num_examples": 22, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 7410104, "num_examples": 21, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 13104322, "size_in_bytes": 944262658}, "pile_wikipedia": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_wikipedia", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 52166968, "num_examples": 17511, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 53186137, "num_examples": 17478, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 105353105, "size_in_bytes": 1036511441}, "pile_youtubesubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_youtubesubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 7377448, "num_examples": 342, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 8937546, "num_examples": 326, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 16314994, "size_in_bytes": 947473330}}
--- a/lm_eval/datasets/pile/pile.py
+++ b/lm_eval/datasets/pile/pile.py
@@ -42,8 +42,8 @@ _HOMEPAGE = "https://pile.eleuther.ai/"
 _LICENSE = ""

 _URLS = {
-    "validation": "http://eaidata.bmk.sh/data/pile/val.jsonl.zst",
-    "test": "http://eaidata.bmk.sh/data/pile/test.jsonl.zst",
+    "validation": "https://the-eye.eu/public/AI/pile/val.jsonl.zst",
+    "test": "https://the-eye.eu/public/AI/pile/test.jsonl.zst",
 }

 _NAMES = {
@@ -103,10 +103,7 @@ class Pile(datasets.GeneratorBasedBuilder):
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["test"],
-                    "split": "test"
-                },
+                gen_kwargs={"filepath": data_dir["test"], "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,

--- a/lm_eval/datasets/quac/__init__.py
+++ b/lm_eval/datasets/quac/__init__.py
--- a/lm_eval/datasets/quac/dataset_infos.json
+++ b/lm_eval/datasets/quac/dataset_infos.json
-{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n    title={Quac: Question answering in context},\n    author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n    journal={arXiv preprint arXiv:1808.07036},\n    year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}}
\ No newline at end of file
+{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n    title={Quac: Question answering in context},\n    author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n    journal={arXiv preprint arXiv:1808.07036},\n    year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}}
--- a/lm_eval/datasets/quac/quac.py
+++ b/lm_eval/datasets/quac/quac.py
@@ -30,7 +30,7 @@ _CITATION = """\
 """

 _DESCRIPTION = """\
-Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 
+Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
 participating in information seeking dialog. Data instances consist of an interactive
 dialog between two crowd workers: (1) a student who poses a sequence of freeform
 questions to learn as much as possible about a hidden Wikipedia text, and (2)
@@ -54,7 +54,9 @@ class Quac(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="quac", version=VERSION, description="The QuAC dataset"),
+        datasets.BuilderConfig(
+            name="quac", version=VERSION, description="The QuAC dataset"
+        ),
    ]

    def _info(self):
@@ -90,10 +92,7 @@ class Quac(datasets.GeneratorBasedBuilder):
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["validation"],
-                    "split": "validation"
-                },
+                gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
            ),
        ]

@@ -105,7 +104,7 @@ class Quac(datasets.GeneratorBasedBuilder):
            for row in data:
                paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
                qas = row["paragraphs"][0]["qas"]
-                qa_pairs = [(qa['question'], qa['answers'][0]['text']) for qa in qas]
+                qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas]
                for (question, answer) in qa_pairs:
                    # Yields examples as (key, example) tuples
                    yield key, {

--- a/lm_eval/datasets/sat_analogies/__init__.py
+++ b/lm_eval/datasets/sat_analogies/__init__.py
--- a/lm_eval/datasets/sat_analogies/sat_analogies.py
+++ b/lm_eval/datasets/sat_analogies/sat_analogies.py
@@ -44,13 +44,16 @@ _LICENSE = ""


 class SatAnalogies(datasets.GeneratorBasedBuilder):
-    """ SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions. """
+    """SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions."""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="sat_analogies", version=VERSION,
-                               description="The SAT Analogy Questions dataset"),
+        datasets.BuilderConfig(
+            name="sat_analogies",
+            version=VERSION,
+            description="The SAT Analogy Questions dataset",
+        ),
    ]

    @property
@@ -58,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
        return (
            "To use SAT Analogy Questions you have to download it manually. Please "
            "email Peter Turney to request the data (https://www.apperceptual.com). "
-            "Once you recieve a download link for the dataset, supply the local path "
+            "Once you receive a download link for the dataset, supply the local path "
            "as the `data_dir` arg: "
            "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
        )
@@ -68,9 +71,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
            {
                "source": datasets.Value("string"),
                "stem": datasets.Value("string"),
-                "choices": datasets.features.Sequence(
-                    datasets.Value("string")
-                ),
+                "choices": datasets.features.Sequence(datasets.Value("string")),
                "solution": datasets.Value("string"),
            }
        )
@@ -108,7 +109,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
                if len(line) == 0 and record:
                    data.append(record)
                    record = []
-                elif len(line) > 0 and line[0] == '#':
+                elif len(line) > 0 and line[0] == "#":
                    # Skip comments.
                    continue
                else:
@@ -120,8 +121,8 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
            choices = record[-6:-1]
            solution = record[-1]
            yield key, {
-                'source': source,
-                'stem': stem,
-                'choices': choices,
-                'solution': solution,
+                "source": source,
+                "stem": stem,
+                "choices": choices,
+                "solution": solution,
            }
--- a/lm_eval/datasets/triviaqa/README.md
+++ b/lm_eval/datasets/triviaqa/README.md
+---
+dataset_info:
+  features:
+  - name: question_id
+    dtype: string
+  - name: question_source
+    dtype: string
+  - name: question
+    dtype: string
+  - name: answer
+    struct:
+    - name: aliases
+      sequence: string
+    - name: value
+      dtype: string
+  - name: search_results
+    sequence:
+    - name: description
+      dtype: string
+    - name: filename
+      dtype: string
+    - name: rank
+      dtype: int32
+    - name: title
+      dtype: string
+    - name: url
+      dtype: string
+    - name: search_context
+      dtype: string
+  config_name: triviaqa
+  splits:
+  - name: train
+    num_bytes: 1270894387
+    num_examples: 87622
+  - name: validation
+    num_bytes: 163755044
+    num_examples: 11313
+  download_size: 632549060
+  dataset_size: 1434649431
+---
--- a/lm_eval/datasets/triviaqa/__init__.py
+++ b/lm_eval/datasets/triviaqa/__init__.py
--- a/lm_eval/datasets/triviaqa/dataset_infos.json
+++ b/lm_eval/datasets/triviaqa/dataset_infos.json
-{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n    author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n    title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n    month = {July},\n    year = {2017},\n    address = {Vancouver, Canada},\n    publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "values": {"dtype": "string", "id": null, "_type": "Value"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "trivia_qa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 32846960, "num_examples": 87622, "dataset_name": "trivia_qa"}, "validation": {"name": "validation", "num_bytes": 4316214, "num_examples": 11313, "dataset_name": "trivia_qa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 37163174, "size_in_bytes": 583644555}}
\ No newline at end of file
+{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n    author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n    title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n    month = {July},\n    year = {2017},\n    address = {Vancouver, Canada},\n    publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}
--- a/lm_eval/datasets/triviaqa/triviaqa.py
+++ b/lm_eval/datasets/triviaqa/triviaqa.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 #
 # Custom TriviaQA because HF version sanitizes the dataset differently.
+# https://github.com/huggingface/datasets/blob/9977ade72191ff0b6907ec63935448c6269a91a1/datasets/trivia_qa/trivia_qa.py#L285
 """TriviaQA (Unfiltered Raw) dataset."""


@@ -43,32 +44,44 @@ high quality distant supervision for answering the questions.

 _HOMEPAGE = "https://nlp.cs.washington.edu/triviaqa/"

-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
+_LICENSE = "Apache License 2.0"

-_URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz"
+_URLS = "https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz"


-class TriviaQa(datasets.GeneratorBasedBuilder):
-    """ TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples """
+class Triviaqa(datasets.GeneratorBasedBuilder):
+    """TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples"""

-    VERSION = datasets.Version("0.0.1")
+    VERSION = datasets.Version("0.0.2")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
-            name="triviaqa", version=VERSION, description="The TriviaQA dataset"),
+            name="triviaqa", version=VERSION, description="The TriviaQA dataset"
+        ),
    ]

    def _info(self):
        features = datasets.Features(
            {
+                "question_id": datasets.Value("string"),
+                "question_source": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answer": {
-                    "aliases":  datasets.features.Sequence(
+                    "aliases": datasets.features.Sequence(
                        datasets.Value("string"),
                    ),
-                    "value": datasets.Value("string")
-                }
+                    "value": datasets.Value("string"),
+                },
+                "search_results": datasets.features.Sequence(
+                    {
+                        "description": datasets.Value("string"),
+                        "filename": datasets.Value("string"),
+                        "rank": datasets.Value("int32"),
+                        "title": datasets.Value("string"),
+                        "url": datasets.Value("string"),
+                        "search_context": datasets.Value("string"),
+                    }
+                ),
            }
        )
        return datasets.DatasetInfo(
@@ -87,29 +100,58 @@ class TriviaQa(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "filepath": os.path.join(data_dir, "unfiltered-web-train.jsonl"),
-                    "split": "train",
+                    "filepath": os.path.join(
+                        data_dir, "triviaqa-unfiltered", "unfiltered-web-train.json"
+                    ),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "filepath": os.path.join(data_dir, "unfiltered-web-dev.jsonl"),
-                    "split": "dev",
+                    "filepath": os.path.join(
+                        data_dir, "triviaqa-unfiltered", "unfiltered-web-dev.json"
+                    ),
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
+    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
-            for key, row in enumerate(f):
-                data = json.loads(row)
+            json_data = json.load(f)["Data"]
+            for key, data in enumerate(json_data):
+                search_results = []
+                for search_result in data["SearchResults"]:
+                    search_results.append(
+                        {
+                            "description": search_result["Description"]
+                            if "Description" in search_result
+                            else "",
+                            "filename": search_result["Filename"]
+                            if "Filename" in search_result
+                            else "",
+                            "rank": search_result["Rank"]
+                            if "Rank" in search_result
+                            else -1,
+                            "title": search_result["Title"]
+                            if "Title" in search_result
+                            else "",
+                            "url": search_result["Url"]
+                            if "Url" in search_result
+                            else "",
+                            "search_context": search_result["SearchContext"]
+                            if "SearchContext" in search_result
+                            else "",
+                        }
+                    )
                yield key, {
+                    "question_id": data["QuestionId"],
+                    "question_source": data["QuestionSource"],
                    "question": data["Question"],
                    "answer": {
                        "aliases": data["Answer"]["Aliases"],
                        "value": data["Answer"]["Value"],
-                    }
+                    },
+                    "search_results": search_results,
                }
--- a/lm_eval/datasets/truthfulqa/dataset_infos.json
+++ b/lm_eval/datasets/truthfulqa/dataset_infos.json
-{"multiple_choice": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n    author={Stephanie Lin and Jacob Hilton and Owain Evans},\n    year={2021},\n    eprint={2109.07958},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "mc1_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "mc2_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "multiple_choice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 610333, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json": {"num_bytes": 710607, "checksum": "6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}}, "download_size": 710607, "post_processing_size": null, "dataset_size": 610333, "size_in_bytes": 1320940}, "generation": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n    author={Stephanie Lin and Jacob Hilton and Owain Evans},\n    year={2021},\n    eprint={2109.07958},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"category": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "best_answer": {"dtype": "string", "id": null, "_type": "Value"}, "correct_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "incorrect_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "generation", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 463860, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv": {"num_bytes": 443723, "checksum": "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}}, "download_size": 443723, "post_processing_size": null, "dataset_size": 463860, "size_in_bytes": 907583}}
\ No newline at end of file
--- a/lm_eval/datasets/truthfulqa/truthfulqa.py
+++ b/lm_eval/datasets/truthfulqa/truthfulqa.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""TruthfulQA dataset."""
-
-
-import csv
-import json
-
-import datasets
-
-
-_CITATION = """\
-@misc{lin2021truthfulqa,
-    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
-    author={Stephanie Lin and Jacob Hilton and Owain Evans},
-    year={2021},
-    eprint={2109.07958},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
-TruthfulQA is a benchmark to measure whether a language model is truthful in
-generating answers to questions. The benchmark comprises 817 questions that
-span 38 categories, including health, law, finance and politics. Questions are
-crafted so that some humans would answer falsely due to a false belief or
-misconception. To perform well, models must avoid generating false answers
-learned from imitating human texts.
-"""
-
-_HOMEPAGE = "https://github.com/sylinrl/TruthfulQA"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-
-class TruthfulqaConfig(datasets.BuilderConfig):
-    """BuilderConfig for TruthfulQA."""
-
-    def __init__(self, url, features, **kwargs):
-        """BuilderConfig for TruthfulQA.
-
-        Args:
-        url: *string*, the url to the specific subset of the GPT3 Arithmetic dataset.
-        features: *list[string]*, list of the features that will appear in the
-            feature dict.
-        """
-        # Version history:
-        super().__init__(version=datasets.Version("0.0.1"), **kwargs)
-        self.url = url
-        self.features = features
-
-
-class Truthfulqa(datasets.GeneratorBasedBuilder):
-    """TruthfulQA is a benchmark to measure whether a language model is truthful in
-generating answers to questions."""
-
-    BUILDER_CONFIGS = [
-        TruthfulqaConfig(
-            name="multiple_choice",
-            url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
-            features=datasets.Features({
-                "question": datasets.Value("string"),
-                "mc1_targets": {
-                    "choices": datasets.features.Sequence(datasets.Value("string")),
-                    "labels": datasets.features.Sequence(datasets.Value("int32")),
-                },
-                "mc2_targets": {
-                    "choices": datasets.features.Sequence(datasets.Value("string")),
-                    "labels": datasets.features.Sequence(datasets.Value("int32")),
-                }
-            }),
-            description="The multiple choice TruthfulQA task"
-        ),
-        TruthfulqaConfig(
-            name="generation",
-            url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
-            features=datasets.Features({
-                "category": datasets.Value("string"),
-                "question": datasets.Value("string"),
-                "best_answer": datasets.Value("string"),
-                "correct_answers": datasets.features.Sequence(datasets.Value("string")),
-                "incorrect_answers": datasets.features.Sequence(datasets.Value("string")),
-                "source": datasets.Value("string"),
-            }),
-            description="The generative TruthfulQA task"
-        )
-    ]
-
-    def _info(self):
-        return datasets.DatasetInfo(
-            description=f"{_DESCRIPTION}\n{self.config.description}",
-            features=self.config.features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        urls = self.config.url
-        data_dir = dl_manager.download_and_extract(urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir,
-                    "split": "validation",
-                },
-            ),
-        ]
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        if self.config.name == "multiple_choice":
-            # Multiple choice data is in a `JSON` file.
-            with open(filepath, encoding="utf-8") as f:
-                contents = json.load(f)
-                for key, row in enumerate(contents):
-                    yield key, {
-                        "question": row["question"],
-                        "mc1_targets": {
-                            "choices": row["mc1_targets"].keys(),
-                            "labels": row["mc1_targets"].values(),
-                        },
-                        "mc2_targets": {
-                            "choices": row["mc2_targets"].keys(),
-                            "labels": row["mc2_targets"].values(),
-                        }
-                    }
-        else:
-            # Generation data is in a `CSV` file.
-            with open(filepath, newline='') as f:
-                contents = csv.DictReader(f)
-                for key, row in enumerate(contents):
-                    # Ensure that references exist.
-                    if not row['Correct Answers'] or not row['Incorrect Answers']:
-                        continue
-                    yield key, {
-                        "category": row["Category"],
-                        "question": row["Question"],
-                        "best_answer": row["Best Answer"],
-                        # split on ";"
-                        "correct_answers": row["Correct Answers"].strip().split(";"),
-                        "incorrect_answers": row["Incorrect Answers"].strip().split(";"),
-                        "source": row["Source"],
-                    }
--- a/lm_eval/datasets/unscramble/__init__.py
+++ b/lm_eval/datasets/unscramble/__init__.py
--- a/lm_eval/datasets/unscramble/dataset_infos.json
+++ b/lm_eval/datasets/unscramble/dataset_infos.json
-{"mid_word_1_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_1_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 271516, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_1_anagrams.jsonl.gz": {"num_bytes": 106533, "checksum": "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"}}, "download_size": 106533, "post_processing_size": null, "dataset_size": 271516, "size_in_bytes": 378049}, "mid_word_2_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_2_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_2_anagrams.jsonl.gz": {"num_bytes": 109091, "checksum": "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"}}, "download_size": 109091, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 391745}, "cycle_letters_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "cycle_letters_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/cycle_letters_in_word.jsonl.gz": {"num_bytes": 98451, "checksum": "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"}}, "download_size": 98451, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 381105}, "random_insertion_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "random_insertion_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 353981, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/random_insertion_in_word.jsonl.gz": {"num_bytes": 143626, "checksum": "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"}}, "download_size": 143626, "post_processing_size": null, "dataset_size": 353981, "size_in_bytes": 497607}, "reversed_words": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "reversed_words", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/reversed_words.jsonl.gz": {"num_bytes": 91917, "checksum": "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"}}, "download_size": 91917, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 374571}}
\ No newline at end of file
+{"mid_word_1_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_1_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 271516, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_1_anagrams.jsonl.gz": {"num_bytes": 106533, "checksum": "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"}}, "download_size": 106533, "post_processing_size": null, "dataset_size": 271516, "size_in_bytes": 378049}, "mid_word_2_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_2_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_2_anagrams.jsonl.gz": {"num_bytes": 109091, "checksum": "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"}}, "download_size": 109091, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 391745}, "cycle_letters_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "cycle_letters_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/cycle_letters_in_word.jsonl.gz": {"num_bytes": 98451, "checksum": "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"}}, "download_size": 98451, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 381105}, "random_insertion_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "random_insertion_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 353981, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/random_insertion_in_word.jsonl.gz": {"num_bytes": 143626, "checksum": "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"}}, "download_size": 143626, "post_processing_size": null, "dataset_size": 353981, "size_in_bytes": 497607}, "reversed_words": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n    booktitle = {Advances in Neural Information Processing Systems},\n    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n    pages = {1877--1901},\n    publisher = {Curran Associates, Inc.},\n    title = {Language Models are Few-Shot Learners},\n    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n    volume = {33},\n    year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "reversed_words", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/reversed_words.jsonl.gz": {"num_bytes": 91917, "checksum": "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"}}, "download_size": 91917, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 374571}}