{"logiqa":{"description":"LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n","citation":"@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/lgw863/LogiQA-dataset","license":"","features":{"label":{"dtype":"string","id":null,"_type":"Value"},"context":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"logiqa","config_name":"logiqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":6419852,"num_examples":7376,"dataset_name":"logiqa"},"test":{"name":"test","num_bytes":571705,"num_examples":651,"dataset_name":"logiqa"},"validation":{"name":"validation","num_bytes":562437,"num_examples":651,"dataset_name":"logiqa"}},"download_checksums":{"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt":{"num_bytes":6281272,"checksum":"7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt":{"num_bytes":559060,"checksum":"359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt":{"num_bytes":550021,"checksum":"
4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}},"download_size":7390353,"post_processing_size":null,"dataset_size":7553994,"size_in_bytes":14944347}}
{"mutual":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":5141602,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":634396,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":624271,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6400269,"size_in_bytes":17398147},"mutual_plus":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe 
responses.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual_plus","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":4921179,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":606620,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":597340,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6125139,"size_in_bytes":17123017}}
{"pile_arxiv":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_arxiv","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":113218251,"num_examples":2407,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":115653720,"num_examples":2434,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":228871971,"size_in_bytes":1160030307},"pile_books3":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_books3","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":150095743,"num_examples":269,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":177359876,"num_examples":301,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":327455619,"size_in_bytes":1258613955},"pile_bookcorpus2":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_bookcorpus2","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":9680652,"num_examples":28,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":9776271,"num_examples":26,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":19456923,"size_in_bytes":950615259},"pile_dm-mathematics":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_dm-mathematics","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":15756556,"num_examples":1922,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":16453386,"num_examples":2007,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":32209942,"size_in_bytes":963368278},"pile_enron":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_enron","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":1638859,"num_examples":1010,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":1556487,"num_examples":947,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":3195346,"size_in_bytes":934353682},"pile_europarl":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_europarl","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":8789652,"num_examples":157,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":9111791,"num_examples":133,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":17901443,"size_in_bytes":949059779},"pile_freelaw":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_freelaw","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":80808693,"num_examples":5101,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":80363814,"num_examples":5094,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":161172507,"size_in_bytes":1092330843},"pile_github":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_github","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":95654706,"num_examples":18195,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":97179576,"num_examples":18337,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":192834282,"size_in_bytes":1123992618},"pile_gutenberg":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_gutenberg","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":30243176,"num_examples":80,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":24685980,"num_examples":60,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":54929156,"size_in_bytes":986087492},"pile_hackernews":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_hackernews","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":8124255,"num_examples":1632,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":9803822,"num_examples":1619,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":17928077,"size_in_bytes":949086413},"pile_nih-exporter":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_nih-exporter","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":3928804,"num_examples":1884,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":3927967,"num_examples":1825,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":7856771,"size_in_bytes":939015107},"pile_opensubtitles":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_opensubtitles","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":21008996,"num_examples":642,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":19622904,"num_examples":621,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":40631900,"size_in_bytes":971790236},"pile_openwebtext2":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_openwebtext2","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":128624303,"num_examples":32925,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":131554302,"num_examples":33400,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":260178605,"size_in_bytes":1191336941},"pile_philpapers":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_philpapers","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":5090158,"num_examples":68,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":6499078,"num_examples":64,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":11589236,"size_in_bytes":942747572},"pile_pile-cc":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_pile-cc","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":235004043,"num_examples":52790,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":233535650,"num_examples":52792,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":468539693,"size_in_bytes":1399698029},"pile_pubmed-abstracts":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_pubmed-abstracts","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":39908950,"num_examples":29895,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":40008336,"num_examples":29871,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":79917286,"size_in_bytes":1011075622},"pile_pubmed-central":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_pubmed-central","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":187251519,"num_examples":5911,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":184791818,"num_examples":5977,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":372043337,"size_in_bytes":1303201673},"pile_stackexchange":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_stackexchange","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":66441557,"num_examples":30378,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":66011397,"num_examples":29950,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":132452954,"size_in_bytes":1063611290},"pile_upsto":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_upsto","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":47345405,"num_examples":11415,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":48122320,"num_examples":11387,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":95467725,"size_in_bytes":1026626061},"pile_ubuntu-irc":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_ubuntu-irc","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":5694218,"num_examples":22,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":7410104,"num_examples":21,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":13104322,"size_in_bytes":944262658},"pile_wikipedia":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_wikipedia","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":52166968,"num_examples":17511,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":53186137,"num_examples":17478,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":105353105,"size_in_bytes":1036511441},"pile_youtubesubtitles":{"description":"The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles","citation":"@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n","homepage":"https://pile.eleuther.ai/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"pile","config_name":"pile_youtubesubtitles","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"test":{"name":"test","num_bytes":7377448,"num_examples":342,"dataset_name":"pile"},"validation":{"name":"validation","num_bytes":8937546,"num_examples":326,"dataset_name":"pile"}},"download_checksums":{"https://the-eye.eu/public/AI/pile/val.jsonl.zst":{"num_bytes":470907480,"checksum":"264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"},"https://the-eye.eu/public/AI/pile/test.jsonl.zst":{"num_bytes":460250856,"checksum":"0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}},"download_size":931158336,"post_processing_size":null,"dataset_size":16314994,"size_in_bytes":947473330}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pile dataset."""
# Standard library.
import json

# Third-party: provides the `datasets` namespace used by this script.
import datasets
_CITATION="""\
@article{pile,
title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
journal={arXiv preprint arXiv:2101.00027},
year={2020}
}
"""
_DESCRIPTION="""\
The Pile is a 825 GiB diverse, open source language modeling data set that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.
"""
_HOMEPAGE="https://pile.eleuther.ai/"
# TODO: Add the license for the dataset here if you can find it
{"quac":{"description":"Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n","citation":"@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n","homepage":"https://quac.ai/","license":"","features":{"title":{"dtype":"string","id":null,"_type":"Value"},"section_title":{"dtype":"string","id":null,"_type":"Value"},"paragraph":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"quac","config_name":"quac","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":212391958,"num_examples":83568,"dataset_name":"quac"},"validation":{"name":"validation","num_bytes":20678483,"num_examples":7354,"dataset_name":"quac"}},"download_checksums":{"https://s3.amazonaws.com/my89public/quac/train_v0.2.json":{"num_bytes":68114819,"checksum":"ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"},"https://s3.amazonaws.com/my89public/quac/val_v0.2.json":{"num_bytes":8929167,"checksum":"09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}},"download_size":77043986,"post_processing_size":null,"dataset_size":233070441,"size_in_bytes":310114427}}
f"{data_dir} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('matinf', data_dir=...)` that includes SAT-package-V3.txt. Manual download instructions: {self.manual_download_instructions}"
)
return[
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
{"mid_word_1_anagrams":{"description":"Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n","citation":"@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n","homepage":"https://github.com/openai/gpt-3/tree/master/data","license":"","features":{"context":{"dtype":"string","id":null,"_type":"Value"},"completion":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"unscramble","config_name":"mid_word_1_anagrams","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":271516,"num_examples":10000,"dataset_name":"unscramble"}},"download_checksums":{"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_1_anagrams.jsonl.gz":{"num_bytes":106533,"checksum":"6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"}},"download_size":106533,"post_processing_size":null,"dataset_size":271516,"size_in_bytes":378049},"mid_word_2_anagrams":{"description":"Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n","citation":"@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = 
{2020}\n}\n","homepage":"https://github.com/openai/gpt-3/tree/master/data","license":"","features":{"context":{"dtype":"string","id":null,"_type":"Value"},"completion":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"unscramble","config_name":"mid_word_2_anagrams","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":282654,"num_examples":10000,"dataset_name":"unscramble"}},"download_checksums":{"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_2_anagrams.jsonl.gz":{"num_bytes":109091,"checksum":"c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"}},"download_size":109091,"post_processing_size":null,"dataset_size":282654,"size_in_bytes":391745},"cycle_letters_in_word":{"description":"Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n","citation":"@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n","homepage":"https://github.com/openai/gpt-3/tree/master/data","license":"","features":{"context":{"dtype":"string","id":null,"_type":"Value"},"completion":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"unscramble","config_name":"cycle_letters_in_word","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":282654,"num_examples":10000,"dataset_name":"unscramble"}},"download_checksums":{"https://raw.githubusercontent.com/openai/gpt-3/master/data/cycle_letters_in_word.jsonl.gz":{"num_bytes":98451,"checksum":"1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"}},"download_size":98451,"post_processing_size":null,"dataset_size":282654,"size_in_bytes":381105},"random_insertion_in_word":{"description":"Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n","citation":"@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = 
{2020}\n}\n","homepage":"https://github.com/openai/gpt-3/tree/master/data","license":"","features":{"context":{"dtype":"string","id":null,"_type":"Value"},"completion":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"unscramble","config_name":"random_insertion_in_word","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":353981,"num_examples":10000,"dataset_name":"unscramble"}},"download_checksums":{"https://raw.githubusercontent.com/openai/gpt-3/master/data/random_insertion_in_word.jsonl.gz":{"num_bytes":143626,"checksum":"72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"}},"download_size":143626,"post_processing_size":null,"dataset_size":353981,"size_in_bytes":497607},"reversed_words":{"description":"Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n","citation":"@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n","homepage":"https://github.com/openai/gpt-3/tree/master/data","license":"","features":{"context":{"dtype":"string","id":null,"_type":"Value"},"completion":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"unscramble","config_name":"reversed_words","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":282654,"num_examples":10000,"dataset_name":"unscramble"}},"download_checksums":{"https://raw.githubusercontent.com/openai/gpt-3/master/data/reversed_words.jsonl.gz":{"num_bytes":91917,"checksum":"133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"}},"download_size":91917,"post_processing_size":null,"dataset_size":282654,"size_in_bytes":374571}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unscramble dataset."""
# Standard library.
import json
import os

# Third-party: provides the `datasets` namespace used by this script.
import datasets
_CITATION="""\
@inproceedings{NEURIPS2020_1457c0d6,
author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},