{"wikitext-103-v1":{"description":" The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n","citation":"@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/","license":"Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)","features":{"page":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"wikitext","config_name":"wikitext-103-v1","version":{"version_str":"1.0.0","description":null,"major":1,"minor":0,"patch":0},"splits":{"test":{"name":"test","num_bytes":1281262,"num_examples":62,"dataset_name":"wikitext"},"train":{"name":"train","num_bytes":539297488,"num_examples":29444,"dataset_name":"wikitext"},"validation":{"name":"validation","num_bytes":1142488,"num_examples":60,"dataset_name":"wikitext"}},"download_checksums":{"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip":{"num_bytes":190229076,"checksum":"242ba0f20b329cfdf1ccc61e9e9e5b59becf189db7f7a81cd2a0e2fc31539590"}},"download_size":190229076,"post_processing_size":null,"dataset_size":541721238,"size_in_bytes":731950314},"wikitext-2-v1":{"description":" The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n","citation":"@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/","license":"Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)","features":{"page":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"wikitext","config_name":"wikitext-2-v1","version":{"version_str":"1.0.0","description":null,"major":1,"minor":0,"patch":0},"splits":{"test":{"name":"test","num_bytes":1256634,"num_examples":62,"dataset_name":"wikitext"},"train":{"name":"train","num_bytes":10799034,"num_examples":629,"dataset_name":"wikitext"},"validation":{"name":"validation","num_bytes":1121860,"num_examples":60,"dataset_name":"wikitext"}},"download_checksums":{"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip":{"num_bytes":4475746,"checksum":"92675f1d63015c1c8b51f1656a52d5bdbc33aafa60cc47a218a66e7ee817488c"}},"download_size":4475746,"post_processing_size":null,"dataset_size":13177528,"size_in_bytes":17653274},"wikitext-103-raw-v1":{"description":" The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n","citation":"@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/","license":"Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)","features":{"page":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"wikitext","config_name":"wikitext-103-raw-v1","version":{"version_str":"1.0.0","description":null,"major":1,"minor":0,"patch":0},"splits":{"test":{"name":"test","num_bytes":1290775,"num_examples":62,"dataset_name":"wikitext"},"train":{"name":"train","num_bytes":540656522,"num_examples":29444,"dataset_name":"wikitext"},"validation":{"name":"validation","num_bytes":1147025,"num_examples":60,"dataset_name":"wikitext"}},"download_checksums":{"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip":{"num_bytes":191984949,"checksum":"91c00ae287f0d699e18605c84afc9e45c192bc6b7797ff8837e5474655a33794"}},"download_size":191984949,"post_processing_size":null,"dataset_size":543094322,"size_in_bytes":735079271},"wikitext-2-raw-v1":{"description":" The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n","citation":"@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/","license":"Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)","features":{"page":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"wikitext","config_name":"wikitext-2-raw-v1","version":{"version_str":"1.0.0","description":null,"major":1,"minor":0,"patch":0},"splits":{"test":{"name":"test","num_bytes":1290775,"num_examples":62,"dataset_name":"wikitext"},"train":{"name":"train","num_bytes":10942633,"num_examples":629,"dataset_name":"wikitext"},"validation":{"name":"validation","num_bytes":1147025,"num_examples":60,"dataset_name":"wikitext"}},"download_checksums":{"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip":{"num_bytes":4721645,"checksum":"ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11"}},"download_size":4721645,"post_processing_size":null,"dataset_size":13380433,"size_in_bytes":18102078}}