{"plain_text":{"description":"An open-source replication of the WebText dataset from OpenAI.\n\nThis is a small subset representing the first 10K records from the original dataset - created for testing.\n\nThe full 8M-record dataset is at https://huggingface.co/datasets/openwebtext\n","citation":"@misc{Gokaslan2019OpenWeb,\n title={OpenWebText Corpus},\n author={Aaron Gokaslan*, Vanya Cohen*, Ellie Pavlick, Stefanie Tellex},\n howpublished={\\url{http://Skylion007.github.io/OpenWebTextCorpus}},\n year={2019}\n}\n","homepage":"https://skylion007.github.io/OpenWebTextCorpus/","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"builder_name":"openwebtext10k","config_name":"plain_text","version":{"version_str":"1.0.0","description":null,"major":1,"minor":0,"patch":0},"splits":{"train":{"name":"train","num_bytes":49670861,"num_examples":10000,"dataset_name":"openwebtext10k"}},"download_checksums":{"https://cdn-datasets.huggingface.co/nlp/datasets/openwebtext/openwebtext-10k.tar.xz":{"num_bytes":14723792,"checksum":"1dd150ffa3361ab32fa9f129d1b5ce20ac48728be16be436558f844d1761c572"}},"download_size":14723792,"post_processing_size":null,"dataset_size":49670861,"size_in_bytes":64394653}}