"""
The Pile: An 800GB Dataset of Diverse Text for Language Modeling
https://arxiv.org/pdf/2101.00027.pdf
The Pile is an 825 GiB diverse, open-source language modelling dataset that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.
Homepage: https://pile.eleuther.ai/
"""
from lm_eval import utils
from lm_eval.api.task import PerplexityTask
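# The module docstring above mentions scoring on Pile BPB (bits per byte). As a
# hedged illustration only, and not a function taken from lm_eval, the sketch
# below shows one way BPB can be derived from a summed negative log-likelihood.
# The names `total_nll_nats` and `total_bytes` are hypothetical accumulators:
# the model's total NLL in nats over the corpus, and the corpus length in UTF-8 bytes.
import math


def bits_per_byte(total_nll_nats: float, total_bytes: int) -> float:
    # Convert nats to bits by dividing by ln(2), then normalize per byte.
    return total_nll_nats / (total_bytes * math.log(2))


# Example: 1.2e6 nats of loss over 1,000,000 bytes of text is roughly 1.73 BPB.
# print(bits_per_byte(1.2e6, 1_000_000))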
_CITATION="""
@article{pile,
title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and record which of them is this doc's gold answer
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{gold}}" # this will be cast to an int.
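# Hedged sketch, not taken from the harness: how the Jinja2 templates in a YAML
# task config like the one above could be rendered for a single document. The
# example `doc` dict below is an assumption for illustration; only the field
# names `question`, `choices`, and `answerKey` come from the snippet itself.
import jinja2

doc = {
    "question": "Which gas do plants absorb from the atmosphere?",
    "choices": {
        "text": ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
        "label": ["A", "B", "C", "D"],
    },
    "answerKey": "B",
}

# doc_to_text: build the prompt string shown to the model.
prompt = jinja2.Template("Question: {{question}}\nAnswer:").render(**doc)

# template_aliases: answer_choices is the list of option strings, and gold is the
# integer index of the correct option, mirroring choices.label.index(answerKey).
answer_choices = doc["choices"]["text"]
gold = doc["choices"]["label"].index(doc["answerKey"])  # -> 1; doc_to_target casts it to int

print(prompt)                # "Question: Which gas do plants absorb from the atmosphere?\nAnswer:"
print(answer_choices[gold])  # "Carbon dioxide"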