pile.py 3.16 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
"""
The Pile: An 800GB Dataset of Diverse Text for Language Modeling
https://arxiv.org/pdf/2101.00027.pdf

The Pile is an 825 GiB diverse, open source language modelling data set that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.

Homepage: https://pile.eleuther.ai/
"""
Jonathan Tow's avatar
Jonathan Tow committed
13
14
15
import inspect
import lm_eval.datasets.pile.pile
from lm_eval.base import PerplexityTask
Jason Phang's avatar
Jason Phang committed
16
17


18
19
20
21
22
23
24
25
26
27
# BibTeX entry for the Pile paper (arXiv:2101.00027) referenced in the module
# docstring.
_CITATION = """
@article{pile,
  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
  journal={arXiv preprint arXiv:2101.00027},
  year={2020}
}
"""


Jonathan Tow's avatar
Jonathan Tow committed
28
class PilePerplexityTask(PerplexityTask):
    """Shared base for the per-component Pile perplexity tasks.

    Concrete subclasses override ``DATASET_NAME``. This class supplies the
    dataset script path and yields the ``"text"`` field of every record in
    the validation and test splits of ``self.dataset``.
    """

    VERSION = 1
    # Path of the source file that defines the ``lm_eval.datasets.pile.pile``
    # module, as reported by ``inspect.getfile``.
    DATASET_PATH = inspect.getfile(lm_eval.datasets.pile.pile)
    # Left unset on the base class; each subclass assigns its own name.
    DATASET_NAME = None

    def has_validation_docs(self):
        """Return ``True``: validation documents are available."""
        return True

    def has_test_docs(self):
        """Return ``True``: test documents are available."""
        return True

    def validation_docs(self):
        """Yield the ``"text"`` value of each record in the validation split."""
        yield from (record["text"] for record in self.dataset["validation"])

    def test_docs(self):
        """Yield the ``"text"`` value of each record in the test split."""
        yield from (record["text"] for record in self.dataset["test"])

Jason Phang's avatar
Jason Phang committed
47

Leo Gao's avatar
Leo Gao committed
48
class PileArxiv(PilePerplexityTask):
    """Perplexity task for the ``pile_arxiv`` component of the Pile."""

    DATASET_NAME = "pile_arxiv"
Jason Phang's avatar
Jason Phang committed
50
51


Leo Gao's avatar
Leo Gao committed
52
class PileBooks3(PilePerplexityTask):
    """Perplexity task for the ``pile_books3`` component of the Pile."""

    DATASET_NAME = "pile_books3"
Jason Phang's avatar
Jason Phang committed
54
55


Leo Gao's avatar
Leo Gao committed
56
class PileBookCorpus2(PilePerplexityTask):
    """Perplexity task for the ``pile_bookcorpus2`` component of the Pile."""

    DATASET_NAME = "pile_bookcorpus2"
Jason Phang's avatar
Jason Phang committed
58
59


Leo Gao's avatar
Leo Gao committed
60
class PileDmMathematics(PilePerplexityTask):
    """Perplexity task for the ``pile_dm-mathematics`` component of the Pile."""

    DATASET_NAME = "pile_dm-mathematics"
Jason Phang's avatar
Jason Phang committed
62
63


Leo Gao's avatar
Leo Gao committed
64
class PileEnron(PilePerplexityTask):
    """Perplexity task for the ``pile_enron`` component of the Pile."""

    DATASET_NAME = "pile_enron"
Jason Phang's avatar
Jason Phang committed
66
67


Leo Gao's avatar
Leo Gao committed
68
class PileEuroparl(PilePerplexityTask):
    """Perplexity task for the ``pile_europarl`` component of the Pile."""

    DATASET_NAME = "pile_europarl"
Jason Phang's avatar
Jason Phang committed
70
71


Leo Gao's avatar
Leo Gao committed
72
class PileFreeLaw(PilePerplexityTask):
    """Perplexity task for the ``pile_freelaw`` component of the Pile."""

    DATASET_NAME = "pile_freelaw"
Jason Phang's avatar
Jason Phang committed
74
75


Leo Gao's avatar
Leo Gao committed
76
class PileGithub(PilePerplexityTask):
    """Perplexity task for the ``pile_github`` component of the Pile."""

    DATASET_NAME = "pile_github"
Jason Phang's avatar
Jason Phang committed
78
79


Leo Gao's avatar
Leo Gao committed
80
class PileGutenberg(PilePerplexityTask):
    """Perplexity task for the ``pile_gutenberg`` component of the Pile."""

    DATASET_NAME = "pile_gutenberg"
Jason Phang's avatar
Jason Phang committed
82
83


Leo Gao's avatar
Leo Gao committed
84
class PileHackernews(PilePerplexityTask):
    """Perplexity task for the ``pile_hackernews`` component of the Pile."""

    DATASET_NAME = "pile_hackernews"
Jason Phang's avatar
Jason Phang committed
86
87


Leo Gao's avatar
Leo Gao committed
88
class PileNIHExporter(PilePerplexityTask):
    """Perplexity task for the ``pile_nih-exporter`` component of the Pile."""

    DATASET_NAME = "pile_nih-exporter"
Jason Phang's avatar
Jason Phang committed
90
91


Leo Gao's avatar
Leo Gao committed
92
class PileOpenSubtitles(PilePerplexityTask):
    """Perplexity task for the ``pile_opensubtitles`` component of the Pile."""

    DATASET_NAME = "pile_opensubtitles"
Jason Phang's avatar
Jason Phang committed
94
95


Leo Gao's avatar
Leo Gao committed
96
class PileOpenWebText2(PilePerplexityTask):
    """Perplexity task for the ``pile_openwebtext2`` component of the Pile."""

    DATASET_NAME = "pile_openwebtext2"
Jason Phang's avatar
Jason Phang committed
98
99


Leo Gao's avatar
Leo Gao committed
100
class PilePhilPapers(PilePerplexityTask):
    """Perplexity task for the ``pile_philpapers`` component of the Pile."""

    DATASET_NAME = "pile_philpapers"
Jason Phang's avatar
Jason Phang committed
102
103


Leo Gao's avatar
Leo Gao committed
104
class PilePileCc(PilePerplexityTask):
    """Perplexity task for the ``pile_pile-cc`` component of the Pile."""

    DATASET_NAME = "pile_pile-cc"
Jason Phang's avatar
Jason Phang committed
106
107


Leo Gao's avatar
Leo Gao committed
108
class PilePubmedAbstracts(PilePerplexityTask):
    """Perplexity task for the ``pile_pubmed-abstracts`` component of the Pile."""

    DATASET_NAME = "pile_pubmed-abstracts"
Jason Phang's avatar
Jason Phang committed
110
111


Leo Gao's avatar
Leo Gao committed
112
class PilePubmedCentral(PilePerplexityTask):
    """Perplexity task for the ``pile_pubmed-central`` component of the Pile."""

    DATASET_NAME = "pile_pubmed-central"
Jason Phang's avatar
Jason Phang committed
114
115


Leo Gao's avatar
Leo Gao committed
116
class PileStackExchange(PilePerplexityTask):
    """Perplexity task for the ``pile_stackexchange`` component of the Pile."""

    DATASET_NAME = "pile_stackexchange"
Jason Phang's avatar
Jason Phang committed
118
119


Leo Gao's avatar
Leo Gao committed
120
class PileUspto(PilePerplexityTask):
    """Perplexity task for the ``pile_upsto`` component of the Pile."""

    # NOTE(review): "upsto" looks like a transposition of "uspto" (compare the
    # class name). The value presumably has to match a name defined by the
    # dataset script at DATASET_PATH, so confirm there before changing it.
    DATASET_NAME = "pile_upsto"
Jason Phang's avatar
Jason Phang committed
122
123


Leo Gao's avatar
Leo Gao committed
124
class PileUbuntuIrc(PilePerplexityTask):
    """Perplexity task for the ``pile_ubuntu-irc`` component of the Pile."""

    DATASET_NAME = "pile_ubuntu-irc"
Jason Phang's avatar
Jason Phang committed
126
127


Leo Gao's avatar
Leo Gao committed
128
class PileWikipedia(PilePerplexityTask):
    """Perplexity task for the ``pile_wikipedia`` component of the Pile."""

    DATASET_NAME = "pile_wikipedia"
Jason Phang's avatar
Jason Phang committed
130
131


Leo Gao's avatar
Leo Gao committed
132
class PileYoutubeSubtitles(PilePerplexityTask):
    """Perplexity task for the ``pile_youtubesubtitles`` component of the Pile."""

    DATASET_NAME = "pile_youtubesubtitles"