pile.py 4.17 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
"""
The Pile: An 800GB Dataset of Diverse Text for Language Modeling
https://arxiv.org/pdf/2101.00027.pdf

The Pile is an 825 GiB diverse, open-source language modelling data set that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.

Homepage: https://pile.eleuther.ai/

@article{pile,
  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
  journal={arXiv preprint arXiv:2101.00027},
  year={2020}
}
"""
Jason Phang's avatar
Jason Phang committed
20
21
22
23
24
25
26
27
28
29
30
import os

import lm_dataformat
import abc
import numpy as np
from lm_eval.base import rf, PerplexityTask
from ..metrics import mean, matthews_corrcoef, f1_score
from ..utils import general_detokenize
from best_download import download_file


Leo Gao's avatar
Leo Gao committed
31
class PilePerplexityTask(PerplexityTask, abc.ABC):
    """Base perplexity task for a single component of the Pile.

    Concrete subclasses set ``PILE_SET_NAME``; only documents whose
    ``pile_set_name`` metadata equals that value are yielded from the shared
    validation and test archives.
    """

    VERSION = 1

    # Value of the ``pile_set_name`` metadata field selected by this task.
    PILE_SET_NAME = None
    # Local locations of the downloaded validation and test archives.
    VAL_PATH = 'data/pile/val.jsonl.zst'
    TEST_PATH = 'data/pile/test.jsonl.zst'

    def download(self):
        """Download the validation and test archives unless both already exist.

        Both ``download_file`` calls are issued whenever either local file is
        missing, so a lost validation archive is fetched again even when the
        test archive is still present.
        """
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
        if os.path.exists(self.VAL_PATH) and os.path.exists(self.TEST_PATH):
            return

        # todo use new best_download fallback api
        for path in (self.VAL_PATH, self.TEST_PATH):
            parent = os.path.dirname(path)
            # A bare file name has no parent component; os.makedirs("") would raise.
            if parent:
                os.makedirs(parent, exist_ok=True)

        download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", local_file=self.VAL_PATH, expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
        download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", local_file=self.TEST_PATH, expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")

    def _docs_for_set(self, path):
        """Yield documents from the archive at ``path`` that belong to this task's set."""
        reader = lm_dataformat.Reader(path)
        for doc, metadata in reader.stream_data(get_meta=True):
            if metadata["pile_set_name"] == self.PILE_SET_NAME:
                yield doc

    def validation_docs(self):
        """Yield the validation documents whose set name matches ``PILE_SET_NAME``."""
        yield from self._docs_for_set(self.VAL_PATH)

    def test_docs(self):
        """Yield the test documents whose set name matches ``PILE_SET_NAME``."""
        yield from self._docs_for_set(self.TEST_PATH)

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that a test split is available."""
        return True


Leo Gao's avatar
Leo Gao committed
65
class PileArxiv(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "ArXiv"."""

    PILE_SET_NAME = "ArXiv"


Leo Gao's avatar
Leo Gao committed
69
class PileBooks3(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "Books3"."""

    PILE_SET_NAME = "Books3"


Leo Gao's avatar
Leo Gao committed
73
class PileBookCorpus2(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "BookCorpus2"."""

    PILE_SET_NAME = "BookCorpus2"


Leo Gao's avatar
Leo Gao committed
77
class PileDmMathematics(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "DM Mathematics"."""

    PILE_SET_NAME = "DM Mathematics"


Leo Gao's avatar
Leo Gao committed
81
class PileEnron(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "Enron Emails"."""

    PILE_SET_NAME = "Enron Emails"


Leo Gao's avatar
Leo Gao committed
85
class PileEuroparl(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "EuroParl"."""

    PILE_SET_NAME = "EuroParl"


Leo Gao's avatar
Leo Gao committed
89
class PileFreeLaw(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "FreeLaw"."""

    PILE_SET_NAME = "FreeLaw"


Leo Gao's avatar
Leo Gao committed
93
class PileGithub(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "Github"."""

    PILE_SET_NAME = "Github"


Leo Gao's avatar
Leo Gao committed
97
class PileGutenberg(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "Gutenberg (PG-19)"."""

    PILE_SET_NAME = "Gutenberg (PG-19)"


Leo Gao's avatar
Leo Gao committed
101
class PileHackernews(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "HackerNews"."""

    PILE_SET_NAME = "HackerNews"


Leo Gao's avatar
Leo Gao committed
105
class PileNIHExporter(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "NIH ExPorter"."""

    PILE_SET_NAME = "NIH ExPorter"


Leo Gao's avatar
Leo Gao committed
109
class PileOpenSubtitles(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "OpenSubtitles"."""

    PILE_SET_NAME = "OpenSubtitles"


Leo Gao's avatar
Leo Gao committed
113
class PileOpenWebText2(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "OpenWebText2"."""

    PILE_SET_NAME = "OpenWebText2"


Leo Gao's avatar
Leo Gao committed
117
class PilePhilPapers(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "PhilPapers"."""

    PILE_SET_NAME = "PhilPapers"


Leo Gao's avatar
Leo Gao committed
121
class PilePileCc(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "Pile-CC"."""

    PILE_SET_NAME = "Pile-CC"


Leo Gao's avatar
Leo Gao committed
125
class PilePubmedAbstracts(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "PubMed Abstracts"."""

    PILE_SET_NAME = "PubMed Abstracts"


Leo Gao's avatar
Leo Gao committed
129
class PilePubmedCentral(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "PubMed Central"."""

    PILE_SET_NAME = "PubMed Central"


Leo Gao's avatar
Leo Gao committed
133
class PileStackExchange(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "StackExchange"."""

    PILE_SET_NAME = "StackExchange"


Leo Gao's avatar
Leo Gao committed
137
class PileUspto(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "USPTO Backgrounds"."""

    PILE_SET_NAME = "USPTO Backgrounds"


Leo Gao's avatar
Leo Gao committed
141
class PileUbuntuIrc(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "Ubuntu IRC"."""

    PILE_SET_NAME = "Ubuntu IRC"
Jason Phang's avatar
Jason Phang committed
143
144


Leo Gao's avatar
Leo Gao committed
145
class PileWikipedia(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "Wikipedia (en)"."""

    PILE_SET_NAME = "Wikipedia (en)"


Leo Gao's avatar
Leo Gao committed
149
class PileYoutubeSubtitles(PilePerplexityTask):
    """Pile perplexity task over documents whose set name is "YoutubeSubtitles"."""

    PILE_SET_NAME = "YoutubeSubtitles"