pile.py 3.12 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
2
3
4
5
6
7
8
9
10
11
import os

import lm_dataformat
import abc
import numpy as np
from lm_eval.base import rf, PerplexityTask
from ..metrics import mean, matthews_corrcoef, f1_score
from ..utils import general_detokenize
from best_download import download_file


Leo Gao's avatar
Leo Gao committed
12
class PilePerplexityTask(PerplexityTask, abc.ABC):
    """Base class for perplexity tasks over one component of the Pile.

    Concrete subclasses set ``PILE_SET_NAME``; the validation and test
    document streams then yield only the documents whose
    ``pile_set_name`` metadata field equals that value.
    """

    VERSION = 0

    # Compared against each document's ``pile_set_name`` metadata field.
    # ``None`` here; concrete subclasses override it.
    PILE_SET_NAME = None
    # Local paths the validation/test archives are downloaded to and read from.
    VAL_PATH = 'data/pile/val.jsonl.zst'
    TEST_PATH = 'data/pile/test.jsonl.zst'

    def download(self):
        """Fetch the Pile validation and test archives to their local paths."""
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
        # Create the parent directory of each target path rather than a
        # hard-coded one, so overridden paths still get a directory.
        for target in (self.VAL_PATH, self.TEST_PATH):
            parent = os.path.dirname(target)
            if parent:
                os.makedirs(parent, exist_ok=True)
        # The third argument is presumably the expected SHA-256 of the
        # download -- confirm against best_download.
        download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", self.VAL_PATH, "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
        download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", self.TEST_PATH, "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")

    def _pile_docs(self, path):
        """Yield documents from ``path`` that belong to ``PILE_SET_NAME``."""
        rdr = lm_dataformat.Reader(path)
        for doc, metadata in rdr.stream_data(get_meta=True):
            if metadata["pile_set_name"] == self.PILE_SET_NAME:
                yield doc

    def validation_docs(self):
        """Yield this component's documents from the validation archive."""
        yield from self._pile_docs(self.VAL_PATH)

    def test_docs(self):
        """Yield this component's documents from the test archive."""
        yield from self._pile_docs(self.TEST_PATH)

    def has_validation_docs(self):
        """Return True: a validation split is always available."""
        return True

    def has_test_docs(self):
        """Return True: a test split is always available."""
        return True


Leo Gao's avatar
Leo Gao committed
44
class PileArxiv(PilePerplexityTask):
    """Pile perplexity task selecting the "ArXiv" component."""

    PILE_SET_NAME = "ArXiv"


Leo Gao's avatar
Leo Gao committed
48
class PileBooks3(PilePerplexityTask):
    """Pile perplexity task selecting the "Books3" component."""

    PILE_SET_NAME = "Books3"


Leo Gao's avatar
Leo Gao committed
52
class PileBookCorpus2(PilePerplexityTask):
    """Pile perplexity task selecting the "BookCorpus2" component."""

    PILE_SET_NAME = "BookCorpus2"


Leo Gao's avatar
Leo Gao committed
56
class PileDmMathematics(PilePerplexityTask):
    """Pile perplexity task selecting the "DM Mathematics" component."""

    PILE_SET_NAME = "DM Mathematics"


Leo Gao's avatar
Leo Gao committed
60
class PileEnron(PilePerplexityTask):
    """Pile perplexity task selecting the "Enron Emails" component."""

    PILE_SET_NAME = "Enron Emails"


Leo Gao's avatar
Leo Gao committed
64
class PileEuroparl(PilePerplexityTask):
    """Pile perplexity task selecting the "EuroParl" component."""

    PILE_SET_NAME = "EuroParl"


Leo Gao's avatar
Leo Gao committed
68
class PileFreeLaw(PilePerplexityTask):
    """Pile perplexity task selecting the "FreeLaw" component."""

    PILE_SET_NAME = "FreeLaw"


Leo Gao's avatar
Leo Gao committed
72
class PileGithub(PilePerplexityTask):
    """Pile perplexity task selecting the "Github" component."""

    PILE_SET_NAME = "Github"


Leo Gao's avatar
Leo Gao committed
76
class PileGutenberg(PilePerplexityTask):
    """Pile perplexity task selecting the "Gutenberg (PG-19)" component."""

    PILE_SET_NAME = "Gutenberg (PG-19)"


Leo Gao's avatar
Leo Gao committed
80
class PileHackernews(PilePerplexityTask):
    """Pile perplexity task selecting the "HackerNews" component."""

    PILE_SET_NAME = "HackerNews"


Leo Gao's avatar
Leo Gao committed
84
class PileNIHExporter(PilePerplexityTask):
    """Pile perplexity task selecting the "NIH ExPorter" component."""

    PILE_SET_NAME = "NIH ExPorter"


Leo Gao's avatar
Leo Gao committed
88
class PileOpenSubtitles(PilePerplexityTask):
    """Pile perplexity task selecting the "OpenSubtitles" component."""

    PILE_SET_NAME = "OpenSubtitles"


Leo Gao's avatar
Leo Gao committed
92
class PileOpenWebText2(PilePerplexityTask):
    """Pile perplexity task selecting the "OpenWebText2" component."""

    PILE_SET_NAME = "OpenWebText2"


Leo Gao's avatar
Leo Gao committed
96
class PilePhilPapers(PilePerplexityTask):
    """Pile perplexity task selecting the "PhilPapers" component."""

    PILE_SET_NAME = "PhilPapers"


Leo Gao's avatar
Leo Gao committed
100
class PilePileCc(PilePerplexityTask):
    """Pile perplexity task selecting the "Pile-CC" component."""

    PILE_SET_NAME = "Pile-CC"


Leo Gao's avatar
Leo Gao committed
104
class PilePubmedAbstracts(PilePerplexityTask):
    """Pile perplexity task selecting the "PubMed Abstracts" component."""

    PILE_SET_NAME = "PubMed Abstracts"


Leo Gao's avatar
Leo Gao committed
108
class PilePubmedCentral(PilePerplexityTask):
    """Pile perplexity task selecting the "PubMed Central" component."""

    PILE_SET_NAME = "PubMed Central"


Leo Gao's avatar
Leo Gao committed
112
class PileStackExchange(PilePerplexityTask):
    """Pile perplexity task selecting the "StackExchange" component."""

    PILE_SET_NAME = "StackExchange"


Leo Gao's avatar
Leo Gao committed
116
class PileUspto(PilePerplexityTask):
    """Pile perplexity task selecting the "USPTO Backgrounds" component."""

    PILE_SET_NAME = "USPTO Backgrounds"


Leo Gao's avatar
Leo Gao committed
120
class PileUbuntuIrc(PilePerplexityTask):
    """Pile perplexity task selecting the "Ubuntu IRC" component."""

    PILE_SET_NAME = "Ubuntu IRC"
Jason Phang's avatar
Jason Phang committed
122
123


Leo Gao's avatar
Leo Gao committed
124
class PileWikipedia(PilePerplexityTask):
    """Pile perplexity task selecting the "Wikipedia (en)" component."""

    PILE_SET_NAME = "Wikipedia (en)"


Leo Gao's avatar
Leo Gao committed
128
class PileYoutubeSubtitles(PilePerplexityTask):
    """Pile perplexity task selecting the "YoutubeSubtitles" component."""

    PILE_SET_NAME = "YoutubeSubtitles"