pile.py 3.2 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
2
3
4
5
6
7
8
9
10
11
import os

import lm_dataformat
import abc
import numpy as np
from lm_eval.base import rf, PerplexityTask
from ..metrics import mean, matthews_corrcoef, f1_score
from ..utils import general_detokenize
from best_download import download_file


Leo Gao's avatar
Leo Gao committed
12
class PilePerplexityTask(PerplexityTask, abc.ABC):
    """Base perplexity task over a single component of the Pile val/test files.

    Subclasses set ``PILE_SET_NAME``; only documents whose ``pile_set_name``
    metadata equals that value are yielded by ``validation_docs`` and
    ``test_docs``.
    """

    VERSION = 0

    # Compared against each document's "pile_set_name" metadata; set by subclasses.
    PILE_SET_NAME = None
    VAL_PATH = 'data/pile/val.jsonl.zst'
    TEST_PATH = 'data/pile/test.jsonl.zst'

    def download(self):
        """Fetch the Pile validation and test archives unless both already exist."""
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
        targets = (
            (
                "https://the-eye.eu/public/AI/pile/val.jsonl.zst",
                self.VAL_PATH,
                "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92",
            ),
            (
                "https://the-eye.eu/public/AI/pile/test.jsonl.zst",
                self.TEST_PATH,
                "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e",
            ),
        )

        # Check BOTH files (via the class attributes, not a hard-coded literal):
        # previously a missing validation file went unnoticed whenever the
        # test file was present.
        if all(os.path.exists(path) for _, path, _ in targets):
            return

        # As before, download_file is invoked for every target once anything
        # is missing. NOTE(review): presumably download_file copes with an
        # already-complete local file (the original relied on this for the
        # validation file) -- confirm against best_download.
        for url, path, checksum in targets:
            parent = os.path.dirname(path)
            if parent:  # os.makedirs("") would raise for a bare filename
                os.makedirs(parent, exist_ok=True)
            download_file(url, path, checksum)

    def _docs_for_component(self, path):
        """Yield documents from the archive at *path* that belong to ``PILE_SET_NAME``."""
        reader = lm_dataformat.Reader(path)
        for doc, metadata in reader.stream_data(get_meta=True):
            if metadata["pile_set_name"] == self.PILE_SET_NAME:
                yield doc

    def validation_docs(self):
        """Yield this component's documents from the validation archive."""
        yield from self._docs_for_component(self.VAL_PATH)

    def test_docs(self):
        """Yield this component's documents from the test archive."""
        yield from self._docs_for_component(self.TEST_PATH)

    def has_validation_docs(self):
        """Return True: a validation archive is provided."""
        return True

    def has_test_docs(self):
        """Return True: a test archive is provided."""
        return True


Leo Gao's avatar
Leo Gao committed
45
class PileArxiv(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``ArXiv``."""

    PILE_SET_NAME = "ArXiv"


Leo Gao's avatar
Leo Gao committed
49
class PileBooks3(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``Books3``."""

    PILE_SET_NAME = "Books3"


Leo Gao's avatar
Leo Gao committed
53
class PileBookCorpus2(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``BookCorpus2``."""

    PILE_SET_NAME = "BookCorpus2"


Leo Gao's avatar
Leo Gao committed
57
class PileDmMathematics(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``DM Mathematics``."""

    PILE_SET_NAME = "DM Mathematics"


Leo Gao's avatar
Leo Gao committed
61
class PileEnron(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``Enron Emails``."""

    PILE_SET_NAME = "Enron Emails"


Leo Gao's avatar
Leo Gao committed
65
class PileEuroparl(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``EuroParl``."""

    PILE_SET_NAME = "EuroParl"


Leo Gao's avatar
Leo Gao committed
69
class PileFreeLaw(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``FreeLaw``."""

    PILE_SET_NAME = "FreeLaw"


Leo Gao's avatar
Leo Gao committed
73
class PileGithub(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``Github``."""

    PILE_SET_NAME = "Github"


Leo Gao's avatar
Leo Gao committed
77
class PileGutenberg(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``Gutenberg (PG-19)``."""

    PILE_SET_NAME = "Gutenberg (PG-19)"


Leo Gao's avatar
Leo Gao committed
81
class PileHackernews(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``HackerNews``."""

    PILE_SET_NAME = "HackerNews"


Leo Gao's avatar
Leo Gao committed
85
class PileNIHExporter(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``NIH ExPorter``."""

    PILE_SET_NAME = "NIH ExPorter"


Leo Gao's avatar
Leo Gao committed
89
class PileOpenSubtitles(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``OpenSubtitles``."""

    PILE_SET_NAME = "OpenSubtitles"


Leo Gao's avatar
Leo Gao committed
93
class PileOpenWebText2(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``OpenWebText2``."""

    PILE_SET_NAME = "OpenWebText2"


Leo Gao's avatar
Leo Gao committed
97
class PilePhilPapers(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``PhilPapers``."""

    PILE_SET_NAME = "PhilPapers"


Leo Gao's avatar
Leo Gao committed
101
class PilePileCc(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``Pile-CC``."""

    PILE_SET_NAME = "Pile-CC"


Leo Gao's avatar
Leo Gao committed
105
class PilePubmedAbstracts(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``PubMed Abstracts``."""

    PILE_SET_NAME = "PubMed Abstracts"


Leo Gao's avatar
Leo Gao committed
109
class PilePubmedCentral(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``PubMed Central``."""

    PILE_SET_NAME = "PubMed Central"


Leo Gao's avatar
Leo Gao committed
113
class PileStackExchange(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``StackExchange``."""

    PILE_SET_NAME = "StackExchange"


Leo Gao's avatar
Leo Gao committed
117
class PileUspto(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``USPTO Backgrounds``."""

    PILE_SET_NAME = "USPTO Backgrounds"


Leo Gao's avatar
Leo Gao committed
121
class PileUbuntuIrc(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``Ubuntu IRC``."""

    PILE_SET_NAME = "Ubuntu IRC"
Jason Phang's avatar
Jason Phang committed
123
124


Leo Gao's avatar
Leo Gao committed
125
class PileWikipedia(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``Wikipedia (en)``."""

    PILE_SET_NAME = "Wikipedia (en)"


Leo Gao's avatar
Leo Gao committed
129
class PileYoutubeSubtitles(PilePerplexityTask):
    """Perplexity task over Pile documents whose ``pile_set_name`` is ``YoutubeSubtitles``."""

    PILE_SET_NAME = "YoutubeSubtitles"