pile.py 3.25 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
2
3
4
5
6
7
8
9
10
11
import os

import lm_dataformat
import abc
import numpy as np
from lm_eval.base import rf, PerplexityTask
from ..metrics import mean, matthews_corrcoef, f1_score
from ..utils import general_detokenize
from best_download import download_file


Leo Gao's avatar
Leo Gao committed
12
class PilePerplexityTask(PerplexityTask, abc.ABC):
    """Perplexity task over a single component ("set") of the Pile.

    Subclasses set ``PILE_SET_NAME``. Documents are streamed from the
    validation / test archives and only those whose ``pile_set_name``
    metadata equals ``PILE_SET_NAME`` are yielded.
    """

    VERSION = 0

    # Value compared against each document's ``pile_set_name`` metadata.
    # ``None`` on this base class; every concrete subclass overrides it.
    PILE_SET_NAME = None
    # Local paths of the validation and test archives.
    VAL_PATH = 'data/pile/val.jsonl.zst'
    TEST_PATH = 'data/pile/test.jsonl.zst'

    def download(self):
        """Download the validation and test archives if either one is missing.

        Nothing is done when both ``VAL_PATH`` and ``TEST_PATH`` already
        exist. Otherwise the parent directory of each path is created and
        both ``download_file`` calls are issued, exactly as when the test
        archive alone was missing before.
        """
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set

        if os.path.exists(self.VAL_PATH) and os.path.exists(self.TEST_PATH):
            return

        # todo use new best_download fallback api
        for path in (self.VAL_PATH, self.TEST_PATH):
            parent = os.path.dirname(path)
            # ``os.makedirs("")`` raises, so skip paths with no directory part.
            if parent:
                os.makedirs(parent, exist_ok=True)

        # Third positional argument is the expected checksum of the archive
        # (64 hex digits; presumably SHA-256 -- confirm against best_download).
        download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", self.VAL_PATH, "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
        download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", self.TEST_PATH, "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")

    def _stream_pile_set(self, path):
        """Yield the documents in the archive at ``path`` that belong to this task's set."""
        reader = lm_dataformat.Reader(path)
        for doc, metadata in reader.stream_data(get_meta=True):
            if metadata["pile_set_name"] == self.PILE_SET_NAME:
                yield doc

    def validation_docs(self):
        """Yield this set's documents from the validation archive."""
        yield from self._stream_pile_set(self.VAL_PATH)

    def test_docs(self):
        """Yield this set's documents from the test archive."""
        yield from self._stream_pile_set(self.TEST_PATH)

    def has_validation_docs(self):
        """Return ``True``: a validation split is provided."""
        return True

    def has_test_docs(self):
        """Return ``True``: a test split is provided."""
        return True


Leo Gao's avatar
Leo Gao committed
47
class PileArxiv(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "ArXiv"."""

    PILE_SET_NAME = "ArXiv"


Leo Gao's avatar
Leo Gao committed
51
class PileBooks3(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "Books3"."""

    PILE_SET_NAME = "Books3"


Leo Gao's avatar
Leo Gao committed
55
class PileBookCorpus2(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "BookCorpus2"."""

    PILE_SET_NAME = "BookCorpus2"


Leo Gao's avatar
Leo Gao committed
59
class PileDmMathematics(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "DM Mathematics"."""

    PILE_SET_NAME = "DM Mathematics"


Leo Gao's avatar
Leo Gao committed
63
class PileEnron(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "Enron Emails"."""

    PILE_SET_NAME = "Enron Emails"


Leo Gao's avatar
Leo Gao committed
67
class PileEuroparl(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "EuroParl"."""

    PILE_SET_NAME = "EuroParl"


Leo Gao's avatar
Leo Gao committed
71
class PileFreeLaw(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "FreeLaw"."""

    PILE_SET_NAME = "FreeLaw"


Leo Gao's avatar
Leo Gao committed
75
class PileGithub(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "Github"."""

    PILE_SET_NAME = "Github"


Leo Gao's avatar
Leo Gao committed
79
class PileGutenberg(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "Gutenberg (PG-19)"."""

    PILE_SET_NAME = "Gutenberg (PG-19)"


Leo Gao's avatar
Leo Gao committed
83
class PileHackernews(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "HackerNews"."""

    PILE_SET_NAME = "HackerNews"


Leo Gao's avatar
Leo Gao committed
87
class PileNIHExporter(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "NIH ExPorter"."""

    PILE_SET_NAME = "NIH ExPorter"


Leo Gao's avatar
Leo Gao committed
91
class PileOpenSubtitles(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "OpenSubtitles"."""

    PILE_SET_NAME = "OpenSubtitles"


Leo Gao's avatar
Leo Gao committed
95
class PileOpenWebText2(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "OpenWebText2"."""

    PILE_SET_NAME = "OpenWebText2"


Leo Gao's avatar
Leo Gao committed
99
class PilePhilPapers(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "PhilPapers"."""

    PILE_SET_NAME = "PhilPapers"


Leo Gao's avatar
Leo Gao committed
103
class PilePileCc(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "Pile-CC"."""

    PILE_SET_NAME = "Pile-CC"


Leo Gao's avatar
Leo Gao committed
107
class PilePubmedAbstracts(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "PubMed Abstracts"."""

    PILE_SET_NAME = "PubMed Abstracts"


Leo Gao's avatar
Leo Gao committed
111
class PilePubmedCentral(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "PubMed Central"."""

    PILE_SET_NAME = "PubMed Central"


Leo Gao's avatar
Leo Gao committed
115
class PileStackExchange(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "StackExchange"."""

    PILE_SET_NAME = "StackExchange"


Leo Gao's avatar
Leo Gao committed
119
class PileUspto(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "USPTO Backgrounds"."""

    PILE_SET_NAME = "USPTO Backgrounds"


Leo Gao's avatar
Leo Gao committed
123
class PileUbuntuIrc(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "Ubuntu IRC"."""

    PILE_SET_NAME = "Ubuntu IRC"
Jason Phang's avatar
Jason Phang committed
125
126


Leo Gao's avatar
Leo Gao committed
127
class PileWikipedia(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "Wikipedia (en)"."""

    PILE_SET_NAME = "Wikipedia (en)"


Leo Gao's avatar
Leo Gao committed
131
class PileYoutubeSubtitles(PilePerplexityTask):
    """Pile perplexity task for documents whose set name is "YoutubeSubtitles"."""

    PILE_SET_NAME = "YoutubeSubtitles"