pile.py 3.3 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
2
3
4
5
6
7
8
9
10
11
import os

import lm_dataformat
import abc
import numpy as np
from lm_eval.base import rf, PerplexityTask
from ..metrics import mean, matthews_corrcoef, f1_score
from ..utils import general_detokenize
from best_download import download_file


Leo Gao's avatar
Leo Gao committed
12
class PilePerplexityTask(PerplexityTask, abc.ABC):
    """Base perplexity task over one component of the Pile.

    Subclasses set ``PILE_SET_NAME``. Documents are streamed from the shared
    validation/test archives and yielded only when their ``pile_set_name``
    metadata equals that value.
    """

    VERSION = 1

    # Compared against each record's "pile_set_name" metadata; set by subclasses.
    PILE_SET_NAME = None
    VAL_PATH = 'data/pile/val.jsonl.zst'
    TEST_PATH = 'data/pile/test.jsonl.zst'

    def download(self):
        """Fetch the validation and test archives unless both already exist.

        Both files are (re)requested whenever either one is missing, so a
        missing validation archive is no longer ignored just because the test
        archive is present.
        """
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
        # todo use new best_download fallback api
        archives = (
            (
                "http://eaidata.bmk.sh/data/pile/val.jsonl.zst",
                self.VAL_PATH,
                "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92",
            ),
            (
                "http://eaidata.bmk.sh/data/pile/test.jsonl.zst",
                self.TEST_PATH,
                "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e",
            ),
        )

        if all(os.path.exists(path) for _, path, _ in archives):
            return

        for url, path, checksum in archives:
            # Derive the directory from the configured path so overrides of
            # VAL_PATH / TEST_PATH are honoured.
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            download_file(url, local_file=path, expected_checksum=checksum)

    def _docs_from(self, path):
        """Yield documents in the archive at *path* that belong to this set."""
        reader = lm_dataformat.Reader(path)
        for doc, metadata in reader.stream_data(get_meta=True):
            if metadata["pile_set_name"] == self.PILE_SET_NAME:
                yield doc

    def validation_docs(self):
        """Yield this set's documents from the validation archive."""
        yield from self._docs_from(self.VAL_PATH)

    def test_docs(self):
        """Yield this set's documents from the test archive."""
        yield from self._docs_from(self.TEST_PATH)

    def has_validation_docs(self):
        """Return True: a validation split is always available."""
        return True

    def has_test_docs(self):
        """Return True: a test split is always available."""
        return True


Leo Gao's avatar
Leo Gao committed
46
class PileArxiv(PilePerplexityTask):
    """Pile perplexity task over the "ArXiv" component."""

    PILE_SET_NAME = "ArXiv"


Leo Gao's avatar
Leo Gao committed
50
class PileBooks3(PilePerplexityTask):
    """Pile perplexity task over the "Books3" component."""

    PILE_SET_NAME = "Books3"


Leo Gao's avatar
Leo Gao committed
54
class PileBookCorpus2(PilePerplexityTask):
    """Pile perplexity task over the "BookCorpus2" component."""

    PILE_SET_NAME = "BookCorpus2"


Leo Gao's avatar
Leo Gao committed
58
class PileDmMathematics(PilePerplexityTask):
    """Pile perplexity task over the "DM Mathematics" component."""

    PILE_SET_NAME = "DM Mathematics"


Leo Gao's avatar
Leo Gao committed
62
class PileEnron(PilePerplexityTask):
    """Pile perplexity task over the "Enron Emails" component."""

    PILE_SET_NAME = "Enron Emails"


Leo Gao's avatar
Leo Gao committed
66
class PileEuroparl(PilePerplexityTask):
    """Pile perplexity task over the "EuroParl" component."""

    PILE_SET_NAME = "EuroParl"


Leo Gao's avatar
Leo Gao committed
70
class PileFreeLaw(PilePerplexityTask):
    """Pile perplexity task over the "FreeLaw" component."""

    PILE_SET_NAME = "FreeLaw"


Leo Gao's avatar
Leo Gao committed
74
class PileGithub(PilePerplexityTask):
    """Pile perplexity task over the "Github" component."""

    PILE_SET_NAME = "Github"


Leo Gao's avatar
Leo Gao committed
78
class PileGutenberg(PilePerplexityTask):
    """Pile perplexity task over the "Gutenberg (PG-19)" component."""

    PILE_SET_NAME = "Gutenberg (PG-19)"


Leo Gao's avatar
Leo Gao committed
82
class PileHackernews(PilePerplexityTask):
    """Pile perplexity task over the "HackerNews" component."""

    PILE_SET_NAME = "HackerNews"


Leo Gao's avatar
Leo Gao committed
86
class PileNIHExporter(PilePerplexityTask):
    """Pile perplexity task over the "NIH ExPorter" component."""

    PILE_SET_NAME = "NIH ExPorter"


Leo Gao's avatar
Leo Gao committed
90
class PileOpenSubtitles(PilePerplexityTask):
    """Pile perplexity task over the "OpenSubtitles" component."""

    PILE_SET_NAME = "OpenSubtitles"


Leo Gao's avatar
Leo Gao committed
94
class PileOpenWebText2(PilePerplexityTask):
    """Pile perplexity task over the "OpenWebText2" component."""

    PILE_SET_NAME = "OpenWebText2"


Leo Gao's avatar
Leo Gao committed
98
class PilePhilPapers(PilePerplexityTask):
    """Pile perplexity task over the "PhilPapers" component."""

    PILE_SET_NAME = "PhilPapers"


Leo Gao's avatar
Leo Gao committed
102
class PilePileCc(PilePerplexityTask):
    """Pile perplexity task over the "Pile-CC" component."""

    PILE_SET_NAME = "Pile-CC"


Leo Gao's avatar
Leo Gao committed
106
class PilePubmedAbstracts(PilePerplexityTask):
    """Pile perplexity task over the "PubMed Abstracts" component."""

    PILE_SET_NAME = "PubMed Abstracts"


Leo Gao's avatar
Leo Gao committed
110
class PilePubmedCentral(PilePerplexityTask):
    """Pile perplexity task over the "PubMed Central" component."""

    PILE_SET_NAME = "PubMed Central"


Leo Gao's avatar
Leo Gao committed
114
class PileStackExchange(PilePerplexityTask):
    """Pile perplexity task over the "StackExchange" component."""

    PILE_SET_NAME = "StackExchange"


Leo Gao's avatar
Leo Gao committed
118
class PileUspto(PilePerplexityTask):
    """Pile perplexity task over the "USPTO Backgrounds" component."""

    PILE_SET_NAME = "USPTO Backgrounds"


Leo Gao's avatar
Leo Gao committed
122
class PileUbuntuIrc(PilePerplexityTask):
    """Pile perplexity task over the "Ubuntu IRC" component."""

    PILE_SET_NAME = "Ubuntu IRC"
Jason Phang's avatar
Jason Phang committed
124
125


Leo Gao's avatar
Leo Gao committed
126
class PileWikipedia(PilePerplexityTask):
    """Pile perplexity task over the "Wikipedia (en)" component."""

    PILE_SET_NAME = "Wikipedia (en)"


Leo Gao's avatar
Leo Gao committed
130
class PileYoutubeSubtitles(PilePerplexityTask):
    """Pile perplexity task over the "YoutubeSubtitles" component."""

    PILE_SET_NAME = "YoutubeSubtitles"