pile.py 3.15 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
2
3
4
5
6
7
8
9
10
11
import os

import lm_dataformat
import abc
import numpy as np
from lm_eval.base import rf, PerplexityTask
from ..metrics import mean, matthews_corrcoef, f1_score
from ..utils import general_detokenize
from best_download import download_file


Leo Gao's avatar
Leo Gao committed
12
class PilePerplexityTask(PerplexityTask, abc.ABC):
    """Perplexity task over a single component of the Pile.

    Concrete subclasses set ``PILE_SET_NAME``; only documents whose
    ``pile_set_name`` metadata field equals that value are yielded by
    ``validation_docs`` and ``test_docs``.
    """

    # Value compared against each document's ``pile_set_name`` metadata field.
    PILE_SET_NAME = None
    # Local paths of the validation and test archives.
    VAL_PATH = 'data/pile/val.jsonl.zst'
    TEST_PATH = 'data/pile/test.jsonl.zst'

    def download(self):
        """Fetch the validation and test archives unless already on disk."""
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
        sources = (
            ("https://the-eye.eu/public/AI/pile/val.jsonl.zst", self.VAL_PATH),
            ("https://the-eye.eu/public/AI/pile/test.jsonl.zst", self.TEST_PATH),
        )
        for url, path in sources:
            if os.path.exists(path):
                continue
            # Create the directory the archive actually lives in, so that a
            # subclass overriding VAL_PATH/TEST_PATH still gets a valid target.
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            download_file(url, path)

    def _component_docs(self, path):
        """Yield documents from the archive at ``path`` that belong to this component."""
        reader = lm_dataformat.Reader(path)
        for doc, metadata in reader.stream_data(get_meta=True):
            if metadata["pile_set_name"] == self.PILE_SET_NAME:
                yield doc

    def validation_docs(self):
        """Yield this component's documents from the validation archive."""
        yield from self._component_docs(self.VAL_PATH)

    def test_docs(self):
        """Yield this component's documents from the test archive."""
        yield from self._component_docs(self.TEST_PATH)

    def has_validation_docs(self):
        """Return True: a validation split is always provided."""
        return True

    def has_test_docs(self):
        """Return True: a test split is always provided."""
        return True


Leo Gao's avatar
Leo Gao committed
45
class PileArxiv(PilePerplexityTask):
    """Pile perplexity task for the "ArXiv" component."""

    PILE_SET_NAME = "ArXiv"


Leo Gao's avatar
Leo Gao committed
49
class PileBooks3(PilePerplexityTask):
    """Pile perplexity task for the "Books3" component."""

    PILE_SET_NAME = "Books3"


Leo Gao's avatar
Leo Gao committed
53
class PileBookCorpus2(PilePerplexityTask):
    """Pile perplexity task for the "BookCorpus2" component."""

    PILE_SET_NAME = "BookCorpus2"


Leo Gao's avatar
Leo Gao committed
57
class PileCommonCrawl(PilePerplexityTask):
    """Pile perplexity task for documents tagged "CommonCrawl"."""

    # NOTE(review): PilePileCc filters on "Pile-CC"; confirm that "CommonCrawl"
    # really occurs as a ``pile_set_name`` value, otherwise this task yields
    # no documents.
    PILE_SET_NAME = "CommonCrawl"


Leo Gao's avatar
Leo Gao committed
61
class PileDmMathematics(PilePerplexityTask):
    """Pile perplexity task for the "DM Mathematics" component."""

    PILE_SET_NAME = "DM Mathematics"


Leo Gao's avatar
Leo Gao committed
65
class PileEnron(PilePerplexityTask):
    """Pile perplexity task for the "Enron Emails" component."""

    PILE_SET_NAME = "Enron Emails"


Leo Gao's avatar
Leo Gao committed
69
class PileEuroparl(PilePerplexityTask):
    """Pile perplexity task for the "EuroParl" component."""

    PILE_SET_NAME = "EuroParl"


Leo Gao's avatar
Leo Gao committed
73
class PileFreeLaw(PilePerplexityTask):
    """Pile perplexity task for the "FreeLaw" component."""

    PILE_SET_NAME = "FreeLaw"


Leo Gao's avatar
Leo Gao committed
77
class PileGithub(PilePerplexityTask):
    """Pile perplexity task for the "Github" component."""

    PILE_SET_NAME = "Github"


Leo Gao's avatar
Leo Gao committed
81
class PileGutenberg(PilePerplexityTask):
    """Pile perplexity task for the "Gutenberg (PG-19)" component."""

    PILE_SET_NAME = "Gutenberg (PG-19)"


Leo Gao's avatar
Leo Gao committed
85
class PileHackernews(PilePerplexityTask):
    """Pile perplexity task for the "HackerNews" component."""

    PILE_SET_NAME = "HackerNews"


Leo Gao's avatar
Leo Gao committed
89
class PileNIHExporter(PilePerplexityTask):
    """Pile perplexity task for the "NIH ExPorter" component."""

    PILE_SET_NAME = "NIH ExPorter"


Leo Gao's avatar
Leo Gao committed
93
class PileOpenSubtitles(PilePerplexityTask):
    """Pile perplexity task for the "OpenSubtitles" component."""

    PILE_SET_NAME = "OpenSubtitles"


Leo Gao's avatar
Leo Gao committed
97
class PileOpenWebText2(PilePerplexityTask):
    """Pile perplexity task for the "OpenWebText2" component."""

    PILE_SET_NAME = "OpenWebText2"


Leo Gao's avatar
Leo Gao committed
101
class PilePhilPapers(PilePerplexityTask):
    """Pile perplexity task for the "PhilPapers" component."""

    PILE_SET_NAME = "PhilPapers"


Leo Gao's avatar
Leo Gao committed
105
class PilePileCc(PilePerplexityTask):
    """Pile perplexity task for the "Pile-CC" component."""

    PILE_SET_NAME = "Pile-CC"


Leo Gao's avatar
Leo Gao committed
109
class PilePubmedAbstracts(PilePerplexityTask):
    """Pile perplexity task for the "PubMed Abstracts" component."""

    PILE_SET_NAME = "PubMed Abstracts"


Leo Gao's avatar
Leo Gao committed
113
class PilePubmedCentral(PilePerplexityTask):
    """Pile perplexity task for the "PubMed Central" component."""

    PILE_SET_NAME = "PubMed Central"


Leo Gao's avatar
Leo Gao committed
117
class PileStackExchange(PilePerplexityTask):
    """Pile perplexity task for the "StackExchange" component."""

    PILE_SET_NAME = "StackExchange"


Leo Gao's avatar
Leo Gao committed
121
class PileUspto(PilePerplexityTask):
    """Pile perplexity task for the "USPTO Backgrounds" component."""

    PILE_SET_NAME = "USPTO Backgrounds"


Leo Gao's avatar
Leo Gao committed
125
class PileUbuntuIrc(PilePerplexityTask):
    """Pile perplexity task for the "Ubuntu IRC" component."""

    PILE_SET_NAME = "Ubuntu IRC"
Jason Phang's avatar
Jason Phang committed
127
128


Leo Gao's avatar
Leo Gao committed
129
class PileWikipedia(PilePerplexityTask):
    """Pile perplexity task for the "Wikipedia (en)" component."""

    PILE_SET_NAME = "Wikipedia (en)"


Leo Gao's avatar
Leo Gao committed
133
class PileYoutubeSubtitles(PilePerplexityTask):
    """Pile perplexity task for the "YoutubeSubtitles" component."""

    PILE_SET_NAME = "YoutubeSubtitles"