pile.py 3.09 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
2
3
4
5
6
7
8
9
10
11
import os

import lm_dataformat
import abc
import numpy as np
from lm_eval.base import rf, PerplexityTask
from ..metrics import mean, matthews_corrcoef, f1_score
from ..utils import general_detokenize
from best_download import download_file


Leo Gao's avatar
Leo Gao committed
12
class PilePerplexityTask(PerplexityTask, abc.ABC):
    """Base perplexity task over one named subset of the Pile.

    Concrete subclasses set ``PILE_SET_NAME``. Validation and test documents
    are read from the shared archives at ``VAL_PATH`` / ``TEST_PATH`` and
    only entries whose ``pile_set_name`` metadata equals ``PILE_SET_NAME``
    are yielded.
    """

    VERSION = 0

    # Compared against each document's "pile_set_name" metadata field.
    # None on this base class; every concrete subclass overrides it.
    PILE_SET_NAME = None
    VAL_PATH = 'data/pile/val.jsonl.zst'
    TEST_PATH = 'data/pile/test.jsonl.zst'

    def download(self):
        """Download the validation and test archives that are not on disk yet."""
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
        os.makedirs("data/pile/", exist_ok=True)
        sources = (
            ("https://the-eye.eu/public/AI/pile/val.jsonl.zst", self.VAL_PATH),
            ("https://the-eye.eu/public/AI/pile/test.jsonl.zst", self.TEST_PATH),
        )
        for url, path in sources:
            # Skip anything already present so repeated calls do not re-fetch.
            if not os.path.exists(path):
                download_file(url, path)

    def _docs_for_set(self, path):
        """Yield the documents in the archive at ``path`` that belong to this set.

        The whole archive is streamed; a document is kept only when its
        ``pile_set_name`` metadata equals ``self.PILE_SET_NAME``.
        """
        reader = lm_dataformat.Reader(path)
        for doc, metadata in reader.stream_data(get_meta=True):
            if metadata["pile_set_name"] == self.PILE_SET_NAME:
                yield doc

    def validation_docs(self):
        """Yield this set's documents from the validation archive."""
        yield from self._docs_for_set(self.VAL_PATH)

    def test_docs(self):
        """Yield this set's documents from the test archive."""
        yield from self._docs_for_set(self.TEST_PATH)

    def has_validation_docs(self):
        """Return True: a validation split is always provided."""
        return True

    def has_test_docs(self):
        """Return True: a test split is always provided."""
        return True


Leo Gao's avatar
Leo Gao committed
46
class PileArxiv(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "ArXiv"."""

    PILE_SET_NAME = "ArXiv"


Leo Gao's avatar
Leo Gao committed
50
class PileBooks3(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "Books3"."""

    PILE_SET_NAME = "Books3"


Leo Gao's avatar
Leo Gao committed
54
class PileBookCorpus2(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "BookCorpus2"."""

    PILE_SET_NAME = "BookCorpus2"


Leo Gao's avatar
Leo Gao committed
58
class PileDmMathematics(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "DM Mathematics"."""

    PILE_SET_NAME = "DM Mathematics"


Leo Gao's avatar
Leo Gao committed
62
class PileEnron(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "Enron Emails"."""

    PILE_SET_NAME = "Enron Emails"


Leo Gao's avatar
Leo Gao committed
66
class PileEuroparl(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "EuroParl"."""

    PILE_SET_NAME = "EuroParl"


Leo Gao's avatar
Leo Gao committed
70
class PileFreeLaw(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "FreeLaw"."""

    PILE_SET_NAME = "FreeLaw"


Leo Gao's avatar
Leo Gao committed
74
class PileGithub(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "Github"."""

    PILE_SET_NAME = "Github"


Leo Gao's avatar
Leo Gao committed
78
class PileGutenberg(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "Gutenberg (PG-19)"."""

    PILE_SET_NAME = "Gutenberg (PG-19)"


Leo Gao's avatar
Leo Gao committed
82
class PileHackernews(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "HackerNews"."""

    PILE_SET_NAME = "HackerNews"


Leo Gao's avatar
Leo Gao committed
86
class PileNIHExporter(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "NIH ExPorter"."""

    PILE_SET_NAME = "NIH ExPorter"


Leo Gao's avatar
Leo Gao committed
90
class PileOpenSubtitles(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "OpenSubtitles"."""

    PILE_SET_NAME = "OpenSubtitles"


Leo Gao's avatar
Leo Gao committed
94
class PileOpenWebText2(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "OpenWebText2"."""

    PILE_SET_NAME = "OpenWebText2"


Leo Gao's avatar
Leo Gao committed
98
class PilePhilPapers(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "PhilPapers"."""

    PILE_SET_NAME = "PhilPapers"


Leo Gao's avatar
Leo Gao committed
102
class PilePileCc(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "Pile-CC"."""

    PILE_SET_NAME = "Pile-CC"


Leo Gao's avatar
Leo Gao committed
106
class PilePubmedAbstracts(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "PubMed Abstracts"."""

    PILE_SET_NAME = "PubMed Abstracts"


Leo Gao's avatar
Leo Gao committed
110
class PilePubmedCentral(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "PubMed Central"."""

    PILE_SET_NAME = "PubMed Central"


Leo Gao's avatar
Leo Gao committed
114
class PileStackExchange(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "StackExchange"."""

    PILE_SET_NAME = "StackExchange"


Leo Gao's avatar
Leo Gao committed
118
class PileUspto(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "USPTO Backgrounds"."""

    PILE_SET_NAME = "USPTO Backgrounds"


Leo Gao's avatar
Leo Gao committed
122
class PileUbuntuIrc(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "Ubuntu IRC"."""

    PILE_SET_NAME = "Ubuntu IRC"
Jason Phang's avatar
Jason Phang committed
124
125


Leo Gao's avatar
Leo Gao committed
126
class PileWikipedia(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "Wikipedia (en)"."""

    PILE_SET_NAME = "Wikipedia (en)"


Leo Gao's avatar
Leo Gao committed
130
class PileYoutubeSubtitles(PilePerplexityTask):
    """Pile perplexity task over documents whose ``pile_set_name`` is "YoutubeSubtitles"."""

    PILE_SET_NAME = "YoutubeSubtitles"