"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf

The WikiText language modeling dataset is a collection of over 100 million tokens
extracted from the set of verified Good and Featured articles on Wikipedia.

NOTE: This `Task` is based on WikiText-2.

Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re
from lm_eval import utils
from lm_eval.api.task import PerplexityTask


_CITATION = """
@misc{merity2016pointer,
Fabrizio Milo's avatar
Fabrizio Milo committed
20
    title={Pointer Sentinel Mixture Models},
21
22
23
24
25
26
27
28
29
    author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
    year={2016},
    eprint={1609.07843},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
"""


def wikitext_detokenizer(string):
    """Undo WikiText's space-padded tokenization so the text reads naturally.

    Applies, in this fixed order: contraction fixes, removal of the ``@X@``
    number-separator guards, punctuation re-spacing, tightening of paired
    delimiters (brackets/quotes), and miscellaneous cleanups (heading markers,
    degree sign, newline padding, the ``N`` number placeholder).
    """
    # contractions
    string = string.replace("s '", "s'")
    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)

    # number separators: " @-@ " -> "-", " @,@ " -> ",", " @.@ " -> "."
    for sep in ("-", ",", "."):
        string = string.replace(" @" + sep + "@ ", sep)

    # punctuation: drop the space before each mark, keep the one after
    for mark in (":", ";", ".", "!", "?", ","):
        string = string.replace(" " + mark + " ", mark + " ")

    # paired delimiters: strip whitespace padding just inside each pair
    for pattern, repl in (
        (r"\(\s*([^\)]*?)\s*\)", r"(\1)"),
        (r"\[\s*([^\]]*?)\s*\]", r"[\1]"),
        (r"{\s*([^}]*?)\s*}", r"{\1}"),
        (r"\"\s*([^\"]*?)\s*\"", r'"\1"'),
        (r"'\s*([^']*?)\s*'", r"'\1'"),
    ):
        string = re.sub(pattern, repl, string)

    # miscellaneous cleanups; longest heading marker first so "= = = ="
    # is not mangled by the shorter patterns
    for old, new in (
        ("= = = =", "===="),
        ("= = =", "==="),
        ("= =", "=="),
        (" " + chr(176) + " ", chr(176)),
        (" \n", "\n"),
        ("\n ", "\n"),
        (" N ", " 1 "),
        (" 's", "'s"),
    ):
        string = string.replace(old, new)

    return string

@utils.register_task
class WikiText(PerplexityTask):
    """Word-level perplexity task over the WikiText-2 raw, document-level corpus.

    Each document is one full Wikipedia article ("page"); targets are the
    detokenized article text, while word counts for perplexity normalization
    are taken on the original (still-tokenized) text.
    """

    VERSION = "2.0"
    TASK_NAME = "wikitext"
    DATASET_PATH = "EleutherAI/wikitext_document_level"
    DATASET_NAME = "wikitext-2-raw-v1"

    def has_training_docs(self):
        # The dataset ships a train split.
        return True

    def has_validation_docs(self):
        # The dataset ships a validation split.
        return True

    def has_test_docs(self):
        # The dataset ships a test split.
        return True

    def training_docs(self):
        # Lazily yield the raw page text of every training document.
        return (self._process_doc(doc) for doc in self.dataset["train"])

    def validation_docs(self):
        # Lazily yield the raw page text of every validation document.
        return (self._process_doc(doc) for doc in self.dataset["validation"])

    def test_docs(self):
        # Lazily yield the raw page text of every test document.
        return (self._process_doc(doc) for doc in self.dataset["test"])

    def _process_doc(self, doc):
        # Each dataset row is a dict; the full article text lives under "page".
        return doc["page"]

    def doc_to_target(self, doc):
        # Perplexity is scored against the detokenized article text.
        return wikitext_detokenizer(doc)

    def should_decontaminate(self):
        # Documents participate in decontamination checks.
        return True

    def count_words(self, doc):
        # Count words in the *original* doc, before detokenization, so the
        # normalization matches the standard WikiText token accounting.
        pieces = re.split(r"\s+", doc)
        return len(pieces)