"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf

bzantium's avatar
bzantium committed
5
The WikiText language modeling dataset is a collection of over 100 million tokens
6
7
8
9
10
11
extracted from the set of verified Good and Featured articles on Wikipedia.

NOTE: This `Task` is based on WikiText-2.

Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re

from lm_eval.base import PerplexityTask

_CITATION = """
@misc{merity2016pointer,
bzantium's avatar
bzantium committed
18
    title={Pointer Sentinel Mixture Models},
19
20
21
22
23
24
25
26
27
    author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
    year={2016},
    eprint={1609.07843},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
"""


def wikitext_detokenizer(string):
    """Undo WikiText's Moses-style tokenization artifacts in *string*.

    Re-attaches contractions, collapses ``@-@``/``@,@``/``@.@`` number
    separators, removes spaces before punctuation and inside bracket/quote
    pairs, and restores ``==`` section-heading markers. Replacement order
    matches the canonical detokenizer used for WikiText perplexity eval.
    """
    text = string
    # contractions
    text = text.replace("s '", "s'")
    # NOTE(review): this pattern matches literal slashes, so it is almost
    # always a no-op — kept byte-identical for parity with upstream.
    text = re.sub(r"/' [0-9]/", r"/'[0-9]/", text)
    # number separators: the tokenizer wraps them as " @X@ "
    for sep in ("-", ",", "."):
        text = text.replace(" @" + sep + "@ ", sep)
    # punctuation: drop the space before the mark, keep the one after
    for mark in (":", ";", ".", "!", "?", ","):
        text = text.replace(" " + mark + " ", mark + " ")
    # brackets / quotes: trim interior padding
    for pattern, repl in (
        (r"\(\s*([^\)]*?)\s*\)", r"(\1)"),
        (r"\[\s*([^\]]*?)\s*\]", r"[\1]"),
        (r"{\s*([^}]*?)\s*}", r"{\1}"),
        (r"\"\s*([^\"]*?)\s*\"", r'"\1"'),
        (r"'\s*([^']*?)\s*'", r"'\1'"),
    ):
        text = re.sub(pattern, repl, text)
    # miscellaneous: heading markers, degree sign, line-edge spaces, etc.
    for old, new in (
        ("= = = =", "===="),
        ("= = =", "==="),
        ("= =", "=="),
        (" " + chr(176) + " ", chr(176)),
        (" \n", "\n"),
        ("\n ", "\n"),
        (" N ", " 1 "),
        (" 's", "'s"),
    ):
        text = text.replace(old, new)

    return text


class WikiText(PerplexityTask):
    """Word-level perplexity task over the WikiText-2 (raw) dataset.

    Documents are full Wikipedia pages (document-level variant of the
    dataset); targets are detokenized before scoring so perplexity is
    computed on natural text rather than Moses-tokenized text.
    """

    VERSION = 1
    # Document-level repackaging of WikiText, one record per article.
    DATASET_PATH = "EleutherAI/wikitext_document_level"
    DATASET_NAME = "wikitext-2-raw-v1"

    def has_training_docs(self):
        # All three canonical splits ship with this dataset.
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        """Lazily yield the raw page text of each training document."""
        return map(self._process_doc, self.dataset["train"])

    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def test_docs(self):
        return map(self._process_doc, self.dataset["test"])

    def _process_doc(self, doc):
        # Each record stores the full article text under the "page" key.
        return doc["page"]

    def doc_to_target(self, doc):
        # Undo WikiText tokenization artifacts before computing perplexity.
        return wikitext_detokenizer(doc)

    def should_decontaminate(self):
        return True

    def count_words(self, doc):
        # count number of words in *original doc before detokenization*
        return len(re.split(r"\s+", doc))