wikitext.py 10.5 KB
Newer Older
Jonathan Tow's avatar
Jonathan Tow committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# NOTE: This is a modified version of https://github.com/huggingface/datasets/blob/master/datasets/wikitext/wikitext.py
# that returns Wiki pages instead of Wiki text line-by-line.
"""WikiText Dataset."""


import os

import datasets


_CITATION = """\
@misc{merity2016pointer,
      title={Pointer Sentinel Mixture Models},
      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
      year={2016},
      eprint={1609.07843},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
"""

_DESCRIPTION = """\
 The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified
 Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike
 License.
"""
_HOMEPAGE = "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/"
_LICENSE = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)"
_DATA_URL = "https://s3.amazonaws.com/research.metamind.io/wikitext"


class WikitextConfig(datasets.BuilderConfig):
    """BuilderConfig for the WikiText datasets.

    Each configuration is pinned to version "1.0.0" and remembers the URL of
    the data it is built from.
    """

    def __init__(self, data_url, **kwargs):
        """Create a WikiText configuration.

        Args:
          data_url: `string`, url to the dataset (word or raw level).
          **kwargs: keyword arguments forwarded to the parent
            `datasets.BuilderConfig` constructor (e.g. `name`, `description`).
        """
        pinned_version = datasets.Version("1.0.0")
        super().__init__(version=pinned_version, **kwargs)
        self.data_url = data_url


class Wikitext(datasets.GeneratorBasedBuilder):
    """Page-level WikiText dataset builder.

    Unlike a line-by-line WikiText loader, every example produced by this
    builder is one whole Wiki page, stored as a single string under the
    "page" feature.
    """

    # Builder-level version. Note that every WikitextConfig additionally pins
    # its own config version to "1.0.0".
    VERSION = datasets.Version("0.1.0")
    BUILDER_CONFIGS = [
        WikitextConfig(
            name="wikitext-103-v1",
            data_url=_DATA_URL + "/" + "wikitext-103-v1.zip",
            description="Word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
        ),
        WikitextConfig(
            name="wikitext-2-v1",
            data_url=_DATA_URL + "/" + "wikitext-2-v1.zip",
            description="Word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
        ),
        WikitextConfig(
            name="wikitext-103-raw-v1",
            data_url=_DATA_URL + "/" + "wikitext-103-raw-v1.zip",
            description="Raw level dataset: the raw tokens before the addition of <unk> tokens. "
            "They should only be used for character level work or for creating newly derived datasets.",
        ),
        WikitextConfig(
            name="wikitext-2-raw-v1",
            data_url=_DATA_URL + "/" + "wikitext-2-raw-v1.zip",
            description="Raw level dataset: the raw tokens before the addition of <unk> tokens. "
            "They should only be used for character level work or for creating newly derived datasets.",
        ),
    ]

    # Config name -> (directory inside the extracted archive, split-file extension).
    # Split files are named "wiki.<split>.<extension>".
    _ARCHIVE_LAYOUT = {
        "wikitext-103-v1": ("wikitext-103", "tokens"),
        "wikitext-2-v1": ("wikitext-2", "tokens"),
        "wikitext-103-raw-v1": ("wikitext-103-raw", "raw"),
        "wikitext-2-raw-v1": ("wikitext-2-raw", "raw"),
    }

    def _info(self):
        """Return the `datasets.DatasetInfo` describing this dataset.

        The only feature is "page": the full text of one Wiki page as a string.
        """
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "page": datasets.Value("string"),
                }
            ),
            # There is no (input, target) pair for this dataset.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive for the active config and return SplitGenerators.

        Args:
          dl_manager: download manager used to fetch and extract
            `self.config.data_url`.

        Returns:
          A list with the TEST, TRAIN and VALIDATION split generators (in that
          order). Each one passes `data_file` (path of the split file) and
          `split` ("test", "train" or "valid") to `_generate_examples`.

        Raises:
          ValueError: if the config name is not one of the known WikiText
            configurations (previously this case silently returned None).
        """
        layout = self._ARCHIVE_LAYOUT.get(self.config.name)
        if layout is None:
            raise ValueError(
                f"Unknown WikiText config name {self.config.name!r}; "
                f"expected one of {sorted(self._ARCHIVE_LAYOUT)}"
            )
        subdir, extension = layout

        archive_root = dl_manager.download_and_extract(self.config.data_url)
        data_dir = os.path.join(archive_root, subdir)

        # (split enum, short split tag used both in the file name and gen_kwargs)
        splits = (
            (datasets.Split.TEST, "test"),
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "valid"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "data_file": os.path.join(data_dir, f"wiki.{tag}.{extension}"),
                    "split": tag,
                },
            )
            for split_name, tag in splits
        ]

    @staticmethod
    def _is_page_title(line):
        """Return True if `line` is a top-level page heading (" = Title = ").

        Sub-headings are written with spaced-out equals signs (" = = Sub = = ");
        collapsing those runs first means they no longer start with "= " and
        are therefore not mistaken for a page title.
        """
        collapsed = line.replace("= = =", "===").replace("= =", "==").strip()
        return collapsed.startswith("= ") and collapsed.endswith(" =")

    def _generate_examples(self, data_file, split):
        """Yield `(key, {"page": text})` examples, one per Wiki page.

        Args:
          data_file: path of the split file to read (UTF-8 text).
          split: short split tag; accepted for interface compatibility with the
            gen_kwargs but not used here.

        Yields:
          Tuples of a running integer key and a dict holding the page text.
          A page starts at a top-level heading line and runs up to (but not
          including) the next one; lines are joined back with "\n". Pages that
          are empty or whitespace-only are skipped, including the last one.
        """
        # Read everything up front so the file is closed before the generator
        # is first suspended.
        with open(data_file, encoding="utf-8") as f:
            lines = f.read().split("\n")

        key = 0
        page_lines = []
        for line in lines:
            if self._is_page_title(line):
                # A new page begins: flush what has been collected so far.
                page = "\n".join(page_lines)
                if page.strip():
                    yield key, {"page": page}
                    key += 1
                page_lines = []
            page_lines.append(line)

        # Flush the trailing page, applying the same blank-page guard as above
        # so an empty file does not produce an empty example.
        page = "\n".join(page_lines)
        if page.strip():
            yield key, {"page": page}