# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: Address all TODOs and remove all explanatory comments
"""LAMBADA (OpenAI) dataset."""


import json

import datasets


_CITATION = """\
@misc{
Fabrizio Milo's avatar
Fabrizio Milo committed
25
    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
Jonathan Tow's avatar
Jonathan Tow committed
26
27
28
29
30
31
32
33
34
    title={The LAMBADA dataset},
    DOI={10.5281/zenodo.2630551},
    publisher={Zenodo},
    year={2016},
    month={Aug}
}
"""

_DESCRIPTION = """\
jon-tow's avatar
jon-tow committed
35
36
37
38
39
40
41
42
43
The LAMBADA dataset as processed by OpenAI. It is used to evaluate the capabilities
of computational models for text understanding by means of a word prediction task.
LAMBADA is a collection of narrative texts sharing the characteristic that human subjects
are able to guess their last word if they are exposed to the whole text, but not
if they only see the last sentence preceding the target word. To succeed on LAMBADA,
computational models cannot simply rely on local context, but must be able to keep track
of information in the broader discourse.

Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199
Jonathan Tow's avatar
Jonathan Tow committed
44
45
46
47
48
49
50
51
"""

# Landing page of the original LAMBADA release on Zenodo.
_HOMEPAGE = "https://zenodo.org/record/2630551#.X4Xzn5NKjUI"

# TODO: Add the licence for the dataset here if you can find it
# (left empty so downstream tooling shows "unknown" rather than a wrong licence).
_LICENSE = ""

_URLS = {
jon-tow's avatar
jon-tow committed
52
    "default": "https://openaipublic.blob.core.windows.net/gpt-2/data/lambada_test.jsonl",
Jonathan Tow's avatar
Jonathan Tow committed
53
54
55
56
57
58
59
60
    "en": "http://eaidata.bmk.sh/data/lambada_test_en.jsonl",
    "fr": "http://eaidata.bmk.sh/data/lambada_test_fr.jsonl",
    "de": "http://eaidata.bmk.sh/data/lambada_test_de.jsonl",
    "it": "http://eaidata.bmk.sh/data/lambada_test_it.jsonl",
    "es": "http://eaidata.bmk.sh/data/lambada_test_es.jsonl",
}


class LambadaOpenAI(datasets.GeneratorBasedBuilder):
    """LAMBADA is a dataset to evaluate the capabilities of computational models for text understanding by means of a word prediction task."""

    VERSION = datasets.Version("0.0.1")

    # One config per entry in `_URLS`: "default" is the original OpenAI
    # pre-processed English file; the rest are translated variants.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="default",
            version=VERSION,
            description="Pre-processed English LAMBADA dataset from OpenAI",
        ),
        datasets.BuilderConfig(
            name="en",
            version=VERSION,
            description="The English translated LAMBADA OpenAI dataset",
        ),
        datasets.BuilderConfig(
            name="fr",
            version=VERSION,
            description="The French translated LAMBADA OpenAI dataset",
        ),
        datasets.BuilderConfig(
            name="de",
            version=VERSION,
            description="The German translated LAMBADA OpenAI dataset",
        ),
        datasets.BuilderConfig(
            name="it",
            version=VERSION,
            description="The Italian translated LAMBADA OpenAI dataset",
        ),
        datasets.BuilderConfig(
            name="es",
            version=VERSION,
            description="The Spanish translated LAMBADA OpenAI dataset",
        ),
    ]

    DEFAULT_CONFIG_NAME = "default"

    def _info(self):
        """Return the `DatasetInfo` describing the single-column schema.

        Each example is one JSON line with a single "text" field holding a
        full LAMBADA passage (context plus target word).
        """
        features = datasets.Features(
            {
                "text": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            # Combine the shared dataset summary with the per-config blurb.
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the JSONL file for the active config.

        LAMBADA (OpenAI) ships only a test file; it is exposed here as a
        single VALIDATION split, matching common evaluation-harness usage.
        """
        urls = _URLS[self.config.name]
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir,
                    "split": "validation",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield `(key, example)` pairs, one per JSON line.

        `split` is unused but kept because `_split_generators` passes it in
        `gen_kwargs`. The line index serves as the (unique) example key.
        """
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                data = json.loads(row)
                yield key, {"text": data["text"]}