lambada_multilingual.py 2.07 KB
Newer Older
1
"""
The LAMBADA (OpenAI) dataset: Word prediction requiring a broad discourse context
https://arxiv.org/pdf/1606.06031.pdf

The LAMBADA OpenAI dataset machine-translated to other languages.
LAMBADA is a dataset to evaluate the capabilities of computational models for text
understanding by means of a word prediction task. LAMBADA is a collection of narrative
passages sharing the characteristic that human subjects are able to guess their last
word if they are exposed to the whole passage, but not if they only see the last
sentence preceding the target word. To succeed on LAMBADA, computational models
cannot simply rely on local context, but must be able to keep track of information
in the broader discourse.

Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI

Reference (OpenAI): https://github.com/openai/gpt-2/issues/131#issuecomment-497136199
"""
jon-tow's avatar
jon-tow committed
18
from .lambada import LambadaOpenAI
sdtblck's avatar
sdtblck committed
19

20

21
22
_CITATION = """
@misc{
Fabrizio Milo's avatar
Fabrizio Milo committed
23
    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
24
25
26
27
28
29
30
31
32
    title={The LAMBADA dataset},
    DOI={10.5281/zenodo.2630551},
    publisher={Zenodo},
    year={2016},
    month={Aug}
}
"""


jon-tow's avatar
jon-tow committed
33
class LambadaOpenAIMultilingualEnglish(LambadaOpenAI):
    """English ("en") variant of the multilingual LAMBADA (OpenAI) task.

    The class body only sets the two attributes below; everything else is
    inherited from ``LambadaOpenAI``.
    """

    VERSION = 0
    # Language code; ``construct_tasks`` appends it to "lambada_openai_mt_"
    # to form the task name.
    # NOTE(review): presumably also the dataset configuration name read by
    # the base class -- confirm in ``.lambada``.
    DATASET_NAME = "en"
Jonathan Tow's avatar
Jonathan Tow committed
36

sdtblck's avatar
sdtblck committed
37

jon-tow's avatar
jon-tow committed
38
39
class LambadaOpenAIMultilingualFrench(LambadaOpenAI):
    """French ("fr") variant of the multilingual LAMBADA (OpenAI) task.

    The class body only sets the two attributes below; everything else is
    inherited from ``LambadaOpenAI``.
    """

    VERSION = 0
    # Language code; ``construct_tasks`` appends it to "lambada_openai_mt_"
    # to form the task name.
    # NOTE(review): presumably also the dataset configuration name read by
    # the base class -- confirm in ``.lambada``.
    DATASET_NAME = "fr"
Jonathan Tow's avatar
Jonathan Tow committed
41

sdtblck's avatar
sdtblck committed
42

jon-tow's avatar
jon-tow committed
43
44
class LambadaOpenAIMultilingualGerman(LambadaOpenAI):
    """German ("de") variant of the multilingual LAMBADA (OpenAI) task.

    The class body only sets the two attributes below; everything else is
    inherited from ``LambadaOpenAI``.
    """

    VERSION = 0
    # Language code; ``construct_tasks`` appends it to "lambada_openai_mt_"
    # to form the task name.
    # NOTE(review): presumably also the dataset configuration name read by
    # the base class -- confirm in ``.lambada``.
    DATASET_NAME = "de"
Jonathan Tow's avatar
Jonathan Tow committed
46

sdtblck's avatar
sdtblck committed
47

jon-tow's avatar
jon-tow committed
48
49
class LambadaOpenAIMultilingualItalian(LambadaOpenAI):
    """Italian ("it") variant of the multilingual LAMBADA (OpenAI) task.

    The class body only sets the two attributes below; everything else is
    inherited from ``LambadaOpenAI``.
    """

    VERSION = 0
    # Language code; ``construct_tasks`` appends it to "lambada_openai_mt_"
    # to form the task name.
    # NOTE(review): presumably also the dataset configuration name read by
    # the base class -- confirm in ``.lambada``.
    DATASET_NAME = "it"
Jonathan Tow's avatar
Jonathan Tow committed
51

sdtblck's avatar
sdtblck committed
52

jon-tow's avatar
jon-tow committed
53
54
class LambadaOpenAIMultilingualSpanish(LambadaOpenAI):
    """Spanish ("es") variant of the multilingual LAMBADA (OpenAI) task.

    The class body only sets the two attributes below; everything else is
    inherited from ``LambadaOpenAI``.
    """

    VERSION = 0
    # Language code; ``construct_tasks`` appends it to "lambada_openai_mt_"
    # to form the task name.
    # NOTE(review): presumably also the dataset configuration name read by
    # the base class -- confirm in ``.lambada``.
    DATASET_NAME = "es"
Jonathan Tow's avatar
Jonathan Tow committed
56
57


Fabrizio Milo's avatar
Fabrizio Milo committed
58
# Every language-specific task class defined in this module, in the order
# ``construct_tasks`` iterates over them when building the task mapping.
LANG_CLASSES = [
    LambadaOpenAIMultilingualEnglish,
    LambadaOpenAIMultilingualFrench,
    LambadaOpenAIMultilingualGerman,
    LambadaOpenAIMultilingualItalian,
    LambadaOpenAIMultilingualSpanish,
]
sdtblck's avatar
sdtblck committed
65
66


sdtblck's avatar
sdtblck committed
67
68
def construct_tasks() -> dict:
    """Build the task-name -> task-class mapping for every language variant.

    Returns:
        A new dict with one entry per class in ``LANG_CLASSES``. Each key is
        ``"lambada_openai_mt_"`` followed by the class's ``DATASET_NAME``
        (e.g. ``"lambada_openai_mt_en"``); each value is the class itself,
        not an instance.
    """
    # If two classes shared a DATASET_NAME the later one would win, exactly
    # as with sequential assignment into a dict.
    return {
        f"lambada_openai_mt_{lang_class.DATASET_NAME}": lang_class
        for lang_class in LANG_CLASSES
    }