lambada_multilingual.py 1.86 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
"""
The LAMBADA dataset: Word prediction requiring a broad discourse context
https://arxiv.org/pdf/1606.06031.pdf

The LAMBADA dataset machine-translated to other languages.
LAMBADA is a dataset to evaluate the capabilities of computational models for text
understanding by means of a word prediction task. LAMBADA is a collection of narrative
passages sharing the characteristic that human subjects are able to guess their last
word if they are exposed to the whole passage, but not if they only see the last
sentence preceding the target word. To succeed on LAMBADA, computational models
cannot simply rely on local context, but must be able to keep track of information
in the broader discourse.

Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
"""
sdtblck's avatar
sdtblck committed
16
17
from . import lambada

18

19
20
# BibTeX entry for the LAMBADA dataset release (Zenodo DOI 10.5281/zenodo.2630551).
# The literal must contain only the BibTeX text: anything else placed between
# the triple quotes becomes part of the citation string at runtime.
_CITATION = """
@misc{
    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
    title={The LAMBADA dataset},
    DOI={10.5281/zenodo.2630551},
    publisher={Zenodo},
    year={2016},
    month={Aug}
}
"""


sdtblck's avatar
sdtblck committed
31
32
class MultilingualLAMBADA(lambada.LAMBADA):
    """Common base for the per-language LAMBADA task classes in this module.

    Everything except ``VERSION`` is inherited from ``lambada.LAMBADA``; the
    language-specific subclasses below only set ``DATASET_NAME``.
    """

    # NOTE(review): presumably the task revision number read by the base
    # class / harness -- confirm against lambada.LAMBADA.
    VERSION = 0
Jonathan Tow's avatar
Jonathan Tow committed
33

sdtblck's avatar
sdtblck committed
34

sdtblck's avatar
sdtblck committed
35
class MultilingualLAMBADAEN(MultilingualLAMBADA):
    """Multilingual LAMBADA task bound to the ``"en"`` dataset name."""

    # NOTE(review): presumably consumed by the inherited task machinery to
    # pick the dataset configuration -- confirm against the base class.
    DATASET_NAME = "en"
Jonathan Tow's avatar
Jonathan Tow committed
37

sdtblck's avatar
sdtblck committed
38
39

class MultilingualLAMBADAFR(MultilingualLAMBADA):
    """Multilingual LAMBADA task bound to the ``"fr"`` dataset name."""

    # NOTE(review): presumably consumed by the inherited task machinery to
    # pick the dataset configuration -- confirm against the base class.
    DATASET_NAME = "fr"
Jonathan Tow's avatar
Jonathan Tow committed
41

sdtblck's avatar
sdtblck committed
42
43

class MultilingualLAMBADADE(MultilingualLAMBADA):
    """Multilingual LAMBADA task bound to the ``"de"`` dataset name."""

    # NOTE(review): presumably consumed by the inherited task machinery to
    # pick the dataset configuration -- confirm against the base class.
    DATASET_NAME = "de"
Jonathan Tow's avatar
Jonathan Tow committed
45

sdtblck's avatar
sdtblck committed
46
47

class MultilingualLAMBADAIT(MultilingualLAMBADA):
    """Multilingual LAMBADA task bound to the ``"it"`` dataset name."""

    # NOTE(review): presumably consumed by the inherited task machinery to
    # pick the dataset configuration -- confirm against the base class.
    DATASET_NAME = "it"
Jonathan Tow's avatar
Jonathan Tow committed
49

sdtblck's avatar
sdtblck committed
50
51

class MultilingualLAMBADAES(MultilingualLAMBADA):
    """Multilingual LAMBADA task bound to the ``"es"`` dataset name."""

    # NOTE(review): presumably consumed by the inherited task machinery to
    # pick the dataset configuration -- confirm against the base class.
    DATASET_NAME = "es"
Jonathan Tow's avatar
Jonathan Tow committed
53
54


Fabrizio Milo's avatar
Fabrizio Milo committed
55
56
57
58
59
60
61
# Per-language task classes that construct_tasks() iterates over to build the
# task registry; each entry contributes one "lambada_mt_<DATASET_NAME>" key.
LANG_CLASSES = [
    MultilingualLAMBADAEN,
    MultilingualLAMBADAFR,
    MultilingualLAMBADADE,
    MultilingualLAMBADAIT,
    MultilingualLAMBADAES,
]
sdtblck's avatar
sdtblck committed
62
63


sdtblck's avatar
sdtblck committed
64
65
def construct_tasks() -> dict:
    """Build the mapping from task name to multilingual LAMBADA task class.

    Returns:
        A new dict with one entry per class in ``LANG_CLASSES``, keyed by
        ``"lambada_mt_"`` followed by that class's ``DATASET_NAME`` (for
        example ``"lambada_mt_en"``). Values are the classes themselves, not
        instances.
    """
    return {
        f"lambada_mt_{lang_class.DATASET_NAME}": lang_class
        for lang_class in LANG_CLASSES
    }