lambada_multilingual.py 1.87 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
"""
The LAMBADA dataset: Word prediction requiring a broad discourse context
https://arxiv.org/pdf/1606.06031.pdf

The LAMBADA dataset machine-translated to other languages.
LAMBADA is a dataset to evaluate the capabilities of computational models for text
understanding by means of a word prediction task. LAMBADA is a collection of narrative
passages sharing the characteristic that human subjects are able to guess their last
word if they are exposed to the whole passage, but not if they only see the last
sentence preceding the target word. To succeed on LAMBADA, computational models
cannot simply rely on local context, but must be able to keep track of information
in the broader discourse.

Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
"""
sdtblck's avatar
sdtblck committed
16
17
from . import lambada

18

19
20
21
22
23
24
25
26
27
28
29
30
# BibTeX entry for the LAMBADA dataset release on Zenodo.
# NOTE(review): the `@misc{` entry has no citation key before its first
# field -- confirm whether any consumer of this string needs one.
_CITATION = """
@misc{
    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 
    title={The LAMBADA dataset},
    DOI={10.5281/zenodo.2630551},
    publisher={Zenodo},
    year={2016},
    month={Aug}
}
"""


sdtblck's avatar
sdtblck committed
31
32
class MultilingualLAMBADA(lambada.LAMBADA):
    """Shared base class for the machine-translated LAMBADA tasks.

    Adds nothing to ``lambada.LAMBADA`` beyond setting ``VERSION``; the
    per-language subclasses in this module each set ``DATASET_NAME``.
    """

    # NOTE(review): presumably the task version read by the evaluation
    # framework -- confirm against lambada.LAMBADA.
    VERSION = 0
Jonathan Tow's avatar
Jonathan Tow committed
33

sdtblck's avatar
sdtblck committed
34

sdtblck's avatar
sdtblck committed
35
class MultilingualLAMBADAEN(MultilingualLAMBADA):
    """Multilingual LAMBADA task for the ``en`` (English) dataset."""

    # Language code; construct_tasks() uses it as the suffix of the task key.
    DATASET_NAME = 'en'

sdtblck's avatar
sdtblck committed
38
39

class MultilingualLAMBADAFR(MultilingualLAMBADA):
    """Multilingual LAMBADA task for the ``fr`` (French) dataset."""

    # Language code; construct_tasks() uses it as the suffix of the task key.
    DATASET_NAME = 'fr'

sdtblck's avatar
sdtblck committed
42
43

class MultilingualLAMBADADE(MultilingualLAMBADA):
    """Multilingual LAMBADA task for the ``de`` (German) dataset."""

    # Language code; construct_tasks() uses it as the suffix of the task key.
    DATASET_NAME = 'de'

sdtblck's avatar
sdtblck committed
46
47

class MultilingualLAMBADAIT(MultilingualLAMBADA):
    """Multilingual LAMBADA task for the ``it`` (Italian) dataset."""

    # Language code; construct_tasks() uses it as the suffix of the task key.
    DATASET_NAME = 'it'

sdtblck's avatar
sdtblck committed
50
51

class MultilingualLAMBADAES(MultilingualLAMBADA):
    """Multilingual LAMBADA task for the ``es`` (Spanish) dataset."""

    # Language code; construct_tasks() uses it as the suffix of the task key.
    DATASET_NAME = 'es'


# Every language-specific task class defined in this module, in the order
# construct_tasks() registers them.
LANG_CLASSES = [
    MultilingualLAMBADAEN,
    MultilingualLAMBADAFR,
    MultilingualLAMBADADE,
    MultilingualLAMBADAIT,
    MultilingualLAMBADAES,
]
sdtblck's avatar
sdtblck committed
58
59


sdtblck's avatar
sdtblck committed
60
61
def construct_tasks():
    """Build the name-to-class mapping for the multilingual LAMBADA tasks.

    Returns:
        dict: One entry per class in ``LANG_CLASSES``, keyed by
        ``"lambada_mt_<DATASET_NAME>"`` and holding the class itself
        (not an instance), in ``LANG_CLASSES`` order.
    """
    return {
        f"lambada_mt_{lang_class.DATASET_NAME}": lang_class
        for lang_class in LANG_CLASSES
    }