gem_mlsum.py 3.41 KB
Newer Older
Shashi456's avatar
Shashi456 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
""" 
MLSUM: The Multilingual Summarization Corpus
https://aclanthology.org/2020.emnlp-main.647/

This is the MLSUM subset of the GEM benchmark. MLSUM is the first large-scale MultiLingual SUMmarization dataset. 
Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish. 
Together with English newspapers from the popular CNN/Daily Mail dataset, the collected data form a large-scale multilingual dataset which can enable new research directions for the text summarization community.
We report cross-lingual comparative analyses based on state-of-the-art systems. 
These highlight existing biases which motivate the use of a multi-lingual dataset.
Homepage: https://gitlab.lip6.fr/scialom/mlsum_data/-/raw/master/MLSUM/
"""
from numpy import True_
from lm_eval.base import PromptSourceTask

# BibTeX entry for the MLSUM paper (arXiv preprint version).
_CITATION = """
@article{scialom2020mlsum,
  title={MLSUM: The Multilingual Summarization Corpus},
  author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo},
  journal={arXiv preprint arXiv:2004.14900},
  year={2020}
}
"""


Shashi456's avatar
Shashi456 committed
25
class GEMMLSUMEsBase(PromptSourceTask):
    """Common settings and split accessors for the "es" configuration of GEM/mlsum."""

    VERSION = 0
    DATASET_PATH = "GEM/mlsum"
    DATASET_NAME = "es"

    def has_training_docs(self):
        """Return True: this task exposes a training split."""
        return True

    def has_validation_docs(self):
        """Return True: this task exposes a validation split."""
        return True

    def has_test_docs(self):
        """Return True: this task exposes a test split."""
        return True

    def training_docs(self):
        """Return the "train" split as a list, building it on first access.

        The list is kept on ``self._training_docs`` and reused by later calls.
        Returns None when ``has_training_docs()`` is false.
        """
        if not self.has_training_docs():
            return None
        if self._training_docs is None:
            train_split = self.dataset["train"]
            self._training_docs = list(train_split)
        return self._training_docs

    def validation_docs(self):
        """Return the "validation" split, or None when the task has no validation docs."""
        return self.dataset["validation"] if self.has_validation_docs() else None

    def test_docs(self):
        """Return the "test" split, or None when the task has no test docs."""
        return self.dataset["test"] if self.has_test_docs() else None

Shashi456's avatar
Shashi456 committed
53
54
55
class GEMMLSUMEs(GEMMLSUMEsBase):
    """MLSUM "es" task over the regular train/validation/test splits."""

    # Empty split name; no accessor is overridden in this class.
    SPLIT = ""
Shashi456's avatar
Shashi456 committed
56

Shashi456's avatar
Shashi456 committed
57
58
59
class GEMMLSUMEsChallgeTestCovid(GEMMLSUMEsBase):
    """MLSUM "es" task evaluated on the ``challenge_test_covid`` split only."""

    # Name of the dataset split served by test_docs().
    SPLIT = "challenge_test_covid"

    def has_training_docs(self):
        """Return False: this variant serves no training docs."""
        return False

    def has_validation_docs(self):
        """Return False: this variant serves no validation docs."""
        return False

    def test_docs(self):
        """Return the split named by ``SPLIT``, or None when the task has no test docs."""
        if not self.has_test_docs():
            return None
        return self.dataset[self.SPLIT]

class GEMMLSUMDeBase(PromptSourceTask):
    """Common settings and split accessors for the "de" configuration of GEM/mlsum."""

    VERSION = 0
    DATASET_PATH = "GEM/mlsum"
    DATASET_NAME = "de"

    def has_training_docs(self):
        """Return True: this task exposes a training split."""
        return True

    def has_validation_docs(self):
        """Return True: this task exposes a validation split."""
        return True

    def has_test_docs(self):
        """Return True: this task exposes a test split."""
        return True

    def training_docs(self):
        """Return the "train" split as a list, building it on first access.

        The list is kept on ``self._training_docs`` and reused by later calls.
        Returns None when ``has_training_docs()`` is false.
        """
        if not self.has_training_docs():
            return None
        if self._training_docs is None:
            train_split = self.dataset["train"]
            self._training_docs = list(train_split)
        return self._training_docs

    def validation_docs(self):
        """Return the "validation" split, or None when the task has no validation docs."""
        return self.dataset["validation"] if self.has_validation_docs() else None

    def test_docs(self):
        """Return the "test" split, or None when the task has no test docs."""
        return self.dataset["test"] if self.has_test_docs() else None

Shashi456's avatar
Shashi456 committed
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class GEMMLSUMDe(GEMMLSUMDeBase):
    """MLSUM "de" task over the regular train/validation/test splits."""

    # Empty split name; no accessor is overridden in this class.
    SPLIT = ""

class GEMMLSUMDeChallgeTestCovid(GEMMLSUMDeBase):
    """MLSUM "de" task evaluated on the ``challenge_test_covid`` split only."""

    # Name of the dataset split served by test_docs().
    SPLIT = "challenge_test_covid"

    def has_training_docs(self):
        """Return False: this variant serves no training docs."""
        return False

    def has_validation_docs(self):
        """Return False: this variant serves no validation docs."""
        return False

    def test_docs(self):
        """Return the split named by ``SPLIT``, or None when the task has no test docs."""
        if not self.has_test_docs():
            return None
        return self.dataset[self.SPLIT]