"megatron/training/arguments.py" did not exist on "c464a10390d5b5c588e79ed3cc19b30b632b951c"
mlsum.py 2.74 KB
Newer Older
Shashi456's avatar
Shashi456 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
""" 
MLSUM: The Multilingual Summarization Corpus
https://aclanthology.org/2020.emnlp-main.647/

This is the MLSUM subset of the GEM benchmark. MLSUM is the first large-scale MultiLingual SUMmarization dataset. 
Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish. 
Together with English newspapers from the popular CNN/Daily Mail dataset, the collected data form a large-scale multilingual dataset which can enable new research directions for the text summarization community.
We report cross-lingual comparative analyses based on state-of-the-art systems. 
These highlight existing biases which motivate the use of a multi-lingual dataset.
Homepage: https://gitlab.lip6.fr/scialom/mlsum_data/-/raw/master/MLSUM/
"""
from numpy import True_
from lm_eval.base import PromptSourceTask

_CITATION = """
@article{scialom2020mlsum,
  title={MLSUM: The Multilingual Summarization Corpus},
  author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo},
  journal={arXiv preprint arXiv:2004.14900},
  year={2020}
}
"""


class MLSUMEs(PromptSourceTask):
    """MLSUM summarization task for the ``"es"`` (Spanish) configuration of ``GEM/mlsum``."""

    VERSION = 0
    DATASET_PATH = "GEM/mlsum"
    DATASET_NAME = "es"

    def has_training_docs(self):
        """Return True: this task exposes a training split."""
        return True

    def has_validation_docs(self):
        """Return True: this task exposes a validation split."""
        return True

    def has_test_docs(self):
        """Return True: this task exposes a test split."""
        return True

    def training_docs(self):
        """Return the ``"train"`` split as a list, materialized once and cached.

        Returns None implicitly if ``has_training_docs()`` is false.
        """
        if self.has_training_docs():
            # NOTE(review): relies on `_training_docs` already existing on the
            # instance (presumably set to None by the base class) -- confirm.
            if self._training_docs is None:
                self._training_docs = list(self.dataset["train"])
            return self._training_docs

    def validation_docs(self):
        """Return the ``"validation"`` split of ``self.dataset``.

        Returns None implicitly if ``has_validation_docs()`` is false.
        """
        if self.has_validation_docs():
            return self.dataset["validation"]

    def test_docs(self):
        """Return the ``"test"`` split of ``self.dataset``.

        Returns None implicitly if ``has_test_docs()`` is false.
        """
        if self.has_test_docs():
            return self.dataset["test"]

    def stopping_criteria(self):
        """Return the stop string ``"."``.

        NOTE(review): presumably the harness halts generation at the first
        occurrence of this string -- confirm against the base class.
        """
        return "."

    def max_generation_length(self):
        """Return the generation length limit (120).

        NOTE(review): unit is presumably tokens -- confirm against the base class.
        """
        return 120



class MLSUMDe(PromptSourceTask):
    """MLSUM summarization task for the ``"de"`` (German) configuration of ``GEM/mlsum``."""

    VERSION = 0
    DATASET_PATH = "GEM/mlsum"
    DATASET_NAME = "de"

    def has_training_docs(self):
        """Return True: this task exposes a training split."""
        return True

    def has_validation_docs(self):
        """Return True: this task exposes a validation split."""
        return True

    def has_test_docs(self):
        """Return True: this task exposes a test split."""
        return True

    def training_docs(self):
        """Return the ``"train"`` split as a list, materialized once and cached.

        Returns None implicitly if ``has_training_docs()`` is false.
        """
        if self.has_training_docs():
            # NOTE(review): relies on `_training_docs` already existing on the
            # instance (presumably set to None by the base class) -- confirm.
            if self._training_docs is None:
                self._training_docs = list(self.dataset["train"])
            return self._training_docs

    def validation_docs(self):
        """Return the ``"validation"`` split of ``self.dataset``.

        Returns None implicitly if ``has_validation_docs()`` is false.
        """
        if self.has_validation_docs():
            return self.dataset["validation"]

    def test_docs(self):
        """Return the ``"test"`` split of ``self.dataset``.

        Returns None implicitly if ``has_test_docs()`` is false.
        """
        if self.has_test_docs():
            return self.dataset["test"]

    def stopping_criteria(self):
        """Return the stop string ``"."``.

        NOTE(review): presumably the harness halts generation at the first
        occurrence of this string -- confirm against the base class.
        """
        return "."

    def max_generation_length(self):
        """Return the generation length limit (120).

        NOTE(review): unit is presumably tokens -- confirm against the base class.
        """
        return 120