Merge pull request #5 from Shashi456/mlsum

Add MLsum Tasks

Merge pull request #5 from Shashi456/mlsum
Add MLsum Tasks
f65e196e · Jonathan Tow · GitHub · 6e56cd0d · 6d5031ac · f65e196e
Unverified Commit f65e196e authored Apr 28, 2022 by Jonathan Tow Committed by GitHub Apr 28, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 127 additions and 1 deletion

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +8 -1

lm_eval/tasks/gem_mlsum.py lm_eval/tasks/gem_mlsum.py +119 -0

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -55,7 +55,7 @@ from . import storycloze
 from . import hans
 from . import gem_webnlg
 from . import gem_xsum
-
+from . import gem_mlsum
 # from . import e2e_nlg_cleaned

 ########################################
@@ -288,6 +288,13 @@ TASK_REGISTRY = {
    "blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
    "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
    "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
+    
+    # GEM/mlsum: Spanish ("es") and German ("de") summarization tasks.
+    "mlsum_es": gem_mlsum.GEMMLSUMEs,
+    "mlsum_de": gem_mlsum.GEMMLSUMDe,
+    "mlsum_es_covid_challenge_set": gem_mlsum.GEMMLSUMEsChallgeTestCovid,
+    "mlsum_de_covid_challenge_set": gem_mlsum.GEMMLSUMDeChallgeTestCovid,
+
    # Requires manual download of data.
    # "storycloze_2016": storycloze.StoryCloze2016,
    # "storycloze_2018": storycloze.StoryCloze2018,

--- a/lm_eval/tasks/gem_mlsum.py
+++ b/lm_eval/tasks/gem_mlsum.py
+""" 
+MLSUM: The Multilingual Summarization Corpus
+https://aclanthology.org/2020.emnlp-main.647/
+
+This is the MLSUM subset of the GEM benchmark. MLSUM is the first large-scale MultiLingual SUMmarization dataset. 
+Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish. 
+Together with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.
+We report cross-lingual comparative analyses based on state-of-the-art systems. 
+These highlight existing biases which motivate the use of a multi-lingual dataset.
+Homepage: https://gitlab.lip6.fr/scialom/mlsum_data/-/raw/master/MLSUM/
+"""
+from numpy import True_
+from lm_eval.base import PromptSourceTask
+
+# BibTeX entry for the MLSUM paper (Scialom et al., 2020).
+_CITATION = """
+@article{scialom2020mlsum,
+  title={MLSUM: The Multilingual Summarization Corpus},
+  author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo},
+  journal={arXiv preprint arXiv:2004.14900},
+  year={2020}
+}
+"""
+
+
+class GEMMLSUMEsBase(PromptSourceTask):
+    """Shared settings and split accessors for the Spanish ("es") GEM/mlsum tasks.
+
+    All three ``has_*_docs`` hooks report ``True`` here; each accessor consults
+    its hook first, so a subclass can switch a split off by overriding the hook.
+    """
+
+    VERSION = 0
+    DATASET_PATH = "GEM/mlsum"
+    DATASET_NAME = "es"
+
+    def has_training_docs(self):
+        """Report whether the training split is offered (always ``True`` here)."""
+        return True
+
+    def has_validation_docs(self):
+        """Report whether the validation split is offered (always ``True`` here)."""
+        return True
+
+    def has_test_docs(self):
+        """Report whether the test split is offered (always ``True`` here)."""
+        return True
+
+    def training_docs(self):
+        """Return the "train" split as a list, materializing it on first use.
+
+        Returns ``None`` when ``has_training_docs()`` is false.
+        """
+        if not self.has_training_docs():
+            return None
+        # Keep the materialized list on the instance so later calls reuse it.
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        """Return the "validation" split, or ``None`` if it is not offered."""
+        return self.dataset["validation"] if self.has_validation_docs() else None
+
+    def test_docs(self):
+        """Return the "test" split, or ``None`` if it is not offered."""
+        return self.dataset["test"] if self.has_test_docs() else None
+
+    def stopping_criteria(self):
+        """Return the stop string ``"."``.
+
+        NOTE(review): presumably generation is cut at the first period --
+        confirm against ``PromptSourceTask``.
+        """
+        return "."
+
+class GEMMLSUMEs(GEMMLSUMEsBase):
+    """Spanish MLSUM task over the regular train/validation/test splits."""
+
+    # Empty for the regular splits; nothing in this class reads it.
+    SPLIT = ""
+
+class GEMMLSUMEsChallgeTestCovid(GEMMLSUMEsBase):
+    """Spanish MLSUM task evaluated on the ``challenge_test_covid`` split only."""
+
+    SPLIT = "challenge_test_covid"
+
+    def has_training_docs(self):
+        """Report that this task has no training split."""
+        return False
+
+    def has_validation_docs(self):
+        """Report that this task has no validation split."""
+        return False
+
+    def test_docs(self):
+        """Return the split named by ``SPLIT``, or ``None`` if tests are not offered."""
+        if not self.has_test_docs():
+            return None
+        return self.dataset[self.SPLIT]
+
+class GEMMLSUMDeBase(PromptSourceTask):
+    """Shared settings and split accessors for the German ("de") GEM/mlsum tasks.
+
+    All three ``has_*_docs`` hooks report ``True`` here; each accessor consults
+    its hook first, so a subclass can switch a split off by overriding the hook.
+    """
+
+    VERSION = 0
+    DATASET_PATH = "GEM/mlsum"
+    DATASET_NAME = "de"
+
+    def has_training_docs(self):
+        """Report whether the training split is offered (always ``True`` here)."""
+        return True
+
+    def has_validation_docs(self):
+        """Report whether the validation split is offered (always ``True`` here)."""
+        return True
+
+    def has_test_docs(self):
+        """Report whether the test split is offered (always ``True`` here)."""
+        return True
+
+    def training_docs(self):
+        """Return the "train" split as a list, materializing it on first use.
+
+        Returns ``None`` when ``has_training_docs()`` is false.
+        """
+        if not self.has_training_docs():
+            return None
+        # Keep the materialized list on the instance so later calls reuse it.
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        """Return the "validation" split, or ``None`` if it is not offered."""
+        return self.dataset["validation"] if self.has_validation_docs() else None
+
+    def test_docs(self):
+        """Return the "test" split, or ``None`` if it is not offered."""
+        return self.dataset["test"] if self.has_test_docs() else None
+
+    def stopping_criteria(self):
+        """Return the stop string ``"."``.
+
+        NOTE(review): presumably generation is cut at the first period --
+        confirm against ``PromptSourceTask``.
+        """
+        return "."
+
+class GEMMLSUMDe(GEMMLSUMDeBase):
+    """German MLSUM task over the regular train/validation/test splits."""
+
+    # Empty for the regular splits; nothing in this class reads it.
+    SPLIT = ""
+
+class GEMMLSUMDeChallgeTestCovid(GEMMLSUMDeBase):
+    """German MLSUM task evaluated on the ``challenge_test_covid`` split only."""
+
+    SPLIT = "challenge_test_covid"
+
+    def has_training_docs(self):
+        """Report that this task has no training split."""
+        return False
+
+    def has_validation_docs(self):
+        """Report that this task has no validation split."""
+        return False
+
+    def test_docs(self):
+        """Return the split named by ``SPLIT``, or ``None`` if tests are not offered."""
+        if not self.has_test_docs():
+            return None
+        return self.dataset[self.SPLIT]