Unverified Commit 6e56cd0d authored by Charles Lovering's avatar Charles Lovering Committed by GitHub
Browse files

Merge pull request #7 from bigscience-workshop/kkawamu1/gem_xsum

Add GEM/xsum
parents fce17ee1 a1b271dd
...@@ -54,6 +54,7 @@ from . import gsm8k ...@@ -54,6 +54,7 @@ from . import gsm8k
from . import storycloze from . import storycloze
from . import hans from . import hans
from . import gem_webnlg from . import gem_webnlg
from . import gem_xsum
# from . import e2e_nlg_cleaned # from . import e2e_nlg_cleaned
...@@ -291,6 +292,15 @@ TASK_REGISTRY = { ...@@ -291,6 +292,15 @@ TASK_REGISTRY = {
# "storycloze_2016": storycloze.StoryCloze2016, # "storycloze_2016": storycloze.StoryCloze2016,
# "storycloze_2018": storycloze.StoryCloze2018, # "storycloze_2018": storycloze.StoryCloze2018,
# "sat": sat.SATAnalogies, # "sat": sat.SATAnalogies,
# GEM/xsum
"gem_xsum": gem_xsum.GEMXSUM,
"gem_xsum_challenge_sample": gem_xsum.GEMXSUMChallgeSample,
"gem_xsum_challenge_test_backtranslation": gem_xsum.GEMXSUMChallgeTestBacktranslation,
"gem_xsum_challenge_test_bfp_02": gem_xsum.GEMXSUMChallgeTestBFP02,
"gem_xsum_challenge_test_bfp_05": gem_xsum.GEMXSUMChallgeTestBFP05,
"gem_xsum_challenge_test_nopunc": gem_xsum.GEMXSUMChallgeTestNopunc,
"gem_xsum_challenge_test_covid": gem_xsum.GEMXSUMChallgeTestCovid,
} }
......
"""
Don’t Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization
https://arxiv.org/pdf/1808.08745.pdf
The dataset is for the task of abstractive summarization in its extreme form: summarizing a document in a single sentence. It introduces extreme summarization, a new single-document summarization task which does not favor extractive strategies and calls for an abstractive modeling approach. The idea is to create a short, one-sentence news summary answering the question "What is the article about?".
This task specifically uses the version of the dataset that is part of the GEM benchmark.
Homepage: https://github.com/EdinburghNLP/XSum
The GEM Benchmark: Natural Language Generation, its Evaluation and Metrics
https://arxiv.org/pdf/2102.01672v3.pdf
Write a Short Description of the task.
Homepage: https://gem-benchmark.com/data_cards/XSum
"""
from lm_eval.base import PromptSourceTask
from lm_eval.base import Task, rf
# BibTeX entry for the XSum paper (Narayan, Cohen and Lapata, EMNLP 2018).
_CITATION = """
@InProceedings{xsum-emnlp,
author = "Shashi Narayan and Shay B. Cohen and Mirella Lapata",
title = "Don't Give Me the Details, Just the Summary! {T}opic-Aware Convolutional Neural Networks for Extreme Summarization",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing ",
year = "2018",
address = "Brussels, Belgium",
}
"""
class GEMXSUMBase(PromptSourceTask):
    """Shared base for the GEM/xsum tasks.

    Serves the ``train``, ``validation`` and ``test`` entries of
    ``self.dataset``; subclasses override the ``has_*_docs`` flags and the
    ``*_docs`` accessors to point at other splits.
    """

    VERSION = 0
    DATASET_PATH = "GEM/xsum"
    DATASET_NAME = None
    SPLIT = None

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that a test split is available."""
        return True

    def stopping_criteria(self):
        """Return the string used as the stopping criterion (a full stop)."""
        return "."

    def training_docs(self):
        """Return the cached list of training documents.

        Returns ``None`` when ``has_training_docs()`` is false.
        """
        if not self.has_training_docs():
            return None
        # The training split is materialised once into `self._training_docs`
        # so few-shot sampling does not re-read it. For data too large to hold
        # in memory, a generator should be returned instead of a list.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        """Return the ``validation`` split, or ``None`` if unavailable."""
        return self.dataset["validation"] if self.has_validation_docs() else None

    def test_docs(self):
        """Return the ``test`` split, or ``None`` if unavailable."""
        return self.dataset["test"] if self.has_test_docs() else None
class GEMXSUM(GEMXSUMBase):
    """GEM/xsum task over the standard train/validation/test splits."""

    SPLIT = ""
class GEMXSUMChallgeSample(GEMXSUMBase):
    """GEM/xsum task over the challenge sample splits.

    Reads ``challenge_train_sample`` and ``challenge_validation_sample``;
    reports no test split.
    """

    SPLIT = "challenge_sample"

    def has_test_docs(self):
        """Report that no test split is available for this task."""
        return False

    def training_docs(self):
        """Return the cached list of ``challenge_train_sample`` documents.

        Returns ``None`` when ``has_training_docs()`` is false.
        """
        if not self.has_training_docs():
            return None
        # Materialise the split once and keep it on `self._training_docs` for
        # faster few-shot processing. For data too large to hold in memory, a
        # generator should be returned instead of a list.
        if self._training_docs is None:
            sample = self.dataset["challenge_train_sample"]
            self._training_docs = list(sample)
        return self._training_docs

    def validation_docs(self):
        """Return ``challenge_validation_sample``, or ``None`` if unavailable."""
        if not self.has_validation_docs():
            return None
        return self.dataset["challenge_validation_sample"]
class GEMXSUMChallgeTestBacktranslation(GEMXSUMBase):
    """Test-only GEM/xsum task over ``challenge_test_backtranslation``."""

    SPLIT = "challenge_test_backtranslation"

    def has_training_docs(self):
        """Report that no training split is available for this task."""
        return False

    def has_validation_docs(self):
        """Report that no validation split is available for this task."""
        return False

    def test_docs(self):
        """Return the split named by ``SPLIT``, or ``None`` if unavailable."""
        if not self.has_test_docs():
            return None
        return self.dataset[self.SPLIT]
class GEMXSUMChallgeTestBFP02(GEMXSUMBase):
    """Test-only GEM/xsum task over ``challenge_test_bfp_02``."""

    SPLIT = "challenge_test_bfp_02"

    def has_training_docs(self):
        """Report that no training split is available for this task."""
        return False

    def has_validation_docs(self):
        """Report that no validation split is available for this task."""
        return False

    def test_docs(self):
        """Return the split named by ``SPLIT``, or ``None`` if unavailable."""
        if not self.has_test_docs():
            return None
        return self.dataset[self.SPLIT]
class GEMXSUMChallgeTestBFP05(GEMXSUMBase):
    """Test-only GEM/xsum task over ``challenge_test_bfp_05``."""

    SPLIT = "challenge_test_bfp_05"

    def has_training_docs(self):
        """Report that no training split is available for this task."""
        return False

    def has_validation_docs(self):
        """Report that no validation split is available for this task."""
        return False

    def test_docs(self):
        """Return the split named by ``SPLIT``, or ``None`` if unavailable."""
        if not self.has_test_docs():
            return None
        return self.dataset[self.SPLIT]
class GEMXSUMChallgeTestNopunc(GEMXSUMBase):
    """Test-only GEM/xsum task over ``challenge_test_nopunc``."""

    SPLIT = "challenge_test_nopunc"

    def has_training_docs(self):
        """Report that no training split is available for this task."""
        return False

    def has_validation_docs(self):
        """Report that no validation split is available for this task."""
        return False

    def test_docs(self):
        """Return the split named by ``SPLIT``, or ``None`` if unavailable."""
        if not self.has_test_docs():
            return None
        return self.dataset[self.SPLIT]
class GEMXSUMChallgeTestCovid(GEMXSUMBase):
    """Test-only GEM/xsum task over ``challenge_test_covid``."""

    SPLIT = "challenge_test_covid"

    def has_training_docs(self):
        """Report that no training split is available for this task."""
        return False

    def has_validation_docs(self):
        """Report that no validation split is available for this task."""
        return False

    def test_docs(self):
        """Return the split named by ``SPLIT``, or ``None`` if unavailable."""
        if not self.has_test_docs():
            return None
        return self.dataset[self.SPLIT]
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment