"""
Don’t Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization
https://arxiv.org/pdf/1808.08745.pdf

The dataset is for the task of abstractive summarization in its extreme form, its about summarizing a document in a single sentence. It introduces extreme summarization, a new single-document summarization task which does not favor extractive strategies and calls for an abstractive modeling approach. The idea is to create a short, one-sentence news summary answering the question "What is the article about?". 

This particularly uses the dataset that is part of the GEM benchmark
Homepage: https://github.com/EdinburghNLP/XSum
The GEM Benchmark: Natural Language Generation, its Evaluation and Metrics
https://arxiv.org/pdf/2102.01672v3.pdf
Write a Short Description of the task.
Homepage: https://gem-benchmark.com/data_cards/XSum
"""
from lm_eval.base import PromptSourceTask


_CITATION = """
@InProceedings{xsum-emnlp,
  author =      "Shashi Narayan and Shay B. Cohen and Mirella Lapata",
  title =       "Don't Give Me the Details, Just the Summary! {T}opic-Aware Convolutional Neural Networks for Extreme Summarization",
  booktitle =   "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
  year =        "2018",
  address =     "Brussels, Belgium",
}
"""



class GEMXSUMBase(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "GEM/xsum"
    DATASET_NAME = None
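
    # SPLIT names the dataset split(s) a subclass covers. The challenge-test
    # subclasses below use it directly to index `self.dataset[self.SPLIT]`;
    # for the other subclasses it is only a label.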
    SPLIT = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            # We cache training documents in `self._training_docs` for faster
            # few-shot processing. If the data is too large to fit in memory,
            # return the training data as a generator instead of a list.
            if self._training_docs is None:
                self._training_docs = list(self.dataset["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.dataset["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset["test"]


class GEMXSUM(GEMXSUMBase):
    """Standard train/validation/test splits."""
    SPLIT = ''

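
# Per the GEM data card, the challenge *sample* splits are small random
# samples drawn from the train and validation sets.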
class GEMXSUMChallengeSample(GEMXSUMBase):
    """Sampled challenge subsets: challenge_train_sample / challenge_validation_sample."""
    SPLIT = 'challenge_sample'

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            # We cache training documents in `self._training_docs` for faster
            # few-shot processing. If the data is too large to fit in memory,
            # return the training data as a generator instead of a list.
            if self._training_docs is None:
                self._training_docs = list(self.dataset["challenge_train_sample"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.dataset["challenge_validation_sample"]

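
# Per the GEM paper, this challenge set perturbs the test inputs via
# round-trip (back-)translation through another language.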
class GEMXSUMChallengeTestBacktranslation(GEMXSUMBase):
    """Challenge test split: challenge_test_backtranslation."""
    SPLIT = 'challenge_test_backtranslation'

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset[self.SPLIT]

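
# Per the GEM paper, the "bfp" challenge sets add "butter fingers" typo noise,
# randomly replacing characters with keyboard neighbors; here the perturbation
# probability is 0.02.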
class GEMXSUMChallengeTestBFP02(GEMXSUMBase):
    """Challenge test split: challenge_test_bfp_02."""
    SPLIT = 'challenge_test_bfp_02'

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset[self.SPLIT]

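
# Same "butter fingers" perturbation as above, with probability 0.05.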
class GEMXSUMChallengeTestBFP05(GEMXSUMBase):
    """Challenge test split: challenge_test_bfp_05."""
    SPLIT = 'challenge_test_bfp_05'

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset[self.SPLIT]

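
# Per the GEM paper, this challenge set strips punctuation from the test
# inputs.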
class GEMXSUMChallengeTestNopunc(GEMXSUMBase):
    """Challenge test split: challenge_test_nopunc."""
    SPLIT = 'challenge_test_nopunc'

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset[self.SPLIT]

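
# Per the GEM data card, this challenge set contains COVID-19-era news
# articles collected after the original XSum crawl (a temporal/topical shift).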
class GEMXSUMChallengeTestCovid(GEMXSUMBase):
    """Challenge test split: challenge_test_covid."""
    SPLIT = 'challenge_test_covid'

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset[self.SPLIT]
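

# Minimal smoke-test sketch (not part of the task API): load GEM/xsum directly
# and print the size of every split used above. Assumes the HuggingFace
# `datasets` library and network access to the Hub are available.
if __name__ == "__main__":
    import datasets

    xsum = datasets.load_dataset("GEM/xsum")
    for split_name, split in xsum.items():
        print(f"{split_name}: {len(split)} examples")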