Commit 51753750 authored by lintangsutawika

updates to scrolls

parent 097c9253
......@@ -17,7 +17,14 @@ import logging
# import python tasks
from .squad import SQuAD2
from .scrolls import QuALITY, NarrativeQA, ContractNLI, GovReport, SummScreenFD, QMSum
from .scrolls.task import (
QuALITY,
NarrativeQA,
ContractNLI,
GovReport,
SummScreenFD,
QMSum,
)
eval_logger = logging.getLogger("lm-eval")
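For orientation, the flat `scrolls` module appears to become a package in this commit; a minimal sketch of what the import change implies (the package layout itself is inferred from this diff, not shown in this hunk):

```python
# After this commit, the SCROLLS task classes live in a task submodule:
from lm_eval.tasks.scrolls.task import QuALITY, NarrativeQA  # new location

# The old flat-module import path (from .scrolls import ...) presumably goes
# away, since scrolls/ now holds task.py plus a scrolls.yaml group config.
```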
......
"""
SCROLLS: Standardized CompaRison Over Long Language Sequences
https://arxiv.org/abs/2201.03533
SCROLLS is a suite of datasets that require synthesizing information over long texts.
The benchmark includes seven natural language tasks across multiple domains,
including summarization, question answering, and natural language inference.
Homepage: https://www.scrolls-benchmark.com/
Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
it is possible to create "subset" tasks that contain only those samples whose tokenized length
is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
be suitable for a model using the GPTNeoX tokenizer and a 4K maximium sequence length:
```
class QasperGPTNeoX4K(Qasper):
PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
PRUNE_MAX_TOKENS = 4096
PRUNE_NUM_PROC = _num_cpu_cores() # optional, to speed up pruning of large datasets like NarrativeQA
```
`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful to comparing models
that use different tokenizers but the same maximum sequence length.
Once the subset task class has been defined in this file, it can be used by adding the class
to `lm_eval/tasks/__init__.py`.
NOTE: GovReport may need `max_gen_toks` set larger for causal models.
"""
\ No newline at end of file
group: scrolls
task:
- scrolls_qasper
- scrolls_quality
- scrolls_narrativeqa
- scrolls_contractnli
- scrolls_govreport
- scrolls_summscreenfd
- scrolls_qmsum
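With the group config above in place, the whole suite should be addressable by its group name. A minimal usage sketch, assuming the harness's `simple_evaluate` entry point and an arbitrary small HF model (both are illustrative choices, not part of this diff):

```python
import lm_eval

# Evaluate the full SCROLLS group; any single scrolls_* task name works too.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-410m-deduped",
    tasks=["scrolls"],
)
print(results["results"])
```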
"""
SCROLLS: Standardized CompaRison Over Long Language Sequences
https://arxiv.org/abs/2201.03533
SCROLLS is a suite of datasets that require synthesizing information over long texts.
The benchmark includes seven natural language tasks across multiple domains,
including summarization, question answering, and natural language inference.
Homepage: https://www.scrolls-benchmark.com/
Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
it is possible to create "subset" tasks that contain only those samples whose tokenized length
is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:
```
class QasperGPTNeoX4K(Qasper):
PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
PRUNE_MAX_TOKENS = 4096
PRUNE_NUM_PROC = _num_cpu_cores() # optional, to speed up pruning of large datasets like NarrativeQA
```
`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples whose
tokenized length is less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful
for comparing models that use different tokenizers but the same maximum sequence length.
Once the subset task class has been defined in this file, it can be used by adding the class
to `lm_eval/tasks/__init__.py`.
NOTE: GovReport may need `max_gen_toks` set larger for causal models.
"""
import re
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
......@@ -146,13 +115,8 @@ class _SCROLLSTask(Task):
PRUNE_MAX_TOKENS = None
PRUNE_NUM_PROC = None
def __init__(self, no_metric=False):
super().__init__()
self.metric = (
load_metric(_download_metric(), config_name=self.DATASET_NAME)
if not no_metric
else None
)
def __post_init__(self):
self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
def has_training_docs(self):
return True
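The `no_metric` constructor flag is gone: metric setup now lives in a `__post_init__` hook that subclasses override, instead of threading a flag through `super().__init__()`. A minimal sketch of the assumed pattern (the base `Task.__init__` is not part of this diff):

```python
# Sketch, assuming the base class invokes the hook after its own setup:
class Task:
    def __init__(self):
        # ... dataset download, config handling, etc. ...
        self.__post_init__()  # subclass customization point

    def __post_init__(self):
        pass  # default: nothing extra to do
```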
......@@ -245,8 +209,8 @@ class _SCROLLSTask(Task):
class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
def __init__(self):
super().__init__(no_metric=True)
def __post_init__(self):
self.metric = None
def _scrolls_metrics(self):
return None
......@@ -270,7 +234,7 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
"em": acc_norm * 100.0,
}
def construct_requests(self, doc, ctx):
def construct_requests(self, doc, ctx, **kwargs):
request_list = [
Instance(
......@@ -278,6 +242,7 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
doc=doc,
arguments=(ctx, " {}".format(choice)),
idx=i,
**kwargs,
)
for i, choice in enumerate(doc["choices"])
]
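Each answer choice yields one `loglikelihood` request; downstream, `process_results` reduces them to the accuracy-style metrics whose tail is visible above (`"em": acc_norm * 100.0`). A rough sketch of that reduction, assuming `results` is a list of `(loglikelihood, is_greedy)` pairs aligned with `doc["choices"]` and `doc["gold"]` holds the correct index (field names are assumptions):

```python
import numpy as np

def _mc_metrics(doc, results):
    lls = np.array([ll for ll, _ in results])
    # acc: raw argmax over choice loglikelihoods; acc_norm: length-normalize
    # so longer choices are not penalized for accumulating more log mass.
    lengths = np.array([float(len(choice)) for choice in doc["choices"]])
    acc = float(np.argmax(lls) == doc["gold"])
    acc_norm = float(np.argmax(lls / lengths) == doc["gold"])
    return {"acc": acc, "acc_norm": acc_norm, "em": acc_norm * 100.0}
```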
......@@ -302,12 +267,13 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
"rougeL": (results[0], doc["outputs"]),
}
def construct_requests(self, doc, ctx):
def construct_requests(self, doc, ctx, **kwargs):
return Instance(
request_type="generate_until",
doc=doc,
arguments=(ctx, {"until": ["\n"]}),
idx=0,
**kwargs,
)
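The rouge entries above each record a `(prediction, references)` pair rather than a score; scoring is deferred to aggregation. A rough sketch of how such pairs might be reduced with the metric loaded in `__post_init__`, assuming the standard `datasets`-metric `compute` API (the exact score key names are assumptions):

```python
def _aggregate_rouge(metric, key, pairs):
    # pairs: list of (prediction, references) tuples collected per document.
    predictions, references = zip(*pairs)
    scores = metric.compute(predictions=list(predictions), references=list(references))
    return scores[key]  # e.g. one of the rouge1 / rouge2 / rougeL entries
```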
def doc_to_text(self, doc):
......@@ -344,7 +310,7 @@ class Qasper(_SCROLLSTask):
prediction = results[0]
return {"f1": (prediction, doc["outputs"])}
def construct_requests(self, doc, ctx):
def construct_requests(self, doc, ctx, **kwargs):
if doc["is_yes_no"]:
return [
Instance(
......@@ -352,12 +318,14 @@ class Qasper(_SCROLLSTask):
doc=doc,
arguments=(ctx, " yes"),
idx=0,
**kwargs,
),
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " no"),
idx=1,
**kwargs,
),
]
else:
......@@ -366,6 +334,7 @@ class Qasper(_SCROLLSTask):
doc=doc,
arguments=(ctx, {"until": ["\n"]}),
idx=0,
**kwargs,
)
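For the yes/no branch, the two loglikelihood requests are presumably compared at scoring time to pick an answer. A sketch of that decode, assuming `results` arrives in the order the instances were built (" yes" at idx 0, " no" at idx 1):

```python
def _yes_no_prediction(results):
    # results[i] -> (loglikelihood, is_greedy) for the i-th continuation.
    ll_yes, ll_no = results[0][0], results[1][0]
    return "yes" if ll_yes > ll_no else "no"
```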
......@@ -422,12 +391,13 @@ class NarrativeQA(_SCROLLSTask):
def process_results(self, doc, results):
return {"f1": (results[0], doc["outputs"])}
def construct_requests(self, doc, ctx):
def construct_requests(self, doc, ctx, **kwargs):
return Instance(
request_type="generate_until",
doc=doc,
arguments=(ctx, {"until": ["\n"]}),
idx=0,
**kwargs,
)
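The `f1` pairs recorded by `process_results` are presumably scored with the SQuAD-style token F1 imported at the top of the file, taking the max over gold answers. A sketch under that assumption:

```python
def _squad_f1(prediction, references):
    # SQuAD convention: best token-level F1 across all acceptable answers.
    return max(squad_metrics.compute_f1(ref, prediction) for ref in references)
```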
......