Unverified commit 21e1ed17 authored by Lintang Sutawika, committed by GitHub

Merge pull request #769 from EleutherAI/superglue

[Refactor] Superglue T5 Parity
parents 4cda3a1c b7082722
......@@ -2,6 +2,7 @@ from dataclasses import dataclass
from typing import List
from lm_eval.api.instance import Instance
from datasets import Dataset
class Filter:
......@@ -18,7 +19,7 @@ class Filter:
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
......@@ -40,14 +41,14 @@ class FilterEnsemble:
name: str
filters: List[Filter]
def apply(self, instances: List[Instance]):
def apply(self, instances: List[Instance], docs: List[Dataset]):
resps = [
inst.resps for inst in instances
] # operate just on the model responses
for f in self.filters:
# apply filters in sequence
resps = f.apply(resps)
resps = f.apply(resps, docs)
# add the end results after filtering to filtered_requests of their respective source instances.
# has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
......
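For orientation, a minimal sketch of a filter written against the new signature (the `LengthFilter` class is hypothetical and the `lm_eval.api.filter` import path is assumed): every filter now receives the source docs alongside the model responses, so filters can condition on the document they are filtering for.

```
from lm_eval.api.filter import Filter


class LengthFilter(Filter):
    """Hypothetical filter: cap each response at a fixed character budget."""

    def __init__(self, max_chars=256):
        self.max_chars = max_chars

    def apply(self, resps, docs):
        # `resps` is a list of response lists (one inner list per doc); `docs`
        # holds the corresponding source documents (unused here, but available).
        return [[r[: self.max_chars] for r in doc_resps] for doc_resps in resps]
```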
......@@ -627,19 +627,19 @@ class ConfigurableTask(Task):
)
if self.has_test_docs():
docs = self.test_docs()
self.task_docs = self.test_docs()
elif self.has_validation_docs():
docs = self.validation_docs()
self.task_docs = self.validation_docs()
else:
assert (
False
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
# Test One Doc
self.features = list(docs.features.keys())
self.features = list(self.task_docs.features.keys())
self.multiple_input = 0
self.multiple_target = 0
test_doc = docs[0]
test_doc = self.task_docs[0]
test_text = self.doc_to_text(test_doc)
test_target = self.doc_to_target(test_doc)
......@@ -743,6 +743,15 @@ class ConfigurableTask(Task):
)
return super().fewshot_docs()
def apply_filters(self):
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances, self.task_docs)
else:
eval_logger.warning("No filter defined, passing through instances")
return self._instances
def should_decontaminate(self):
return self._config.should_decontaminate
......
......@@ -17,14 +17,16 @@ FILTER_REGISTRY = {
def get_filter(filter_name):
return FILTER_REGISTRY[filter_name]
if filter_name in FILTER_REGISTRY:
return FILTER_REGISTRY[filter_name]
else:
return filter_name
def build_filter_ensemble(filter_name, components):
"""
Create a filtering pipeline.
"""
filters = []
for (function, kwargs) in components:
if kwargs is None:
......
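Because `get_filter` now falls back to returning `filter_name` itself when it is not registered, a custom filter object can be passed straight through `build_filter_ensemble`. A sketch of that usage (the import paths, the `take_first` registry key, and the `UppercaseFilter` class are assumptions for illustration):

```
from lm_eval.api.filter import Filter
from lm_eval.filters import build_filter_ensemble


class UppercaseFilter(Filter):
    """Hypothetical custom filter, not present in FILTER_REGISTRY."""

    def apply(self, resps, docs):
        return [[r.upper() for r in doc_resps] for doc_resps in resps]


# A registered name resolves through FILTER_REGISTRY as before ...
take_first = build_filter_ensemble("take_first", [["take_first", None]])

# ... while an unregistered object is returned unchanged by get_filter(),
# so a custom Filter class can be supplied directly in `components`.
custom = build_filter_ensemble("custom", [[UppercaseFilter, None]])
```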
......@@ -17,7 +17,7 @@ class DecontaminationFilter(Filter):
"""
self._decontam_results = None
def apply(self, reps):
def apply(self, reps, docs):
"""
Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
"""
......
......@@ -15,7 +15,7 @@ class RegexFilter(Filter):
self.regex = re.compile(regex_pattern)
self.fallback = fallback
def apply(self, resps):
def apply(self, resps, docs):
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
......@@ -44,7 +44,7 @@ class WhitespaceFilter(Filter):
def __init__(self):
pass
def apply(self, resps):
def apply(self, resps, docs):
def filter_set(inst):
filtered_resp = []
......
......@@ -9,7 +9,7 @@ class TakeFirstFilter(Filter):
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
"""
......@@ -23,7 +23,7 @@ class TakeKFilter(Filter):
super().__init__(*args, **kwargs)
def apply(self, resps):
def apply(self, resps, docs):
# check we have at least k responses per doc, else we can't take the first k
assert (
len(resps[0]) >= self.k
......@@ -37,7 +37,7 @@ class MajorityVoteFilter(Filter):
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Each entry of `resps` is a list of model responses.
We select the response that occurs most frequently in each entry of `resps`.
......
import os
import torch
import transformers
from transformers.models.auto.modeling_auto import (
......@@ -67,6 +69,7 @@ class HFLM(LM):
revision: Optional[str] = "main",
subfolder: Optional[str] = None,
tokenizer: Optional[str] = None,
truncation: Optional[bool] = False,
max_length: Optional[int] = None,
device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
......@@ -75,6 +78,7 @@ class HFLM(LM):
low_cpu_mem_usage: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
cache_dir: Optional[Union[str, os.PathLike]] = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
......@@ -240,6 +244,8 @@ class HFLM(LM):
use_fast=use_fast_tokenizer,
)
self.truncation = truncation
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
......@@ -419,7 +425,11 @@ class HFLM(LM):
return encoding
def tok_batch_encode(
self, strings: List[str], padding_side="left", left_truncate_len=None
self,
strings: List[str],
padding_side="left",
left_truncate_len=None,
truncation=False,
):
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
......@@ -432,6 +442,7 @@ class HFLM(LM):
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
......@@ -856,7 +867,9 @@ class HFLM(LM):
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts, left_truncate_len=max_ctx_len
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
......
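A standalone sketch of what the new flag controls (the gpt2 tokenizer and the numbers are arbitrary): `truncation` is forwarded to the HF tokenizer call inside `tok_batch_encode`, and the existing `left_truncate_len` step then keeps only the rightmost tokens of each encoded context, roughly like this:

```
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # arbitrary example tokenizer
tok.pad_token = tok.eos_token
tok.padding_side = "left"                    # mirrors padding_side="left" above

enc = tok(
    ["a short prompt", "a much longer prompt " * 200],
    truncation=True,      # what self.truncation toggles when batch-encoding contexts
    padding="longest",
    return_tensors="pt",
)

max_ctx_len = 32
context_enc = enc["input_ids"][:, -max_ctx_len:]       # left-truncate to the budget
attn_masks = enc["attention_mask"][:, -max_ctx_len:]
```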
# SuperGLUE
### Paper
Title: `SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems`
Abstract: `https://w4ngatang.github.io/static/papers/superglue.pdf`
SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.
Homepage: https://super.gluebenchmark.com/
### Citation
```
@inproceedings{NEURIPS2019_4496bf24,
author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
pages = {},
publisher = {Curran Associates, Inc.},
title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
volume = {32},
year = {2019}
}
```
### Groups and Tasks
#### Groups
* `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1
* `super-glue-t5-prompt`: SuperGLUE prompts and evaluation matching the T5 paper. (Note: when running with `accelerate`, including the `record` task will raise an error.) A usage sketch follows the task list below.
#### Tasks
Comparison of validation-split scores between T5x and LM-Eval (T5x models converted to HF):
| T5 v1.1 Base | SGLUE | BoolQ | CB | COPA | MultiRC | ReCoRD | RTE | WiC | WSC |
| ------------ | ----- | ----- | -- | ---- | ------- | ------ | --- | --- | --- |
| T5x | 69.47 | 78.47 (acc) | 83.93 (F1) / 87.5 (acc) | 50 (acc) | 73.81 (F1) / 33.26 (EM) | 70.09 (EM) / 71.34 (F1) | 78.7 (acc) | 63.64 (acc) | 75 (acc) |
| LM-Eval | 71.35 | 79.36 (acc) | 83.63 (F1) / 87.5 (acc) | 63 (acc) | 73.45 (F1) / 33.26 (EM) | 69.85 (EM) / 68.86 (F1) | 78.34 (acc) | 65.83 (acc) | 75.96 (acc) |
* `super-glue-lm-eval-v1`
- `boolq`
- `cb`
- `copa`
- `multirc`
- `record`
- `rte`
- `wic`
- `wsc`
* `super-glue-t5-prompt`
- `super_glue-boolq-t5-prompt`
- `super_glue-cb-t5-prompt`
- `super_glue-copa-t5-prompt`
- `super_glue-multirc-t5-prompt`
- `super_glue-record-t5-prompt`
- `super_glue-rte-t5-prompt`
- `super_glue-wic-t5-prompt`
- `super_glue-wsc-t5-prompt`
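A sketch of how one of these groups could be run programmatically (the `simple_evaluate` entry point, the `hf` model alias, and the checkpoint name are illustrative assumptions; consult the CLI/API of your installed harness version for the exact invocation):

```
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                                   # HF model wrapper (alias may vary by version)
    model_args="pretrained=google/t5-v1_1-base",  # a converted T5 checkpoint
    tasks=["super-glue-t5-prompt"],               # or "super-glue-lm-eval-v1"
)
print(results["results"])
```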
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- super-glue-t5-prompt
task: super_glue-boolq-t5-prompt
dataset_path: super_glue
dataset_name: boolq
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "boolq passage: {{passage}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
......@@ -6,7 +6,7 @@ dataset_name: cb
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "cb hypothesis: {{hypothesis}} premise {{premise}}"
doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'contradiction', 'neutral']
metric_list:
......
import sklearn.metrics
def mean_3class_f1(predictions, references): # This is a passthrough function
string_label = ["entailment", "contradiction", "neutral"]
predictions = string_label.index(predictions[0])
references = string_label.index(references[0])
return (predictions, references)
def agg_mean_3class_f1(items):
predictions, references = zip(*items)
"""Computes the unweighted average of the F1 per class."""
metric_str = "fbeta_score"
metric_fn_kwargs = {
"beta": 1,
"labels": range(3),
"average": "macro",
}
def _fn(predictions, references):
metric_fn = getattr(sklearn.metrics, metric_str)
metric_val = metric_fn(references, predictions, **metric_fn_kwargs)
return metric_val
return _fn(predictions, references)
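The passthrough/aggregation split above keeps per-example work minimal: each call just maps the label strings to class indices, and the macro F1 is computed once over all collected pairs. A small usage sketch:

```
items = [
    mean_3class_f1(["entailment"], ["entailment"]),
    mean_3class_f1(["neutral"], ["contradiction"]),
    mean_3class_f1(["contradiction"], ["contradiction"]),
]
macro_f1 = agg_mean_3class_f1(items)  # unweighted (macro) F1 over the 3 classes
```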
......@@ -6,9 +6,9 @@ dataset_name: copa
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} question: {{question}}"
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
doc_to_choice: ['choice1', 'choice2']
metric_list:
- metric: exact_match
aggregation: mean
......
group:
- super-glue-t5-prompt
task: super_glue-multirc-t5-prompt
dataset_path: super_glue
dataset_name: multirc
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
doc_to_target: label
doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.5
metric_list:
- metric: !function t5_utils.f1
aggregation: !function t5_utils.agg_f1
higher_is_better: true
- metric: !function t5_utils.em
aggregation: !function t5_utils.agg_em
higher_is_better: true
import collections
import numpy as np
import sklearn.metrics
def f1(predictions, references): # This is a passthrough function
_prediction = predictions[0]
_reference = references[0].split("_")[-1]
string_label = ["False", "True"]
reference = string_label.index(_reference)
prediction = (
string_label.index(_prediction)
if _prediction in string_label
else not bool(reference)
)
return (prediction, reference)
def agg_f1(items):
predictions, references = zip(*items)
references, predictions = np.asarray(references), np.asarray(predictions)
return sklearn.metrics.f1_score(references, predictions)
def em(predictions, references): # This is a passthrough function
_prediction = predictions[0]
_group, _reference = references[0].split("_")
string_label = ["False", "True"]
reference = string_label.index(_reference)
prediction = (
string_label.index(_prediction)
if _prediction in string_label
else not bool(reference)
)
return (_group, prediction, reference)
def agg_em(items):
grouped_values = collections.defaultdict(lambda: ([], []))
for group, prediction, reference in items:
grouped_values[group][0].append(reference)
grouped_values[group][1].append(prediction)
group_scores = []
for group, (targets, predictions) in grouped_values.items():
score = float(np.array_equal(targets, predictions))
group_scores.append(score)
return np.mean(group_scores)
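Because the task's `doc_to_choice` prefixes each target with the question's `idx`, the references reaching these metrics look like `"5_True"`, and `agg_em` regroups answers by question to compute per-question exact match. A small sketch with made-up values:

```
items = [
    em(["True"], ["5_True"]),    # question 5, first answer: correct
    em(["True"], ["5_False"]),   # question 5, second answer: wrong
    em(["False"], ["7_False"]),  # question 7, only answer: correct
]
agg_em(items)  # -> 0.5: question 7 is fully correct, question 5 is not
```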
......@@ -3,14 +3,15 @@ group:
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "record query: {{query}} entities: {{entities}} passage: {{passage}}"
doc_to_target: "{{answers}}"
process_docs: !function t5_utils.process_docs
doc_to_text: !function t5_utils.doc_to_text
doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}"
metric_list:
- metric: exact_match
aggregation: mean
- metric: !function t5_utils.em
aggregation: !function t5_utils.squad_em_agg
higher_is_better: true
- metric: !function t5_utils.f1
aggregation: !function t5_utils.squad_f1_agg
higher_is_better: true
ignore_case: true
ignore_punctuation: true
import re
import string
import collections
import numpy as np
from tqdm import tqdm
from datasets import Dataset, concatenate_datasets
from lm_eval.api.metrics import metric_max_over_ground_truths
def doc_to_text(doc):
passage = doc["passage"]
passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage)
passage = re.sub(r"\n@highlight\n", ". ", passage)
return " ".join(
[
"record query:",
doc["query"],
"entities:",
", ".join(doc["entities"]),
"passage:",
passage,
]
)
def process_docs(dataset):
def split_answers(doc):
split_doc = {
**{k: [] for k in doc.keys()},
}
answers = doc.pop("answers")
for idx, answer in enumerate(answers):
for key in split_doc.keys():
if key in doc:
split_doc[key].append(doc[key])
split_doc["answers"].append(answer)
return split_doc
dataset = dataset.map(split_answers)
new_dataset = {}
for key in dataset.features.keys():
new_dataset[key] = [x for row in dataset[key] for x in row]
return Dataset.from_dict(new_dataset)
def normalize_squad(answer):
"""Normalization used in official SQuAD evaluation script."""
def _normalize_answer(text, punc_chars, punc_repl):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(s):
return re.sub(r"\b(a|an|the)\b", " ", s)
def replace_punctuation(s):
to_replace = set(punc_chars)
return "".join(punc_repl if ch in to_replace else ch for ch in s)
def white_space_fix(s):
return " ".join(s.split())
text = text.lower()
text = replace_punctuation(text)
text = remove_articles(text)
text = white_space_fix(text)
return text
return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="")
def em(predictions, references): # This is a passthrough function
return (predictions[0], references[0])
def f1(predictions, references): # This is a passthrough function
return (predictions[0], references[0])
def squad_em_agg(items):
def _exact_match_score(prediction, target):
return target == prediction
grouped_values = collections.defaultdict(lambda: ([], []))
for prediction, reference in items:
group, reference = reference.split("_")
# if group not in grouped_values:
grouped_values[group][0].append(normalize_squad(prediction))
grouped_values[group][1].append(normalize_squad(reference))
em = []
for group in grouped_values.keys():
predictions, targets = grouped_values[group]
for p in predictions:
em.append(metric_max_over_ground_truths(_exact_match_score, p, targets))
return np.mean(em)
def squad_f1_agg(items):
def _f1_score(prediction, target):
"""Computes token f1 score for a single target and prediction."""
prediction_tokens = prediction.split()
target_tokens = target.split()
common = collections.Counter(prediction_tokens) & collections.Counter(
target_tokens
)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(target_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
grouped_values = collections.defaultdict(lambda: ([], []))
for prediction, reference in items:
group, reference = reference.split("_")
if group not in grouped_values:
grouped_values[group][0].append(normalize_squad(prediction))
grouped_values[group][1].append(normalize_squad(reference))
f1 = []
for group in grouped_values.keys():
p, t = grouped_values[group]
f1.append(metric_max_over_ground_truths(_f1_score, p[0], t))
return np.mean(f1)
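Since `process_docs` emits one doc per (query, answer) pair and `doc_to_target` encodes `{passage idx}+{query idx}_{answer}`, the aggregators can regroup items by query and score each prediction against the gold answers after SQuAD-style normalization. An illustrative sketch with made-up indices and strings:

```
items = [
    em(["Paris"], ["3+0_Paris"]),    # query 3+0, gold answer "Paris"
    em(["Paris"], ["3+0_paris!"]),   # same query, an alternate gold answer
    em(["Berlin"], ["3+1_Munich"]),  # a different query, answered incorrectly
]
squad_em_agg(items)  # -> 2/3: both "3+0" predictions match a gold answer, "3+1" does not
```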
group:
- super-glue-t5-prompt
task: super_glue-rte-t5-prompt
dataset_path: super_glue
dataset_name: rte
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'not_entailment']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
group:
- super-glue-t5-prompt
task: super_glue-wic-t5-prompt
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
......@@ -2,7 +2,7 @@ group:
- super-glue-lm-eval-v1
task: wsc
dataset_path: super_glue
dataset_name: wsc
dataset_name: wsc.fixed
output_type: multiple_choice
training_split: train
validation_split: validation
......
import re
from lm_eval.utils import general_detokenize
def t5_prompt_doc_to_text(x):
def _mark_span(text, span_str, span_idx, mark):
pattern_tmpl = r"^((?:\S+\s){N})(W)"
pattern = re.sub("N", str(span_idx), pattern_tmpl)
pattern = re.sub("W", span_str, pattern)
return re.sub(pattern, r"\1{0} \2 {0}".format(mark), text)
text = x["text"]
text = _mark_span(text, x["span1_text"], x["span1_index"], "*")
# Compensate for 2 added "words" added in previous step.
span2_index = x["span2_index"] + 2 * (x["span1_index"] < x["span2_index"])
text = _mark_span(text, x["span2_text"], span2_index, "#")
return text
def default_doc_to_text(x):
raw_passage = x["text"]
# NOTE: HuggingFace span indices are word-based not character-based.
......
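An illustrative input/output pair for the span-marking prompt above (the doc is made up; span indices are word-based):

```
doc = {
    "text": "Mark told Pete many lies about himself.",
    "span1_text": "Pete",
    "span1_index": 2,
    "span2_text": "himself",
    "span2_index": 6,
}
t5_prompt_doc_to_text(doc)
# -> 'Mark told * Pete * many lies about # himself #.'
```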