Add BabiLong (#3287)

* create babilong tasks
* lint
* add clarification
* fix typo
* add babilong description

Add BabiLong (#3287)
* create babilong tasks
* lint
* add clarification
* fix typo
* add babilong description
ccfa4ad1 · Janna · GitHub · fec9dde7 · ccfa4ad1 · ccfa4ad1
Unverified Commit ccfa4ad1 authored Sep 20, 2025 by Janna Committed by GitHub Sep 21, 2025
6 changed files
--- a/lm_eval/tasks/babilong/babilong_qa5.yaml
+++ b/lm_eval/tasks/babilong/babilong_qa5.yaml
+include: _babilong_common_yaml
+task: babilong_qa5
+test_split: qa5
+custom_dataset: !function common_utils.load_dataset
+dataset_kwargs:
+  qa_split: qa5
+description: "I will give you context with the facts about locations and their relations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word. Do not write anything else after that. Do not explain your answer.\n\n"
+doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
+fewshot_config:
+  sampler: first_n
+  samples:
+    - input: "Mary picked up the apple there. Mary gave the apple to Fred. Mary moved to the bedroom. Bill took the milk there."
+      question: "Who did Mary give the apple to?"
+      target: "Fred"
+    - input: "Jeff took the football there. Jeff passed the football to Fred. Jeff got the milk there. Bill travelled to the bedroom."
+      question: "Who gave the football?"
+      target: "Jeff"
+    - input: "Fred picked up the apple there. Fred handed the apple to Bill. Bill journeyed to the bedroom. Jeff went back to the garden."
+      question: "What did Fred give to Bill?"
+      target: "apple"
--- a/lm_eval/tasks/babilong/babilong_qa6.yaml
+++ b/lm_eval/tasks/babilong/babilong_qa6.yaml
+include: _babilong_common_yaml
+task: babilong_qa6
+test_split: qa6
+custom_dataset: !function common_utils.load_dataset
+dataset_kwargs:
+  qa_split: qa6
+description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts. If a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - $yes$ or $no$. Do not write anything else after that.\nDo not explain your answer.\n\n"
+doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
+fewshot_config:
+  sampler: first_n
+  samples:
+    - input: "John travelled to the hallway. John travelled to the garden."
+      question: "Is John in the garden?"
+      target: "yes"
+    - input: "Mary went to the office. Daniel journeyed to the hallway. Mary went to the bedroom. Sandra went to the garden."
+      question: "Is Mary in the office?"
+      target: "no"
--- a/lm_eval/tasks/babilong/babilong_qa7.yaml
+++ b/lm_eval/tasks/babilong/babilong_qa7.yaml
+include: _babilong_common_yaml
+task: babilong_qa7
+test_split: qa7
+custom_dataset: !function common_utils.load_dataset
+dataset_kwargs:
+  qa_split: qa7
+description: "I will give you context with the facts about people and objects they carry, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word - $none$ or $number_of_objects$.\nDo not write anything else after that. Do not explain your answer.\n\n"
+doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
+fewshot_config:
+  sampler: first_n
+  samples:
+    - input: "Daniel went to the bedroom. Daniel got the apple there."
+      question: "How many objects is Daniel carrying?"
+      target: "one"
+    - input: "Mary grabbed the apple there. Mary gave the apple to John."
+      question: "How many objects is Mary carrying?"
+      target: "none"
+    - input: "Sandra travelled to the hallway. Sandra picked up the milk there. Sandra took the apple there. Mary travelled to the garden."
+      question: "How many objects is Sandra carrying?"
+      target: "two"
--- a/lm_eval/tasks/babilong/babilong_qa8.yaml
+++ b/lm_eval/tasks/babilong/babilong_qa8.yaml
+include: _babilong_common_yaml
+task: babilong_qa8
+test_split: qa8
+custom_dataset: !function common_utils.load_dataset
+dataset_kwargs:
+  qa_split: qa8
+description: "I will give you context with the facts about people and objects they carry, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one or two words: $nothing$ or $object$ or $object_1$, $object_2$. Do not write anything else. Do not explain your answer.\n\n"
+doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
+fewshot_config:
+  sampler: first_n
+  samples:
+    - input: "Sandra travelled to the garden. Mary grabbed the milk there."
+      question: "What is Mary carrying?"
+      target: "milk"
+    - input: "Mary travelled to the kitchen. Sandra travelled to the office. John travelled to the office. Sandra discarded the milk there."
+      question: "What is Sandra carrying?"
+      target: "nothing"
+    - input: "Daniel grabbed the apple there. Mary went to the office. Daniel moved to the garden. Daniel grabbed the milk there. Mary went to the kitchen."
+      question: "What is Daniel carrying?"
+      target: "apple,milk"
--- a/lm_eval/tasks/babilong/babilong_qa9.yaml
+++ b/lm_eval/tasks/babilong/babilong_qa9.yaml
+include: _babilong_common_yaml
+task: babilong_qa9
+test_split: qa9
+custom_dataset: !function common_utils.load_dataset
+dataset_kwargs:
+  qa_split: qa9
+description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - $yes$ or $no$. Do not write anything else. Do not explain your answer.\n\n"
+doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
+fewshot_config:
+  sampler: first_n
+  samples:
+    - input: "John is not in the bathroom. Sandra is not in the bedroom."
+      question: "Is John in the bathroom?"
+      target: "no"
+    - input: "Mary journeyed to the kitchen. John is in the bedroom. Sandra is not in the garden."
+      question: "Is Mary in the kitchen?"
+      target: "yes"
--- a/lm_eval/tasks/babilong/common_utils.py
+++ b/lm_eval/tasks/babilong/common_utils.py
+import logging
+import re
+from functools import cache
+from typing import TYPE_CHECKING, Union
+import datasets
+from transformers import AutoTokenizer
+if TYPE_CHECKING:
+    import transformers
+eval_logger = logging.getLogger(__name__)
+@cache
+def get_tokenizer(
+    tokenizer=None, pretrained=None, **kwargs
+) -> Union["transformers.PreTrainedTokenizer", "transformers.PreTrainedTokenizerFast"]:
+    """Load (and memoise) the tokenizer used by the babilong tasks.
+
+    Args:
+        tokenizer: Tokenizer name or path; wins over ``pretrained`` when truthy.
+        pretrained: Fallback name or path used when ``tokenizer`` is falsy.
+        **kwargs: Accepted but not forwarded to the loader. They still take
+            part in the ``functools.cache`` key, so their values must be
+            hashable.
+
+    Returns:
+        Whatever ``AutoTokenizer.from_pretrained`` returns for the chosen name.
+
+    Raises:
+        AssertionError: If neither ``tokenizer`` nor ``pretrained`` is truthy.
+    """
+    if tokenizer:
+        pretrained = tokenizer
+    assert pretrained, "No tokenizer or pretrained provided."
+    eval_logger.info(f"Using tokenizer {pretrained} for babilong tasks.")
+    # NOTE(review): trust_remote_code=True presumably allows code shipped with
+    # the tokenizer repository to run locally - confirm this is intended.
+    loaded = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
+    return loaded
+# ASCII control characters (code points 0x00-0x1f); compiled once at import
+# time instead of once per prediction.
+_CONTROL_CHARS = re.compile(r"[\x00-\x1f]")
+def postprocess_pred(prediction: list[str]) -> list[str]:
+    """Normalise raw model outputs before scoring.
+
+    Each string is stripped of surrounding whitespace, every ASCII control
+    character (0x00-0x1f) is replaced by a newline - replaced, not removed -
+    and the result is stripped again so control characters at either end
+    disappear entirely.
+
+    Args:
+        prediction: Raw generated strings.
+
+    Returns:
+        A new list with one cleaned string per input string, in order.
+    """
+    return [
+        _CONTROL_CHARS.sub("\n", text.strip()).strip() for text in prediction
+    ]
+def load_dataset(**kwargs):
+    """Load one babilong QA split and return it keyed by the split name.
+
+    Keyword Args:
+        qa_split: Split to request from the hub dataset (e.g. ``"qa5"``).
+        max_seq_lengths: Dataset configuration name; defaults to ``"0k"``.
+
+    Any other keyword arguments are ignored.
+
+    Returns:
+        A one-entry dict mapping ``qa_split`` to the loaded dataset.
+    """
+    qa_split = kwargs.get("qa_split")
+    # The configuration name is read from the "max_seq_lengths" kwarg.
+    config_name = kwargs.get("max_seq_lengths", "0k")
+    eval_logger.info(
+        f"Loading babilong dataset: max_seq_lengths={config_name}, split={qa_split}"
+    )
+    return {
+        qa_split: datasets.load_dataset(
+            "RMT-team/babilong-1k-samples", name=config_name, split=qa_split
+        )
+    }
+def process_results(doc: dict, results: list[str]) -> dict[str, float]:
+    """Score one document by case-insensitive substring match.
+
+    The first generated result is cleaned with ``postprocess_pred`` and the
+    document's ``target`` must occur somewhere inside it.
+
+    Args:
+        doc: Dataset row; its ``"target"`` entry holds the reference answer.
+        results: Generated strings for this document; only the first is scored.
+
+    Returns:
+        ``{"acc": 1.0}`` when the target is found in the prediction,
+        otherwise ``{"acc": 0.0}``.
+    """
+    # ``or ""`` also covers an explicit ``"target": None`` entry.
+    target = (doc.get("target") or "").strip().lower()
+    # An empty string is a substring of every prediction, so a missing target
+    # would otherwise always score 1.0; with no results there is nothing to
+    # score either. Both cases count as a miss.
+    if not target or not results:
+        return {"acc": 0.0}
+    # String match
+    first_pred = postprocess_pred(results)[0].lower()
+    return {"acc": 1.0 if target in first_pred else 0.0}