Commit abd17276 authored by Baber's avatar Baber

Merge branch 'smolrefact' into tasklist

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/api/group.py
#	lm_eval/api/task.py
#	lm_eval/evaluator_utils.py
#	lm_eval/tasks/__init__.py
#	lm_eval/utils.py
#	pyproject.toml
parents 00afd536 70314843
include: _babilong_common_yaml
task: babilong_qa17
test_split: qa17
dataset_name: 0k
description: "I will give you context with the facts about different figures, their location and colors, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word - $yes$ or $no$. Do not write anything else.\nDo not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "The triangle is above the pink rectangle. The blue square is to the left of the triangle."
      question: "Is the pink rectangle to the right of the blue square?"
      target: "yes"
    - input: "The red sphere is to the left of the yellow square. The red sphere is below the pink rectangle."
      question: "Is the pink rectangle to the left of the yellow square?"
      target: "yes"
    - input: "The red sphere is above the pink rectangle. The red sphere is to the right of the red square."
      question: "Is the pink rectangle above the red square?"
      target: "no"
include: _babilong_common_yaml
task: babilong_qa18
test_split: qa18
dataset_name: 0k
description: "I will give you context with the facts about different objects and their sizes, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word - $yes$ or $no$. Do not write anything else.\nDo not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "The box of chocolates fits inside the chest. The box is bigger than the chest. The box is bigger than the suitcase. The suitcase fits inside the box. The container is bigger than the box of chocolates."
      question: "Does the box fit in the box of chocolates?"
      target: "no"
    - input: "The suitcase is bigger than the container. The container fits inside the box. The chest is bigger than the chocolate. The suitcase fits inside the box. The chest fits inside the box."
      question: "Does the chocolate fit in the box?"
      target: "yes"
    - input: "The chocolate fits inside the box of chocolates. The suitcase fits inside the box. The chocolate fits inside the box. The box is bigger than the box of chocolates. The suitcase is bigger than the box of chocolates."
      question: "Is the chocolate bigger than the box?"
      target: "no"
include: _babilong_common_yaml
task: babilong_qa19
test_split: qa19
dataset_name: 0k
description: "I will give you context with the facts about different places and their locations, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only two letters, separated by a comma - ordinal directions. You can choose the letters from $n$, $s$, $e$ and $w$. Do not write anything else after that.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "The office is east of the hallway. The kitchen is north of the office. The garden is west of the bedroom. The office is west of the garden. The bathroom is north of the garden."
      question: "How do you go from the kitchen to the garden?"
      target: "s,e"
    - input: "The bedroom is west of the hallway. The office is east of the garden. The garden is north of the kitchen. The kitchen is north of the bathroom. The hallway is west of the garden."
      question: "How do you go from the kitchen to the hallway?"
      target: "n,w"
    - input: "The bedroom is south of the hallway. The bathroom is east of the office. The kitchen is west of the garden. The garden is south of the office. The office is south of the bedroom."
      question: "How do you go from the garden to the bedroom?"
      target: "n,n"
include: _babilong_common_yaml
task: babilong_qa2
test_split: qa2
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa2
description: "I will give you context with the facts about locations and actions of different persons hidden in some random text and a question. You need to answer the question based only on the information from the facts. If a person got an item in the first location and travelled to the second location the item is also in the second location. If a person dropped an item in the first location and moved to the second location the item remains in the first location.\nAlways return your answer in the following format:\nThe 'item' is in 'location'. Do not write anything else after that.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Charlie went to the kitchen. Charlie got a bottle. Charlie moved to the balcony."
      question: "Where is the bottle?"
      target: "The bottle is in the balcony."
    - input: "Alan moved to the garage. Alan got a screw driver. Alan moved to the kitchen."
      question: "Where is the screw driver?"
      target: "The screw driver is in the kitchen."
include: _babilong_common_yaml
task: babilong_qa20
test_split: qa20
dataset_name: 0k
description: "I will give you context with the facts about people, their locations and condition hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - a person condition or a place. Do not write anything else after that. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Sumit is tired."
      question: "Where will sumit go?"
      target: "bedroom"
    - input: "Yann is hungry. Yann journeyed to the kitchen."
      question: "Why did yann go to the kitchen?"
      target: "hungry"
    - input: "Antoine is thirsty. Yann is tired. Yann went back to the bedroom. Yann picked up the pajamas there. Jason is thirsty. Antoine went back to the kitchen."
      question: "Why did antoine go to the kitchen?"
      target: "thirsty"
include: _babilong_common_yaml
task: babilong_qa3
test_split: qa3
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa3
description: "I give you context with the facts about locations and actions of different persons hidden in some random text and a question. You need to answer the question based only on the information from the facts. If a person got an item in the first location and travelled to the second location the item is also in the second location. If a person dropped an item in the first location and moved to the second location the item remains in the first location.\nAlways return your answer in the following format:\nBefore the $location_1$ the $item$ was in the $location_2$. Do not write anything else after that.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "John journeyed to the bedroom. Mary grabbed the apple. Mary went back to the bathroom. Daniel journeyed to the bedroom. Daniel moved to the garden. Mary travelled to the kitchen."
      question: "Where was the apple before the kitchen?"
      target: "Before the kitchen the apple was in the bathroom."
    - input: "John went back to the bedroom. John went back to the garden. John went back to the kitchen. Sandra took the football. Sandra travelled to the garden. Sandra journeyed to the bedroom."
      question: "Where was the football before the bedroom?"
      target: "Before the bedroom the football was in the garden."
include: _babilong_common_yaml
task: babilong_qa4
test_split: qa4
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa4
description: "I will give you context with the facts about different people, their location and actions, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word - location. Do not write anything else after that.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "The hallway is south of the kitchen. The bedroom is north of the kitchen."
      question: "What is the kitchen south of?"
      target: "bedroom"
    - input: "The garden is west of the bedroom. The bedroom is west of the kitchen."
      question: "What is west of the bedroom?"
      target: "garden"
include: _babilong_common_yaml
task: babilong_qa5
test_split: qa5
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa5
description: "I will give you context with the facts about locations and their relations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word. Do not write anything else after that. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Mary picked up the apple there. Mary gave the apple to Fred. Mary moved to the bedroom. Bill took the milk there."
      question: "Who did Mary give the apple to?"
      target: "Fred"
    - input: "Jeff took the football there. Jeff passed the football to Fred. Jeff got the milk there. Bill travelled to the bedroom."
      question: "Who gave the football?"
      target: "Jeff"
    - input: "Fred picked up the apple there. Fred handed the apple to Bill. Bill journeyed to the bedroom. Jeff went back to the garden."
      question: "What did Fred give to Bill?"
      target: "apple"
include: _babilong_common_yaml
task: babilong_qa6
test_split: qa6
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa6
description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts. If a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - $yes$ or $no$. Do not write anything else after that.\nDo not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "John travelled to the hallway. John travelled to the garden."
      question: "Is John in the garden?"
      target: "yes"
    - input: "Mary went to the office. Daniel journeyed to the hallway. Mary went to the bedroom. Sandra went to the garden."
      question: "Is Mary in the office?"
      target: "no"
include: _babilong_common_yaml
task: babilong_qa7
test_split: qa7
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa7
description: "I will give you context with the facts about people and objects they carry, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word - $none$ or $number_of_objects$.\nDo not write anything else after that. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Daniel went to the bedroom. Daniel got the apple there."
      question: "How many objects is Daniel carrying?"
      target: "one"
    - input: "Mary grabbed the apple there. Mary gave the apple to John."
      question: "How many objects is Mary carrying?"
      target: "none"
    - input: "Sandra travelled to the hallway. Sandra picked up the milk there. Sandra took the apple there. Mary travelled to the garden."
      question: "How many objects is Sandra carrying?"
      target: "two"
include: _babilong_common_yaml
task: babilong_qa8
test_split: qa8
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa8
description: "I will give you context with the facts about people and objects they carry, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one or two words: $nothing$ or $object$ or $object_1$, $object_2$. Do not write anything else. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Sandra travelled to the garden. Mary grabbed the milk there."
      question: "What is Mary carrying?"
      target: "milk"
    - input: "Mary travelled to the kitchen. Sandra travelled to the office. John travelled to the office. Sandra discarded the milk there."
      question: "What is Sandra carrying?"
      target: "nothing"
    - input: "Daniel grabbed the apple there. Mary went to the office. Daniel moved to the garden. Daniel grabbed the milk there. Mary went to the kitchen."
      question: "What is Daniel carrying?"
      target: "apple,milk"
include: _babilong_common_yaml
task: babilong_qa9
test_split: qa9
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa9
description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - $yes$ or $no$. Do not write anything else. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "John is not in the bathroom. Sandra is not in the bedroom."
      question: "Is John in the bathroom?"
      target: "no"
    - input: "Mary journeyed to the kitchen. John is in the bedroom. Sandra is not in the garden."
      question: "Is Mary in the kitchen?"
      target: "yes"
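All of the task configs above share the same `doc_to_text` Jinja template, `"{{input.strip()}}\n{{question.strip()}}"`. A minimal plain-Python sketch of what that template produces (the `doc` values here are made up, not taken from the dataset):

```python
def render_doc_to_text(doc: dict) -> str:
    # Equivalent of the Jinja template "{{input.strip()}}\n{{question.strip()}}":
    # strip surrounding whitespace from both fields and join them with a newline.
    return f"{doc['input'].strip()}\n{doc['question'].strip()}"


doc = {
    "input": "  Mary journeyed to the kitchen. ",
    "question": " Is Mary in the kitchen?  ",
}
prompt = render_doc_to_text(doc)
print(prompt)
```

The description string is prepended separately by the harness, so the rendered prompt is only the (stripped) context followed by the question.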
import logging
import re
from functools import cache
from typing import TYPE_CHECKING, Union

import datasets
from transformers import AutoTokenizer

if TYPE_CHECKING:
    import transformers

eval_logger = logging.getLogger(__name__)

# Matches all non-printable (ASCII control) characters.
NP_PATTERN = re.compile(r"[\x00-\x1f]")


@cache
def get_tokenizer(
    tokenizer=None, pretrained=None, **kwargs
) -> Union["transformers.PreTrainedTokenizer", "transformers.PreTrainedTokenizerFast"]:
    pretrained = tokenizer or pretrained
    assert pretrained, "No tokenizer or pretrained provided."
    eval_logger.info(f"Using tokenizer {pretrained} for babilong tasks.")
    return AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)


def postprocess_pred(prediction: list[str]) -> list[str]:
    res = []
    for predict_str in prediction:
        # Replace control characters with newlines, then strip whitespace.
        predict_str = NP_PATTERN.sub("\n", predict_str.strip()).strip()
        res.append(predict_str)
    return res


def load_dataset(**kwargs):
    config_name = kwargs.get("max_seq_lengths", "0k")
    # Get the specific qa split.
    qa_split = kwargs.get("qa_split")
    eval_logger.info(
        f"Loading babilong dataset: max_seq_lengths={config_name}, split={qa_split}"
    )
    dataset = datasets.load_dataset(
        "RMT-team/babilong-1k-samples", name=config_name, split=qa_split
    )
    return {qa_split: dataset}


def process_results(doc: dict, results: list[str]) -> dict[str, float]:
    pred = postprocess_pred(results)
    target = doc.get("target", "").strip()
    # Case-insensitive substring match of the target in the prediction.
    score = 1.0 if target.lower() in pred[0].lower() else 0.0
    return {"acc": score}
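To illustrate the scoring path above, here is a self-contained copy of the two scoring functions applied to a couple of made-up generations: control characters and surrounding whitespace are cleaned away before the case-insensitive substring check against the target.

```python
import re

np_pattern = re.compile(r"[\x00-\x1f]")  # non-printable (control) characters


def postprocess_pred(prediction):
    # Clean each raw generation: strip, replace control chars, strip again.
    return [np_pattern.sub("\n", p.strip()).strip() for p in prediction]


def process_results(doc, results):
    pred = postprocess_pred(results)
    target = doc.get("target", "").strip()
    # Case-insensitive substring match of the target in the prediction.
    return {"acc": 1.0 if target.lower() in pred[0].lower() else 0.0}


doc = {"target": "yes"}
print(process_results(doc, ["  Yes.\x1f  "]))       # {'acc': 1.0}
print(process_results(doc, ["The answer is no."]))  # {'acc': 0.0}
```

Note that because scoring is a substring check, short targets can match inside longer words (e.g. a target of "no" matches a prediction containing "know"), which is worth keeping in mind when reading per-task accuracies.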
# BHS: Controlled Evaluation of Syntactic Knowledge in Basque, Hindi, and Swahili
## Paper
Title: Controlled Evaluation of Syntactic Knowledge in Multilingual Language Models
Abstract:
> Language models (LMs) are capable of acquiring elements of human-like syntactic knowledge. Targeted syntactic evaluation tests have been employed to measure how well they form generalizations about syntactic phenomena in high-resource languages such as English. However, we still lack a thorough understanding of LMs' capacity for syntactic generalizations in low-resource languages, which are responsible for much of the diversity of syntactic patterns worldwide. In this study, we develop targeted syntactic evaluation tests for three low-resource languages (Basque, Hindi, and Swahili) and use them to evaluate five families of open-access multilingual Transformer LMs. We find that some syntactic tasks prove relatively easy for LMs while others (agreement in sentences containing indirect objects in Basque, agreement across a prepositional phrase in Swahili) are challenging. We additionally uncover issues with publicly available Transformers, including a bias toward the habitual aspect in Hindi in multilingual BERT and underperformance compared to similar-sized models in XGLM-4.5B. ([Kryvosheieva & Levy, 2025](https://aclanthology.org/2025.loreslm-1.30/))
Homepage: https://github.com/dariakryvosheieva/syntactic_generalization_multilingual
### Citation
```
@inproceedings{kryvosheieva-levy-2025-controlled,
title = "Controlled Evaluation of Syntactic Knowledge in Multilingual Language Models",
author = "Kryvosheieva, Daria and Levy, Roger",
editor = "Hettiarachchi, Hansi and Ranasinghe, Tharindu and Rayson, Paul and Mitkov, Ruslan and Gaber, Mohamed and Premasiri, Damith and Tan, Fiona Anting and Uyangodage, Lasitha",
booktitle = "Proceedings of the First Workshop on Language Models for Low-Resource Languages",
month = jan,
year = "2025",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.loreslm-1.30/",
pages = "402--413"
}
```
### Groups, Tags, and Tasks
* `bhs_basque`: Run all Basque tasks (listed below) and calculate mean performance. In all tasks, the goal is for the model to predict the auxiliary verb (AUX) that correctly agrees with the subject (S), direct object (DO), and indirect object (IO). Each task manipulates a different one of these, e.g., for `bhs__basque__DO__S_IO_DO_V_AUX`, the two presented sentences (with `S_IO_DO_V_AUX` structure) have auxiliary verbs that agree with the subject and indirect object, and the task is to correctly assign the one that also agrees with the direct object (DO) a higher probability than the one that does not. For specific examples, see [Kryvosheieva & Levy (2025)](https://aclanthology.org/2025.loreslm-1.30/).
  * `bhs__basque__DO__S_DO_V_AUX`
  * `bhs__basque__DO__S_IO_DO_V_AUX`
  * `bhs__basque__IO__IO_S_V_AUX`
  * `bhs__basque__IO__S_IO_DO_V_AUX`
  * `bhs__basque__S__IO_S_V_AUX`
  * `bhs__basque__S__S_DO_V_AUX`
  * `bhs__basque__S__S_IO_DO_V_AUX`
  * `bhs__basque__S__S_V_AUX`
* `bhs_hindi`: Run all Hindi tasks (listed below) and calculate mean performance. In all tasks, the goal is for the model to predict that in a sentence with the 'ne' clitic, the final verb should be in a perfective form, and in sentences without, it should be in a non-perfective form (in this case, habitual or progressive) by assigning a higher probability to the correct verb. For specific examples, see [Kryvosheieva & Levy (2025)](https://aclanthology.org/2025.loreslm-1.30/).
  * `bhs__hindi__S_O_V`
  * `bhs__hindi__S_PossPRN_O_V`
  * `bhs__hindi__S_PossPRN_PossN_O_V`
  * `bhs__hindi__S_ne_O_V`
  * `bhs__hindi__S_ne_PossPRN_O_V`
  * `bhs__hindi__S_ne_PossPRN_PossN_O_V`
* `bhs_swahili`: Run all Swahili tasks (listed below) and calculate mean performance. In all tasks, the goal is for the model to assign the final word, a verb (V) or adjective (A/AN), a higher probability when it correctly agrees with the initial noun (in terms of noun class) than when it does not. For specific examples, see [Kryvosheieva & Levy (2025)](https://aclanthology.org/2025.loreslm-1.30/).
  * `bhs__swahili__N_of_Poss_D_AP_V_ni_AN`
  * `bhs__swahili__N_of_Poss_D_AP_ni_AN`
  * `bhs__swahili__N_of_Poss_D_A_V`
  * `bhs__swahili__N_of_Poss_D_A_V1_V2`
  * `bhs__swahili__N_of_Poss_D_V`
  * `bhs__swahili__N_of_Poss_D_ni_A`
  * `bhs__swahili__N_of_Poss_V`
  * `bhs__swahili__N_of_Poss_ni_A`
**Implementation Note:** The [original implementation](https://github.com/dariakryvosheieva/syntactic_generalization_multilingual) normalizes the log-probability of the final word by its length in number of tokens, which is not supported by the Language Model Evaluation Harness (see [[1](https://blog.eleuther.ai/multiple-choice-normalization/)], [[2](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md)], [[3](https://github.com/EleutherAI/lm-evaluation-harness/issues/1396)]). For this reason, the implementation provided here includes both the `acc` (accuracy based on comparing the unnormalized log-probability of the correct and incorrect versions of each sentence) and `acc_norm` (the same as `acc` but with sentence log-probability normalized by number of bytes) metrics.
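The difference between the two metrics can be sketched with toy numbers (the log-likelihoods and word strings below are invented for illustration, not real data): `acc` compares raw sentence log-probabilities, while `acc_norm` divides each by the byte length of the continuation, which can flip the decision when the good and bad endings differ in length.

```python
# Hypothetical log-likelihoods for the good and bad continuations of one item.
good = {"logprob": -12.0, "text": "anaiak"}  # 6 bytes
bad = {"logprob": -11.5, "text": "anaia"}    # 5 bytes


def byte_len(s: str) -> int:
    return len(s.encode("utf-8"))


# acc: raw log-probability comparison (the longer good ending loses here).
acc = 1.0 if good["logprob"] > bad["logprob"] else 0.0

# acc_norm: per-byte log-probability comparison (-2.0 vs. -2.3, good wins).
acc_norm = (
    1.0
    if good["logprob"] / byte_len(good["text"]) > bad["logprob"] / byte_len(bad["text"])
    else 0.0
)
print(acc, acc_norm)  # 0.0 1.0
```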
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
### Changelog
dataset_path: jmichaelov/bhs
output_type: multiple_choice
test_split: test
doc_to_text: "{{context}}"
doc_to_target: 0
doc_to_choice: "{{[ending_good, ending_bad]}}"
num_fewshot: 0
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0
dataset_name: basque-DO-S_DO_V_AUX
include: _template_yaml
task: bhs__basque__DO__S_DO_V_AUX
dataset_name: basque-DO-S_IO_DO_V_AUX
include: _template_yaml
task: bhs__basque__DO__S_IO_DO_V_AUX
dataset_name: basque-IO-IO_S_V_AUX
include: _template_yaml
task: bhs__basque__IO__IO_S_V_AUX
dataset_name: basque-IO-S_IO_DO_V_AUX
include: _template_yaml
task: bhs__basque__IO__S_IO_DO_V_AUX
dataset_name: basque-S-IO_S_V_AUX
include: _template_yaml
task: bhs__basque__S__IO_S_V_AUX
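The per-task files above are small because `include: _template_yaml` pulls in the shared keys; conceptually, each task config is the template dict with the task's own keys laid on top. A rough sketch of that merge (assumed semantics: task-level keys override template keys):

```python
# Shared keys from _template_yaml (subset shown).
template = {
    "dataset_path": "jmichaelov/bhs",
    "output_type": "multiple_choice",
    "test_split": "test",
    "num_fewshot": 0,
}

# Keys from one per-task file.
task_overrides = {
    "dataset_name": "basque-DO-S_DO_V_AUX",
    "task": "bhs__basque__DO__S_DO_V_AUX",
}

# Later keys win, so the task file can override anything in the template.
config = {**template, **task_overrides}
print(config["task"], config["dataset_path"])
```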