Commit c4b0c0cb authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 6b20ae8c de496b80
include: _babilong_common_yaml
task: babilong_qa8
test_split: qa8
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa8
description: "I will give you context with the facts about people and objects they carry, hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nYour answer should contain only one or two words: $nothing$ or $object$ or $object_1$, $object_2$. Do not write anything else. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Sandra travelled to the garden. Mary grabbed the milk there."
      question: "What is Mary carrying?"
      target: "milk"
    - input: "Mary travelled to the kitchen. Sandra travelled to the office. John travelled to the office. Sandra discarded the milk there."
      question: "What is Sandra carrying?"
      target: "nothing"
    - input: "Daniel grabbed the apple there. Mary went to the office. Daniel moved to the garden. Daniel grabbed the milk there. Mary went to the kitchen."
      question: "What is Daniel carrying?"
      target: "apple,milk"
include: _babilong_common_yaml
task: babilong_qa9
test_split: qa9
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa9
description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - $yes$ or $no$. Do not write anything else. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "John is not in the bathroom. Sandra is not in the bedroom."
      question: "Is John in the bathroom?"
      target: "no"
    - input: "Mary journeyed to the kitchen. John is in the bedroom. Sandra is not in the garden."
      question: "Is Mary in the kitchen?"
      target: "yes"
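# Note: `custom_dataset: !function common_utils.load_dataset` resolves to the
# loader in common_utils.py below; everything under `dataset_kwargs` is passed
# to it as keyword arguments. Since the loader also reads `max_seq_lengths`
# (default "0k"), a config could select a longer-context setting, e.g.
# (illustrative; the "4k" config name is an assumption about the hub dataset):
#
# dataset_kwargs:
#   qa_split: qa9
#   max_seq_lengths: 4k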
import logging
import re
from functools import cache
from typing import TYPE_CHECKING, Union

import datasets
from transformers import AutoTokenizer

if TYPE_CHECKING:
    import transformers

eval_logger = logging.getLogger(__name__)


@cache
def get_tokenizer(
    tokenizer=None, pretrained=None, **kwargs
) -> Union["transformers.PreTrainedTokenizer", "transformers.PreTrainedTokenizerFast"]:
    """Load (and memoize) the tokenizer named by `tokenizer` or `pretrained`."""
    pretrained = tokenizer or pretrained
    assert pretrained, "No tokenizer or pretrained provided."
    eval_logger.info(f"Using tokenizer {pretrained} for babilong tasks.")
    return AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)


def postprocess_pred(prediction: list[str]) -> list[str]:
    """Strip whitespace and control characters from each model prediction."""
    np_pattern = re.compile(r"[\x00-\x1f]")  # non-printable (control) characters
    res = []
    for predict_str in prediction:
        predict_str = predict_str.strip()
        # Replace control characters with newlines, then trim again
        predict_str = np_pattern.sub("\n", predict_str).strip()
        res.append(predict_str)
    return res


def load_dataset(**kwargs):
    """Load the requested babilong QA split from the Hugging Face hub."""
    config_name = kwargs.get("max_seq_lengths", "0k")
    # Get the specific qa split (e.g. "qa8", "qa9")
    qa_split = kwargs.get("qa_split")
    eval_logger.info(
        f"Loading babilong dataset: max_seq_lengths={config_name}, split={qa_split}"
    )
    dataset = datasets.load_dataset(
        "RMT-team/babilong-1k-samples", name=config_name, split=qa_split
    )
    return {qa_split: dataset}


def process_results(doc: dict, results: list[str]) -> dict[str, float]:
    """Score one document: 1.0 if the gold target appears in the prediction."""
    pred = postprocess_pred(results)
    target = doc.get("target", "").strip()
    # Case-insensitive substring match
    score = 1.0 if target.lower() in pred[0].lower() else 0.0
    return {"acc": score}
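# Usage sketch (illustrative only, not part of the harness API): the scoring
# path above is a case-insensitive substring match between the gold target and
# the first cleaned prediction, e.g.:
#
#     doc = {"target": "milk"}
#     results = ["  Milk\x07 "]      # raw model output with a control character
#     postprocess_pred(results)      # -> ["Milk"]
#     process_results(doc, results)  # -> {"acc": 1.0}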
# BHS: Controlled Evaluation of Syntactic Knowledge in Basque, Hindi, and Swahili
## Paper
Title: Controlled Evaluation of Syntactic Knowledge in Multilingual Language Models
Abstract:
> Language models (LMs) are capable of acquiring elements of human-like syntactic knowledge. Targeted syntactic evaluation tests have been employed to measure how well they form generalizations about syntactic phenomena in high-resource languages such as English. However, we still lack a thorough understanding of LMs' capacity for syntactic generalizations in low-resource languages, which are responsible for much of the diversity of syntactic patterns worldwide. In this study, we develop targeted syntactic evaluation tests for three low-resource languages (Basque, Hindi, and Swahili) and use them to evaluate five families of open-access multilingual Transformer LMs. We find that some syntactic tasks prove relatively easy for LMs while others (agreement in sentences containing indirect objects in Basque, agreement across a prepositional phrase in Swahili) are challenging. We additionally uncover issues with publicly available Transformers, including a bias toward the habitual aspect in Hindi in multilingual BERT and underperformance compared to similar-sized models in XGLM-4.5B. ([Kryvosheieva & Levy, 2025](https://aclanthology.org/2025.loreslm-1.30/))
Homepage: https://github.com/dariakryvosheieva/syntactic_generalization_multilingual
### Citation
```
@inproceedings{kryvosheieva-levy-2025-controlled,
title = "Controlled Evaluation of Syntactic Knowledge in Multilingual Language Models",
author = "Kryvosheieva, Daria and Levy, Roger",
editor = "Hettiarachchi, Hansi and Ranasinghe, Tharindu and Rayson, Paul and Mitkov, Ruslan and Gaber, Mohamed and Premasiri, Damith and Tan, Fiona Anting and Uyangodage, Lasitha",
booktitle = "Proceedings of the First Workshop on Language Models for Low-Resource Languages",
month = jan,
year = "2025",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.loreslm-1.30/",
pages = "402--413"
}
```
### Groups, Tags, and Tasks
* `bhs_basque`: Run all Basque tasks (listed below) and calculate mean performance. In all tasks, the goal is for the model to predict the auxiliary verb (AUX) that correctly agrees with the subject (S), direct object (DO), and indirect object (IO). Each task manipulates a different one of these: e.g., in `bhs__basque__DO__S_IO_DO_V_AUX`, both presented sentences (with `S_IO_DO_V_AUX` structure) contain auxiliary verbs that agree with the subject and indirect object, and the task is to assign a higher probability to the one that also agrees with the direct object (DO) than to the one that does not. For specific examples, see [Kryvosheieva & Levy (2025)](https://aclanthology.org/2025.loreslm-1.30/).
* `bhs__basque__DO__S_DO_V_AUX`
* `bhs__basque__DO__S_IO_DO_V_AUX`
* `bhs__basque__IO__IO_S_V_AUX`
* `bhs__basque__IO__S_IO_DO_V_AUX`
* `bhs__basque__S__IO_S_V_AUX`
* `bhs__basque__S__S_DO_V_AUX`
* `bhs__basque__S__S_IO_DO_V_AUX`
* `bhs__basque__S__S_V_AUX`
* `bhs_hindi`: Run all Hindi tasks (listed below) and calculate mean performance. In all tasks, the goal is for the model to assign a higher probability to the correct final verb form: in sentences with the 'ne' clitic, the verb should be perfective; in sentences without it, non-perfective (here, habitual or progressive). For specific examples, see [Kryvosheieva & Levy (2025)](https://aclanthology.org/2025.loreslm-1.30/).
* `bhs__hindi__S_O_V`
* `bhs__hindi__S_PossPRN_O_V`
* `bhs__hindi__S_PossPRN_PossN_O_V`
* `bhs__hindi__S_ne_O_V`
* `bhs__hindi__S_ne_PossPRN_O_V`
* `bhs__hindi__S_ne_PossPRN_PossN_O_V`
* `bhs_swahili`: Run all Swahili tasks (listed below) and calculate mean performance. In all tasks, the goal is for the model to assign a higher probability to the final word, a verb (V) or adjective (A/AN), when it correctly agrees with the initial noun (in noun class) than when it does not. For specific examples, see [Kryvosheieva & Levy (2025)](https://aclanthology.org/2025.loreslm-1.30/). A sample invocation covering all three groups follows the task lists below.
* `bhs__swahili__N_of_Poss_D_AP_V_ni_AN`
* `bhs__swahili__N_of_Poss_D_AP_ni_AN`
* `bhs__swahili__N_of_Poss_D_A_V`
* `bhs__swahili__N_of_Poss_D_A_V1_V2`
* `bhs__swahili__N_of_Poss_D_V`
* `bhs__swahili__N_of_Poss_D_ni_A`
* `bhs__swahili__N_of_Poss_V`
* `bhs__swahili__N_of_Poss_ni_A`
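
For example, all three language groups can be evaluated in a single run (the model arguments below are illustrative; any model type supported by the harness works):

```
lm_eval --model hf \
    --model_args pretrained=facebook/xglm-4.5B \
    --tasks bhs_basque,bhs_hindi,bhs_swahili
```
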
**Implementation Note:** The [original implementation](https://github.com/dariakryvosheieva/syntactic_generalization_multilingual) normalizes the log-probability of the final word by its length in number of tokens, which is not supported by the Language Model Evaluation Harness (see [[1](https://blog.eleuther.ai/multiple-choice-normalization/)], [[2](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md)], [[3](https://github.com/EleutherAI/lm-evaluation-harness/issues/1396)]). For this reason, the implementation provided here includes both the `acc` (accuracy based on comparing the unnormalized log-probability of the correct and incorrect versions of each sentence) and `acc_norm` (the same as `acc` but with sentence log-probability normalized by number of bytes) metrics.
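
A minimal sketch of the difference between the two metrics, assuming the per-continuation log-probabilities have already been computed (function names are illustrative, not harness internals):

```python
def acc(logprob_good: float, logprob_bad: float) -> float:
    # Unnormalized: compare total log-probabilities directly.
    return 1.0 if logprob_good > logprob_bad else 0.0


def acc_norm(
    logprob_good: float, logprob_bad: float, text_good: str, text_bad: str
) -> float:
    # Byte-normalized: divide each log-probability by the UTF-8 byte
    # length of its continuation before comparing.
    norm_good = logprob_good / len(text_good.encode("utf-8"))
    norm_bad = logprob_bad / len(text_bad.encode("utf-8"))
    return 1.0 if norm_good > norm_bad else 0.0
```
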
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
### Changelog
dataset_path: jmichaelov/bhs
output_type: multiple_choice
test_split: test
doc_to_text: "{{context}}"
doc_to_target: 0
doc_to_choice: "{{[ending_good, ending_bad]}}"
num_fewshot: 0
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0
dataset_name: basque-DO-S_DO_V_AUX
include: _template_yaml
task: bhs__basque__DO__S_DO_V_AUX
dataset_name: basque-DO-S_IO_DO_V_AUX
include: _template_yaml
task: bhs__basque__DO__S_IO_DO_V_AUX
dataset_name: basque-IO-IO_S_V_AUX
include: _template_yaml
task: bhs__basque__IO__IO_S_V_AUX
dataset_name: basque-IO-S_IO_DO_V_AUX
include: _template_yaml
task: bhs__basque__IO__S_IO_DO_V_AUX
dataset_name: basque-S-IO_S_V_AUX
include: _template_yaml
task: bhs__basque__S__IO_S_V_AUX
dataset_name: basque-S-S_DO_V_AUX
include: _template_yaml
task: bhs__basque__S__S_DO_V_AUX
dataset_name: basque-S-S_IO_DO_V_AUX
include: _template_yaml
task: bhs__basque__S__S_IO_DO_V_AUX
dataset_name: basque-S-S_V_AUX
include: _template_yaml
task: bhs__basque__S__S_V_AUX
group: bhs_basque
task:
  - bhs__basque__DO__S_DO_V_AUX
  - bhs__basque__DO__S_IO_DO_V_AUX
  - bhs__basque__IO__IO_S_V_AUX
  - bhs__basque__IO__S_IO_DO_V_AUX
  - bhs__basque__S__IO_S_V_AUX
  - bhs__basque__S__S_DO_V_AUX
  - bhs__basque__S__S_IO_DO_V_AUX
  - bhs__basque__S__S_V_AUX
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
group: bhs_hindi
task:
  - bhs__hindi__S_O_V
  - bhs__hindi__S_PossPRN_O_V
  - bhs__hindi__S_PossPRN_PossN_O_V
  - bhs__hindi__S_ne_O_V
  - bhs__hindi__S_ne_PossPRN_O_V
  - bhs__hindi__S_ne_PossPRN_PossN_O_V
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
group: bhs_swahili
task:
  - bhs__swahili__N_of_Poss_D_AP_V_ni_AN
  - bhs__swahili__N_of_Poss_D_AP_ni_AN
  - bhs__swahili__N_of_Poss_D_A_V
  - bhs__swahili__N_of_Poss_D_A_V1_V2
  - bhs__swahili__N_of_Poss_D_V
  - bhs__swahili__N_of_Poss_D_ni_A
  - bhs__swahili__N_of_Poss_V
  - bhs__swahili__N_of_Poss_ni_A
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
dataset_name: hindi-S_O_V
include: _template_yaml
task: bhs__hindi__S_O_V
dataset_name: hindi-S_PossPRN_O_V
include: _template_yaml
task: bhs__hindi__S_PossPRN_O_V
dataset_name: hindi-S_PossPRN_PossN_O_V
include: _template_yaml
task: bhs__hindi__S_PossPRN_PossN_O_V
dataset_name: hindi-S_ne_O_V
include: _template_yaml
task: bhs__hindi__S_ne_O_V