Commit abd17276 authored by Baber

Merge branch 'smolrefact' into tasklist

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/api/group.py
#	lm_eval/api/task.py
#	lm_eval/evaluator_utils.py
#	lm_eval/tasks/__init__.py
#	lm_eval/utils.py
#	pyproject.toml
parents 00afd536 70314843
dataset_name: wh_movement__stranding_2
include: _template_yaml
task: blimp_nl__wh_movement__stranding_2
dataset_name: wh_movement_restrictions__bridge_verb_1
include: _template_yaml
task: blimp_nl__wh_movement_restrictions__bridge_verb_1
dataset_name: wh_movement_restrictions__bridge_verb_2
include: _template_yaml
task: blimp_nl__wh_movement_restrictions__bridge_verb_2
dataset_name: wh_movement_restrictions__island_1
include: _template_yaml
task: blimp_nl__wh_movement_restrictions__island_1
dataset_name: wh_movement_restrictions__island_2
include: _template_yaml
task: blimp_nl__wh_movement_restrictions__island_2
dataset_name: wh_movement_restrictions__resumptive_prolepsis
include: _template_yaml
task: blimp_nl__wh_movement_restrictions__resumptive_prolepsis
dataset_name: wh_movement_restrictions__superiority
include: _template_yaml
task: blimp_nl__wh_movement_restrictions__superiority
# Catalan Bias Benchmark for Question Answering (CaBBQ)
### Paper
Title: `EsBBQ and CaBBQ: The Spanish and Catalan Bias Benchmarks for Question Answering`
Abstract: [https://arxiv.org/abs/2507.11216](https://arxiv.org/abs/2507.11216)
CaBBQ is a dataset designed to assess social bias across 10 categories in a multiple-choice QA setting, adapted from the original BBQ into the Catalan language and the social context of Spain.
It is fully parallel to the `esbbq` task group, its Spanish counterpart.
### Citation
```
@misc{esbbq-cabbq-2025,
    title={EsBBQ and CaBBQ: The Spanish and Catalan Bias Benchmarks for Question Answering},
    author={Valle Ruiz-Fernández and Mario Mina and Júlia Falcão and Luis Vasquez-Reina and Anna Sallés and Aitor Gonzalez-Agirre and Olatz Perez-de-Viñaspre},
    year={2025},
    eprint={2507.11216},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2507.11216},
}
```
### Groups and Tasks
#### Groups
* `cabbq`: Contains the subtasks covering all demographic categories.
#### Tasks
`for category in ["age", "disability_status", "gender", "lgbtqia", "nationality", "physical_appearance", "race_ethnicity", "religion", "ses", "spanish_region"]:`
* `cabbq_{category}`: Subtask that evaluates on the given category's subset.
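
A minimal usage sketch with the harness's Python API (the model name and arguments below are placeholders, not part of this task; the equivalent CLI call would pass `--tasks cabbq`):

```python
# Sketch: evaluate a Hugging Face model on the CaBBQ group (or one subtask).
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-model",  # placeholder checkpoint
    tasks=["cabbq"],  # or e.g. ["cabbq_gender"] for a single category
    batch_size=8,
)

# Per-subtask and aggregated group metrics are collected under results["results"].
print(results["results"]["cabbq"])
```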
### Metrics
CaBBQ is evaluated with the following four metrics, reported at the level of each subtask and aggregated over the entire group:
* `acc_ambig`: Accuracy over ambiguous instances.
* `acc_disambig`: Accuracy over disambiguated instances.
* `bias_score_ambig`: Bias score over ambiguous instances.
* `bias_score_disambig`: Bias score over disambiguated instances.
See the paper for a thorough explanation and the formulas of these metrics.
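
As a rough orientation only (the paper's definitions are authoritative), the metrics follow the structure of the original BBQ scores: accuracy is computed separately over ambiguous and disambiguated contexts, and the bias scores measure how often non-"unknown" answers align with the probed stereotype. A hedged sketch, assuming the original BBQ formulation:

```python
# Hedged sketch of BBQ-style scores; CaBBQ/EsBBQ may adapt these, see the paper.
def accuracy(preds, golds):
    """Plain accuracy over one split (ambiguous or disambiguated)."""
    return sum(p == g for p, g in zip(preds, golds)) / len(golds)

def bias_score_disambig(n_biased, n_non_unknown):
    """Fraction of non-'unknown' answers matching the stereotype, rescaled to [-1, 1]."""
    return 2 * (n_biased / n_non_unknown) - 1

def bias_score_ambig(acc_ambig, n_biased, n_non_unknown):
    """Disambiguated-style score scaled by the error rate on ambiguous contexts."""
    return (1 - acc_ambig) * bias_score_disambig(n_biased, n_non_unknown)
```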
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: BSC-LT/CaBBQ
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "Context: {{context}}\nPregunta: {{question}}\nResposta:"
doc_to_target: label
doc_to_choice: "{{[ans0, ans1, 'No hi ha prou informació', 'No hi ha informació suficient', 'No ho sé', 'No sé', 'Falta informació', 'No es pot respondre', 'No se sap', 'No es pot saber', 'No es pot determinar']}}"
process_results: !function utils.process_results
metric_list:
  - metric: acc_ambig
    aggregation: !function utils.acc_ambig_agg
    higher_is_better: true
  - metric: acc_disambig
    aggregation: !function utils.acc_disambig_agg
    higher_is_better: true
  - metric: bias_score_ambig
    aggregation: !function utils.bias_score_ambig_agg
    higher_is_better: false
  - metric: bias_score_disambig
    aggregation: !function utils.bias_score_disambig_agg
    higher_is_better: false
metadata:
  version: 1.0
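
The referenced `utils.py` is not shown in this diff. In the harness, a `process_results` hook returns a per-document dict keyed by metric name, and each `aggregation` function reduces the collected values for one metric. A hypothetical sketch of how the ambiguous/disambiguated split could be wired (field names such as `context_condition` are assumptions, not the dataset's documented schema):

```python
# Hypothetical sketch of utils.py; the real implementation ships with the task.
import numpy as np

def process_results(doc, results):
    # For multiple_choice tasks, `results` holds one (loglikelihood, is_greedy)
    # pair per answer choice; the prediction is the highest-likelihood choice.
    pred = int(np.argmax([ll for ll, _ in results]))
    correct = float(pred == doc["label"])
    ambig = doc.get("context_condition") == "ambig"  # assumed field name
    # Score each document only under the metrics of its own split; NaN marks
    # "not applicable" so the aggregation functions can filter it out.
    # The bias_score_* entries would additionally track whether `pred` is the
    # bias-aligned (non-"unknown") answer; that bookkeeping is omitted here.
    return {
        "acc_ambig": correct if ambig else float("nan"),
        "acc_disambig": correct if not ambig else float("nan"),
    }

def acc_ambig_agg(items):
    vals = [v for v in items if not np.isnan(v)]
    return float(np.mean(vals)) if vals else 0.0

def acc_disambig_agg(items):
    return acc_ambig_agg(items)  # same filtering, different metric key
```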
group: cabbq
task:
  - cabbq_age
  - cabbq_disability_status
  - cabbq_gender
  - cabbq_lgbtqia
  - cabbq_nationality
  - cabbq_physical_appearance
  - cabbq_race_ethnicity
  - cabbq_religion
  - cabbq_ses
  - cabbq_spanish_region
tag:
  - social_bias
aggregate_metric_list:
  - metric: "acc_ambig"
    weight_by_size: true
  - metric: "acc_disambig"
    weight_by_size: true
  - metric: "bias_score_ambig"
    weight_by_size: true
  - metric: "bias_score_disambig"
    weight_by_size: true
# `weight_by_size`:
#   `true` for micro average: retain all subtasks' per-document results and take the mean over all documents' scores to get the aggregate mean
#   `false` for macro average: take the mean of the subtasks' aggregated results
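
A toy illustration of the `weight_by_size` distinction, with hypothetical numbers for two subtasks of unequal size:

```python
# Two subtasks with 90 and 10 documents respectively (hypothetical scores).
scores_a = [1.0] * 90   # 90 documents, all correct
scores_b = [0.0] * 10   # 10 documents, all wrong

# weight_by_size: true  -> micro average over all documents
micro = sum(scores_a + scores_b) / (len(scores_a) + len(scores_b))          # 0.90

# weight_by_size: false -> macro average of the per-subtask means
macro = (sum(scores_a) / len(scores_a) + sum(scores_b) / len(scores_b)) / 2  # 0.50

print(micro, macro)
```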
include: _cabbq_common_yaml
task: cabbq_age
dataset_name: Age
include: _cabbq_common_yaml
task: cabbq_disability_status
dataset_name: DisabilityStatus
include: _cabbq_common_yaml
task: cabbq_gender
dataset_name: Gender
include: _cabbq_common_yaml
task: cabbq_lgbtqia
dataset_name: LGBTQIA
include: _cabbq_common_yaml
task: cabbq_nationality
dataset_name: Nationality
include: _cabbq_common_yaml
task: cabbq_physical_appearance
dataset_name: PhysicalAppearance
include: _cabbq_common_yaml
task: cabbq_race_ethnicity
dataset_name: RaceEthnicity
include: _cabbq_common_yaml
task: cabbq_religion
dataset_name: Religion
include: _cabbq_common_yaml
task: cabbq_ses
dataset_name: SES
include: _cabbq_common_yaml
task: cabbq_spanish_region
dataset_name: SpanishRegion