Merge branch 'main' into comma

3e8135ce · Baber · 8e560c96 · 0c134ee9 · 3e8135ce · 3e8135ce
Commit 3e8135ce authored Sep 16, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/blimp_nl/verb_second__order_embedded.yaml
+++ b/lm_eval/tasks/blimp_nl/verb_second__order_embedded.yaml
+dataset_name: verb_second__order_embedded
+include: _template_yaml
+task: blimp_nl__verb_second__order_embedded
--- a/lm_eval/tasks/blimp_nl/verb_second__order_main.yaml
+++ b/lm_eval/tasks/blimp_nl/verb_second__order_main.yaml
+dataset_name: verb_second__order_main
+include: _template_yaml
+task: blimp_nl__verb_second__order_main
--- a/lm_eval/tasks/blimp_nl/wh_movement__filler_effect_gap.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement__filler_effect_gap.yaml
+dataset_name: wh_movement__filler_effect_gap
+include: _template_yaml
+task: blimp_nl__wh_movement__filler_effect_gap
--- a/lm_eval/tasks/blimp_nl/wh_movement__filler_effect_no_gap.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement__filler_effect_no_gap.yaml
+dataset_name: wh_movement__filler_effect_no_gap
+include: _template_yaml
+task: blimp_nl__wh_movement__filler_effect_no_gap
--- a/lm_eval/tasks/blimp_nl/wh_movement__hierarchy.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement__hierarchy.yaml
+dataset_name: wh_movement__hierarchy
+include: _template_yaml
+task: blimp_nl__wh_movement__hierarchy
--- a/lm_eval/tasks/blimp_nl/wh_movement__question_formation.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement__question_formation.yaml
+dataset_name: wh_movement__question_formation
+include: _template_yaml
+task: blimp_nl__wh_movement__question_formation
--- a/lm_eval/tasks/blimp_nl/wh_movement__stranding_1.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement__stranding_1.yaml
+dataset_name: wh_movement__stranding_1
+include: _template_yaml
+task: blimp_nl__wh_movement__stranding_1
--- a/lm_eval/tasks/blimp_nl/wh_movement__stranding_2.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement__stranding_2.yaml
+dataset_name: wh_movement__stranding_2
+include: _template_yaml
+task: blimp_nl__wh_movement__stranding_2
--- a/lm_eval/tasks/blimp_nl/wh_movement_restrictions__bridge_verb_1.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement_restrictions__bridge_verb_1.yaml
+dataset_name: wh_movement_restrictions__bridge_verb_1
+include: _template_yaml
+task: blimp_nl__wh_movement_restrictions__bridge_verb_1
--- a/lm_eval/tasks/blimp_nl/wh_movement_restrictions__bridge_verb_2.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement_restrictions__bridge_verb_2.yaml
+dataset_name: wh_movement_restrictions__bridge_verb_2
+include: _template_yaml
+task: blimp_nl__wh_movement_restrictions__bridge_verb_2
--- a/lm_eval/tasks/blimp_nl/wh_movement_restrictions__island_1.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement_restrictions__island_1.yaml
+dataset_name: wh_movement_restrictions__island_1
+include: _template_yaml
+task: blimp_nl__wh_movement_restrictions__island_1
--- a/lm_eval/tasks/blimp_nl/wh_movement_restrictions__island_2.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement_restrictions__island_2.yaml
+dataset_name: wh_movement_restrictions__island_2
+include: _template_yaml
+task: blimp_nl__wh_movement_restrictions__island_2
--- a/lm_eval/tasks/blimp_nl/wh_movement_restrictions__resumptive_prolepsis.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement_restrictions__resumptive_prolepsis.yaml
+dataset_name: wh_movement_restrictions__resumptive_prolepsis
+include: _template_yaml
+task: blimp_nl__wh_movement_restrictions__resumptive_prolepsis
--- a/lm_eval/tasks/blimp_nl/wh_movement_restrictions__superiority.yaml
+++ b/lm_eval/tasks/blimp_nl/wh_movement_restrictions__superiority.yaml
+dataset_name: wh_movement_restrictions__superiority
+include: _template_yaml
+task: blimp_nl__wh_movement_restrictions__superiority
--- a/lm_eval/tasks/cabbq/README.md
+++ b/lm_eval/tasks/cabbq/README.md
+# Catalan Bias Benchmark for Question Answering (CaBBQ)
+### Paper
+Title: `EsBBQ and CaBBQ: The Spanish and Catalan Bias Benchmarks for Question Answering`
+Abstract: [https://arxiv.org/abs/2507.11216](https://arxiv.org/abs/2507.11216)
+CaBBQ is a dataset designed to assess social bias across 10 categories in a multiple-choice QA setting, adapted from the original BBQ to the Catalan language and the social context of Spain.
+It is fully parallel with the `esbbq` task group, its Spanish counterpart.
+### Citation
+```
+@misc{esbbq-cabbq-2025,
+      title={EsBBQ and CaBBQ: The Spanish and Catalan Bias Benchmarks for Question Answering},
+      author={Valle Ruiz-Fernández and Mario Mina and Júlia Falcão and Luis Vasquez-Reina and Anna Sallés and Aitor Gonzalez-Agirre and Olatz Perez-de-Viñaspre},
+      year={2025},
+      eprint={2507.11216},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2507.11216},
+}
+```
+### Groups and Tasks
+#### Groups
+* `cabbq`: Contains the subtasks that cover all demographic categories.
+#### Tasks
+`for category in ["age", "disability_status", "gender", "lgbtqia", "nationality", "physical_appearance", "race_ethnicity", "religion", "ses", "spanish_region"]:`
+  * `cabbq_{category}`: Subtask that evaluates on the given category's subset.
+### Metrics
+CaBBQ is evaluated with the following 4 metrics, at the level of each subtask and with aggregated values for the entire group:
+* `acc_ambig`: Accuracy over ambiguous instances.
+* `acc_disambig`: Accuracy over disambiguated instances.
+* `bias_score_ambig`: Bias score over ambiguous instances.
+* `bias_score_disambig`: Bias score over disambiguated instances.
+See the paper for a thorough explanation and the formulas of these metrics.
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/cabbq/_cabbq_common_yaml
+++ b/lm_eval/tasks/cabbq/_cabbq_common_yaml
+dataset_path: BSC-LT/CaBBQ
+dataset_name: null
+output_type: multiple_choice
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "Context: {{context}}\nPregunta: {{question}}\nResposta:"
+doc_to_target: label
+doc_to_choice: "{{[ans0, ans1, 'No hi ha prou informació', 'No hi ha informació suficient', 'No ho sé', 'No sé', 'Falta informació', 'No es pot respondre', 'No se sap', 'No es pot saber', 'No es pot determinar']}}"
+process_results: !function utils.process_results
+metric_list:
+  - metric: acc_ambig
+    aggregation: !function utils.acc_ambig_agg
+    higher_is_better: true
+  - metric: acc_disambig
+    aggregation: !function utils.acc_disambig_agg
+    higher_is_better: true
+  - metric: bias_score_ambig
+    aggregation: !function utils.bias_score_ambig_agg
+    higher_is_better: false
+  - metric: bias_score_disambig
+    aggregation: !function utils.bias_score_disambig_agg
+    higher_is_better: false
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/cabbq/cabbq.yaml
+++ b/lm_eval/tasks/cabbq/cabbq.yaml
+group: cabbq
+task:
+  - cabbq_age
+  - cabbq_disability_status
+  - cabbq_gender
+  - cabbq_lgbtqia
+  - cabbq_nationality
+  - cabbq_physical_appearance
+  - cabbq_race_ethnicity
+  - cabbq_religion
+  - cabbq_ses
+  - cabbq_spanish_region
+tag:
+  - social_bias
+aggregate_metric_list:
+  - metric: "acc_ambig"
+    weight_by_size: true
+  - metric: "acc_disambig"
+    weight_by_size: true
+  - metric: "bias_score_ambig"
+    weight_by_size: true
+  - metric: "bias_score_disambig"
+    weight_by_size: true
+  # `weight_by_size`:
+  # `true` for micro average: retain all subtasks' per-document results and take the mean over all documents' scores to get the aggregate mean
+  # `false` for macro average: take the mean of the subtasks' aggregated results
--- a/lm_eval/tasks/cabbq/cabbq_age.yaml
+++ b/lm_eval/tasks/cabbq/cabbq_age.yaml
+include: _cabbq_common_yaml
+task: cabbq_age
+dataset_name: Age
--- a/lm_eval/tasks/cabbq/cabbq_disability_status.yaml
+++ b/lm_eval/tasks/cabbq/cabbq_disability_status.yaml
+include: _cabbq_common_yaml
+task: cabbq_disability_status
+dataset_name: DisabilityStatus
--- a/lm_eval/tasks/cabbq/cabbq_gender.yaml
+++ b/lm_eval/tasks/cabbq/cabbq_gender.yaml
+include: _cabbq_common_yaml
+task: cabbq_gender
+dataset_name: Gender