Merge branch 'big-refactor' into haileyschoelkopf-patch-1

7483a7ea · Lintang Sutawika · GitHub · 7f69c48b · 2f53b190 · 7483a7ea
Unverified Commit 7483a7ea authored Aug 09, 2023 by Lintang Sutawika Committed by GitHub Aug 09, 2023
20 changed files
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -848,13 +848,14 @@ class ConfigurableTask(Task):
        elif self.OUTPUT_TYPE == "multiple_choice":

            choices = self.doc_to_choice(doc)
+            target_delimiter = self._config.target_delimiter
            if self.multiple_input:
                # If there are multiple inputs, choices are placed in the ctx
                cont = self.doc_to_target(doc)
-                arguments = [(ctx, " {}".format(cont)) for ctx in choices]
+                arguments = [(ctx, f"{target_delimiter}{cont}") for ctx in choices]
            else:
                # Otherwise they are placed in the continuation
-                arguments = [(ctx, " {}".format(cont)) for cont in choices]
+                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]

            request_list = [
                Instance(

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -253,7 +253,7 @@ def evaluate(
                    eval_logger.info(
                        f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\n{inst.args[0]}\n(end of prompt on previous line)"
                    )
-                    eval_logger.info("Request:", inst)
+                    eval_logger.info(f"Request: {str(inst)}")

        # aggregate Instances by LM method requested to get output.
        reqtype = (

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -49,7 +49,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] ToxiGen
 - [ ] StoryCloze
 - [ ] NaturalQs
- [ ] CrowS-Pairs (Hailey?)
+- [x] CrowS-Pairs
 - [ ] XCopa (Lintang)
 - [ ] BIG-Bench (Hailey)
 - [ ] XStoryCloze (Lintang)
@@ -58,7 +58,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [ ] XNLI (Lintang)
 - [ ] MGSM
 - [ ] SCROLLS
- [ ] Babi (Hailey)
+- [x] Babi

 # Novel Tasks
 Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.

--- a/lm_eval/tasks/babi/babi.yaml
+++ b/lm_eval/tasks/babi/babi.yaml
+group:
+  - greedy_until
+task: babi
+dataset_path: Muennighoff/babi
+dataset_name: null
+output_type: greedy_until
+training_split: train
+validation_split: valid
+test_split: test
+doc_to_text: "Passage: {{passage}}Question: {{question}}\nAnswer:"
+doc_to_target: " {{answer}}"
+target_delimiter: ""
+generation_kwargs:
+  until:
+    - "\n"
+    - "Passage:"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
--- a/lm_eval/tasks/crows_pairs/README.md
+++ b/lm_eval/tasks/crows_pairs/README.md
+# CrowS-Pairs
+
+### Paper
+
+CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models
+https://aclanthology.org/2020.emnlp-main.154/
+French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked
+language models to a language other than English
+https://aclanthology.org/2022.acl-long.583/
+
+CrowS-Pairs is a challenge set for evaluating language models (LMs) on their tendency
+to generate biased outputs. CrowS-Pairs comes in 2 languages and the English subset has
+a newer version which fixes some of the issues with the original version.
+
+Homepage: https://github.com/nyu-mll/crows-pairs, https://gitlab.inria.fr/french-crows-pairs
+
+### Citation
+
+```bibtex
+@inproceedings{nangia-etal-2020-crows,
+    title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models",
+    author = "Nangia, Nikita  and
+      Vania, Clara  and
+      Bhalerao, Rasika  and
+      Bowman, Samuel R.",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+    month = nov,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2020.emnlp-main.154",
+    doi = "10.18653/v1/2020.emnlp-main.154",
+    pages = "1953--1967",
+    abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",
+}
+
+@inproceedings{neveol-etal-2022-french,
+    title = "{F}rench {C}row{S}-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than {E}nglish",
+    author = {N{\'e}v{\'e}ol, Aur{\'e}lie  and
+      Dupont, Yoann  and
+      Bezan{\c{c}}on, Julien  and
+      Fort, Kar{\"e}n},
+    booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+    month = may,
+    year = "2022",
+    address = "Dublin, Ireland",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2022.acl-long.583",
+    doi = "10.18653/v1/2022.acl-long.583",
+    pages = "8521--8531",
+    abstract = "Warning: This paper contains explicit statements of offensive stereotypes which may be upsetting. Much work on biases in natural language processing has addressed biases linked to the social and cultural experience of English speaking individuals in the United States. We seek to widen the scope of bias studies by creating material to measure social bias in language models (LMs) against specific demographic groups in France. We build on the US-centered CrowS-pairs dataset to create a multilingual stereotypes dataset that allows for comparability across languages while also characterizing biases that are specific to each country and language. We introduce 1,679 sentence pairs in French that cover stereotypes in ten types of bias like gender and age. 1,467 sentence pairs are translated from CrowS-pairs and 212 are newly crowdsourced. The sentence pairs contrast stereotypes concerning underadvantaged groups with the same sentence concerning advantaged groups. We find that four widely used language models (three French, one multilingual) favor sentences that express stereotypes in most bias categories. We report on the translation process from English into French, which led to a characterization of stereotypes in CrowS-pairs including the identification of US-centric cultural traits. We offer guidelines to further extend the dataset to other languages and cultural environments.",
+}
+```
+
+### Subtasks
+
+- `crows_pairs_english`: The entire English subset of the CrowS-Pairs dataset.
+
+The following tasks evaluate sub-areas of bias in the English CrowS-Pairs dataset:
+- `crows_pairs_english_age`
+- `crows_pairs_english_autre`
+- `crows_pairs_english_disability`
+- `crows_pairs_english_gender`
+- `crows_pairs_english_nationality`
+- `crows_pairs_english_physical_appearance`
+- `crows_pairs_english_race_color`
+- `crows_pairs_english_religion`
+- `crows_pairs_english_sexual_orientation`
+- `crows_pairs_english_socioeconomic`
+
+- `crows_pairs_french`: The entire French subset of the CrowS-Pairs dataset.
+
+The following tasks evaluate sub-areas of bias in the French CrowS-Pairs dataset:
+- `crows_pairs_french_age`
+- `crows_pairs_french_autre`
+- `crows_pairs_french_disability`
+- `crows_pairs_french_gender`
+- `crows_pairs_french_nationality`
+- `crows_pairs_french_physical_appearance`
+- `crows_pairs_french_race_color`
+- `crows_pairs_french_religion`
+- `crows_pairs_french_sexual_orientation`
+- `crows_pairs_french_socioeconomic`
+
+All tasks evaluate the percentage of more-stereotypical sentences that are rated as more likely by a model than the non-stereotypical sentences (`pct_stereotype`), as well as the average absolute difference of loglikelihoods between the sentences in the pairs.
+
+### Checklist
+
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation?
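+    * [x] The original paper does not provide one for causal language models, so these tasks instead compare the loglikelihoods a model assigns to the two sentences in each pair.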
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
+  * [x] This matches the evaluations performed in the [Pythia paper](https://arxiv.org/abs/2304.01373)
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml
+group:
+  - crows_pairs
+  - social_bias
+  - loglikelihood
+task: crows_pairs_english
+dataset_path: BigScienceBiasEval/crows_pairs_multilingual
+dataset_name: english
+test_split: test
+output_type: multiple_choice
+doc_to_text: ""
+doc_to_target: 0
+doc_to_choice: !function utils.doc_to_choice
+target_delimiter: ""
+process_results: !function utils.process_results
+metric_list:
+  - metric: likelihood_diff
+    aggregation: mean
+    higher_is_better: false
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_age
+dataset_name: english
+process_docs: !function utils.filter_age
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_autre
+dataset_name: english
+process_docs: !function utils.filter_autre
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_disability
+dataset_name: english
+process_docs: !function utils.filter_disability
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_gender
+dataset_name: english
+process_docs: !function utils.filter_gender
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_nationality
+dataset_name: english
+process_docs: !function utils.filter_nationality
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_physical_appearance
+dataset_name: english
+process_docs: !function utils.filter_appearance
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_race_color
+dataset_name: english
+process_docs: !function utils.filter_race_color
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_religion
+dataset_name: english
+process_docs: !function utils.filter_religion
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_sexual_orientation
+dataset_name: english
+process_docs: !function utils.filter_orientation
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_english_socioeconomic
+dataset_name: english
+process_docs: !function utils.filter_socio
--- a/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_french
+dataset_name: french
--- a/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_french_age
+dataset_name: french
+process_docs: !function utils.filter_age
--- a/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_french_autre
+dataset_name: french
+process_docs: !function utils.filter_autre
--- a/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml
+include: crows_pairs_english.yaml
+task: crows_pairs_french_disability
+dataset_name: french
+process_docs: !function utils.filter_disability