WinoBias support added

7010e452 · tomlimi · 9cd70235 · 7010e452 · 7010e452
Commit 7010e452 authored Apr 28, 2022 by tomlimi
Hide whitespace changes
Inline Side-by-side

Showing with 115 additions and 0 deletions

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +7 -0

lm_eval/tasks/wino_bias.py lm_eval/tasks/wino_bias.py +108 -0

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -56,6 +56,7 @@ from . import hans
 from . import gem_webnlg
 from . import gem_xsum
 from . import gem_mlsum
+from . import wino_bias
 # from . import e2e_nlg_cleaned
 ########################################
@@ -308,6 +309,12 @@ TASK_REGISTRY = {
    "gem_xsum_challenge_test_bfp_05": gem_xsum.GEMXSUMChallgeTestBFP05,
    "gem_xsum_challenge_test_nopunc": gem_xsum.GEMXSUMChallgeTestNopunc,
    "gem_xsum_challenge_test_covid": gem_xsum.GEMXSUMChallgeTestCovid,
+    # WinoBias
+    "wino_bias_type1_pro": wino_bias.WinoBiasType1Pro,
+    "wino_bias_type1_anti": wino_bias.WinoBiasType1Anti,
+    "wino_bias_type2_pro": wino_bias.WinoBiasType2Pro,
+    "wino_bias_type2_anti": wino_bias.WinoBiasType2Anti,
 }

--- a/lm_eval/tasks/wino_bias.py
+++ b/lm_eval/tasks/wino_bias.py
+"""
+Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods
+https://arxiv.org/abs/1804.06876
+Winograd-schema evaluation of gendered coreference resolution.
+The dataset contains pro-stereotypical and anti-stereotypical parts. The difference in accuracy for those two subsets
+quatnifies bias.
+Homepage: https://uclanlp.github.io/corefBias/overview
+"""
+from lm_eval.base import PromptSourceTask, mean
+import transformers.data.metrics.squad_metrics as squad_metrics
+_CITATION = """
+@inproceedings{zhao-etal-2018-gender,
+    title = "Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods",
+    author = "Zhao, Jieyu  and
+      Wang, Tianlu  and
+      Yatskar, Mark  and
+      Ordonez, Vicente  and
+      Chang, Kai-Wei",
+    booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)",
+    month = jun,
+    year = "2018",
+    address = "New Orleans, Louisiana",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/N18-2003",
+    doi = "10.18653/v1/N18-2003",
+    pages = "15--20",
+    abstract = "In this paper, we introduce a new benchmark for co-reference resolution focused on gender bias, WinoBias. Our corpus contains Winograd-schema style sentences with entities corresponding to people referred by their occupation (e.g. the nurse, the doctor, the carpenter). We demonstrate that a rule-based, a feature-rich, and a neural coreference system all link gendered pronouns to pro-stereotypical entities with higher accuracy than anti-stereotypical entities, by an average difference of 21.1 in F1 score. Finally, we demonstrate a data-augmentation approach that, in combination with existing word-embedding debiasing techniques, removes the bias demonstrated by these systems in WinoBias without significantly affecting their performance on existing datasets.",
+}
+"""
+class WinoBias(PromptSourceTask):
+    VERSION = 0
+    DATASET_PATH = "wino_bias"
+    def has_training_docs(self):
+        return False
+    def has_validation_docs(self):
+        return True
+    def has_test_docs(self):
+        return True
+    def training_docs(self):
+        pass
+    def validation_docs(self):
+        return self.dataset["validation"]
+    def test_docs(self):
+        return self.dataset["test"]
+    def stopping_criteria(self):
+        return "\n"
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        target = self.doc_to_target(doc).strip()
+        pred = " ".join(results[0].strip().split(" ")[:len(target.split(" "))])
+        # The original paper uses F1. In the case of exactly one predicted and one gold mention,
+        # F1 and exact match are equivalent.
+        em = squad_metrics.compute_exact(target, pred)
+        out = {"em": em}
+        if self.save_examples:
+            example = {"target": target, "pred": pred}
+            return out, example
+        return out
+    def aggregation(self):
+        """
+        :returns: {str: [metric_score] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metric scores
+        """
+        return {'em': mean}
+    def higher_is_better(self):
+        return {'em': True}
+class WinoBiasType1Pro(WinoBias):
+    DATASET_NAME = "type1_pro"
+class WinoBiasType1Anti(WinoBias):
+    DATASET_NAME = "type1_anti"
+class WinoBiasType2Pro(WinoBias):
+    DATASET_NAME = "type2_pro"
+class WinoBiasType2Anti(WinoBias):
+    DATASET_NAME = "type2_anti"