"""
Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods
https://arxiv.org/abs/1804.06876

Winograd-schema evaluation of gendered coreference resolution.
The dataset contains pro-stereotypical and anti-stereotypical parts. The
difference in accuracy for those two subsets quantifies bias.

Homepage: https://uclanlp.github.io/corefBias/overview
"""
from lm_eval.base import PromptSourceTask, mean
import transformers.data.metrics.squad_metrics as squad_metrics


# BibTeX entry for the WinoBias paper. Runtime data: kept verbatim.
_CITATION = """ @inproceedings{zhao-etal-2018-gender, title = "Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods", author = "Zhao, Jieyu and Wang, Tianlu and Yatskar, Mark and Ordonez, Vicente and Chang, Kai-Wei", booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)", month = jun, year = "2018", address = "New Orleans, Louisiana", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/N18-2003", doi = "10.18653/v1/N18-2003", pages = "15--20", abstract = "In this paper, we introduce a new benchmark for co-reference resolution focused on gender bias, WinoBias. Our corpus contains Winograd-schema style sentences with entities corresponding to people referred by their occupation (e.g. the nurse, the doctor, the carpenter). We demonstrate that a rule-based, a feature-rich, and a neural coreference system all link gendered pronouns to pro-stereotypical entities with higher accuracy than anti-stereotypical entities, by an average difference of 21.1 in F1 score. 
Finally, we demonstrate a data-augmentation approach that, in combination with existing word-embedding debiasing techniques, removes the bias demonstrated by these systems in WinoBias without significantly affecting their performance on existing datasets.", } """


class WinoBias(PromptSourceTask):
    """Base task for the ``wino_bias`` dataset, scored with exact match.

    Concrete subclasses select a dataset configuration via ``DATASET_NAME``.
    Only validation and test splits are exposed.
    """

    VERSION = 0
    DATASET_PATH = "wino_bias"

    def has_training_docs(self):
        """Return ``False``: no training split is exposed by this task."""
        return False

    def has_validation_docs(self):
        """Return ``True``: a validation split is exposed."""
        return True

    def has_test_docs(self):
        """Return ``True``: a test split is exposed."""
        return True

    def training_docs(self):
        """Return ``None``; there are no training documents."""
        return None

    def validation_docs(self):
        """Return the ``"validation"`` split of the loaded dataset."""
        return self.dataset["validation"]

    def test_docs(self):
        """Return the ``"test"`` split of the loaded dataset."""
        return self.dataset["test"]

    def process_results(self, doc, results):
        """Score a single document's generation against its gold target.

        The generation is truncated to as many whitespace-separated words as
        the gold target contains, then compared to the target with the SQuAD
        exact-match helper.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests. Only
            ``results[0]`` (the generated text) is read.
        :returns:
            ``{"em": score}``. When ``self.save_examples`` is truthy, a tuple
            ``(metrics, example)`` where ``example`` holds the ``"target"``
            and the truncated ``"pred"``.
        """
        target = self.doc_to_target(doc).strip()

        # Tokenise on any whitespace run rather than on a single space:
        # generations often continue after a newline or contain doubled
        # spaces, and splitting on " " would either glue the continuation
        # onto the last kept word or spend the word budget on empty tokens.
        # Keep at least one word so an empty target is not matched trivially.
        word_budget = max(len(target.split()), 1)
        pred = " ".join(results[0].split()[:word_budget])

        # The original paper uses F1. In the case of exactly one predicted and one gold mention,
        # F1 and exact match are equivalent.
        em = squad_metrics.compute_exact(target, pred)
        out = {"em": em}

        if self.save_examples:
            example = {"target": target, "pred": pred}
            return out, example
        return out

    def aggregation(self):
        """
        :returns: {str: [metric_score] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metric scores
        """
        return {"em": mean}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {"em": True}


class WinoBiasType1Pro(WinoBias):
    """WinoBias task bound to the ``type1_pro`` dataset configuration."""

    DATASET_NAME = "type1_pro"


class WinoBiasType1Anti(WinoBias):
    """WinoBias task bound to the ``type1_anti`` dataset configuration."""

    DATASET_NAME = "type1_anti"


class WinoBiasType2Pro(WinoBias):
    """WinoBias task bound to the ``type2_pro`` dataset configuration."""

    DATASET_NAME = "type2_pro"


class WinoBiasType2Anti(WinoBias):
    """WinoBias task bound to the ``type2_anti`` dataset configuration."""

    DATASET_NAME = "type2_anti"