abstract = { We introduce The Benchmark of Linguistic Minimal Pairs (BLiMP), a challenge set for evaluating the linguistic knowledge of language models (LMs) on major grammatical phenomena in English. BLiMP consists of 67 individual datasets, each containing 1,000 minimal pairs—that is, pairs of minimally different sentences that contrast in grammatical acceptability and isolate a specific phenomenon in syntax, morphology, or semantics. We generate the data according to linguist-crafted grammar templates, and aggregate human agreement with the labels is 96.4\%. We evaluate n-gram, LSTM, and Transformer (GPT-2 and Transformer-XL) LMs by observing whether they assign a higher probability to the acceptable sentence in each minimal pair. We find that state-of-the-art models reliably identify morphological contrasts related to agreement, but they struggle with some subtle semantic and syntactic phenomena, such as negative polarity items and extraction islands. }
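The evaluation criterion lends itself to a short sketch: a pair counts as correct when the LM assigns the acceptable sentence a higher total log-probability. A minimal illustration with a small causal LM (the model choice is ours; the example pair is from the paper):

import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def sentence_logprob(sentence):
    # Total log-probability of the sentence under the causal LM.
    ids = tokenizer(sentence, return_tensors="pt").input_ids
    with torch.no_grad():
        # The returned loss is the mean negative log-likelihood per
        # predicted token; scale back up to get the sum.
        loss = model(ids, labels=ids).loss
    return -loss.item() * (ids.shape[1] - 1)

# Scored correct if the acceptable sentence wins:
print(sentence_logprob("The cats annoy Tim.") > sentence_logprob("The cats annoys Tim."))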
abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",
}
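For MLMs, a sentence-level score has to be assembled from masked predictions. A rough sketch of pseudo-log-likelihood scoring in this spirit (a simplification: the CrowS-Pairs metric itself conditions only on the tokens the two sentences share, and the model choice here is illustrative):

import torch
from transformers import BertForMaskedLM, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased").eval()

def pseudo_log_likelihood(sentence):
    # Mask each position in turn and sum the log-probability the model
    # assigns to the original token at that position.
    ids = tokenizer(sentence, return_tensors="pt").input_ids[0]
    total = 0.0
    for i in range(1, len(ids) - 1):  # skip [CLS] and [SEP]
        masked = ids.clone()
        masked[i] = tokenizer.mask_token_id
        with torch.no_grad():
            logits = model(masked.unsqueeze(0)).logits[0, i]
        total += torch.log_softmax(logits, dim=-1)[ids[i]].item()
    return total

# The model favors the more stereotyping sentence of a pair when
# pseudo_log_likelihood(sent_more) > pseudo_log_likelihood(sent_less).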
@inproceedings{neveol-etal-2022-french,
title = "{F}rench {C}row{S}-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than {E}nglish",
author = {N{\'e}v{\'e}ol, Aur{\'e}lie and
Dupont, Yoann and
Bezan{\c{c}}on, Julien and
Fort, Kar{\"e}n},
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
abstract = "Warning: This paper contains explicit statements of offensive stereotypes which may be upsetting.Much work on biases in natural language processing has addressed biases linked to the social and cultural experience of English speaking individuals in the United States. We seek to widen the scope of bias studies by creating material to measure social bias in language models (LMs) against specific demographic groups in France. We build on the US-centered CrowS-pairs dataset to create a multilingual stereotypes dataset that allows for comparability across languages while also characterizing biases that are specific to each country and language. We introduce 1,679 sentence pairs in French that cover stereotypes in ten types of bias like gender and age. 1,467 sentence pairs are translated from CrowS-pairs and 212 are newly crowdsourced. The sentence pairs contrast stereotypes concerning underadvantaged groups with the same sentence concerning advantaged groups. We find that four widely used language models (three French, one multilingual) favor sentences that express stereotypes in most bias categories. We report on the translation process from English into French, which led to a characterization of stereotypes in CrowS-pairs including the identification of US-centric cultural traits. We offer guidelines to further extend the dataset to other languages and cultural environments.",
Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering
https://aclanthology.org/P19-1092.pdf
HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans.
...
@@ -15,7 +15,7 @@ from lm_eval.base import MultipleChoiceTask
_CITATION="""
_CITATION="""
@misc{liu2020interpretable,
@misc{liu2020interpretable,
title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering},
title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering},
author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. Yu},
author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. Yu},
year={2020},
year={2020},
eprint={2008.02434},
eprint={2008.02434},
...
@@ -61,6 +61,12 @@ class HeadQABase(MultipleChoiceTask):
    def doc_to_text(self, doc):
        return doc["query"]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["query"]


class HeadQAEn(HeadQABase):
    DATASET_NAME = "en"
...
@@ -76,4 +82,6 @@ class HeadQAEsDeprecated(HeadQABase):
    def __init__(self):
        super().__init__()
        print(
            "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info."
        )
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
    title={Natural Questions: a Benchmark for Question Answering Research},
    author={Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
    year={2019},
    journal={Transactions of the Association for Computational Linguistics}
}
"""
class NaturalQs(Task):
    VERSION = 0
    DATASET_PATH = "natural_questions"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Cache training for faster few-shot.
        # Data is too large to fit in memory.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def fewshot_examples(self, k, rnd):
        # Data is too large to fit in memory. We just sample from the first bit.
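The method is cut off here. A plausible completion, sketched under the assumption that the harness caches only a prefix of the training split before sampling (the 100,000-document slice is an illustrative figure, not confirmed from the file):

    def fewshot_examples(self, k, rnd):
        # Data is too large to fit in memory. We just sample from the first bit.
        # Assumed completion: materialize only the first slice of the training
        # split, then draw the k few-shot documents from that cache.
        from itertools import islice
        if self._training_docs is None:
            self._training_docs = list(islice(self.dataset["train"], 0, 100000))
        return rnd.sample(self._training_docs, k)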