@article{kwiatkowski2019natural,
title={Natural Questions: a Benchmark for Question Answering Research},
author={Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year={2019},
journal={Transactions of the Association of Computational Linguistics}
}
"""
class NaturalQs(Task):
    VERSION = 0
    DATASET_PATH = "natural_questions"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Cache training for faster few-shot.
        # Data is too large to fit in memory.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]
    def fewshot_examples(self, k, rnd):
        # Data is too large to fit in memory. We just sample from the first bit.
        # (Reconstructed completion; assumes `from itertools import islice` is
        # imported at the top of this module.)
        if self._training_docs is None:
            self._training_docs = list(islice(self.dataset["train"], 0, 100000))
        return rnd.sample(self._training_docs, k)
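# Hypothetical usage sketch (not part of the original file): the harness
# normally drives Task objects itself, but the few-shot path can be exercised
# directly with a seeded RNG so sampled prompts are reproducible across runs.
#
#   import random
#   task = NaturalQs()
#   shots = task.fewshot_examples(k=5, rnd=random.Random(1234))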
"""
The Pile: An 800GB Dataset of Diverse Text for Language Modeling
https://arxiv.org/pdf/2101.00027.pdf
The Pile is an 825 GiB diverse, open-source language modelling dataset
consisting of 22 smaller, high-quality datasets combined together. To score
well on Pile BPB (bits per byte), a model must be able to understand many
disparate domains, including books, GitHub repositories, webpages, chat logs,
and medical, physics, math, computer science, and philosophy papers.
Homepage: https://pile.eleuther.ai/
"""
import inspect

import lm_eval.datasets.pile.pile
from lm_eval.base import PerplexityTask

_CITATION = """
@article{pile,
title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
journal={arXiv preprint arXiv:2101.00027},
year={2020}
}
"""
"""
PubMedQA: A Dataset for Biomedical Research Question Answering
https://arxiv.org/pdf/1909.06146.pdf
PubMedQA is a novel biomedical question answering (QA) dataset collected from
PubMed abstracts. The task of PubMedQA is to answer research questions with
yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after
coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA
has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA
instances. Each PubMedQA instance is composed of (1) a question which is either
an existing research article title or derived from one, (2) a context which is
the corresponding abstract without its conclusion, (3) a long answer, which is
the conclusion of the abstract and, presumably, answers the research question,
and (4) a yes/no/maybe answer which summarizes the conclusion.
Homepage: https://pubmedqa.github.io/
"""
import numpy as np

from lm_eval.base import rf, Task
from lm_eval.metrics import mean

_CITATION = """
@inproceedings{jin2019pubmedqa,
title={PubMedQA: A Dataset for Biomedical Research Question Answering},
author={Jin, Qiao and Dhingra, Bhuwan and Liu, Zhengping and Cohen, William and Lu, Xinghua},
booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
pages={2567--2577},
year={2019}
}
"""
class Pubmed_QA(Task):
    VERSION = 0
    DATASET_PATH = "pubmed_qa"
    DATASET_NAME = "pqa_labeled"

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True
    def test_docs(self):
        if self.has_test_docs():
            # HF labels this split as "train", but it is really just for testing.
            return self.dataset["train"]
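    # Sketch (an assumption, mirroring how sibling harness tasks score fixed
    # label sets): the yes/no/maybe answer could be scored by comparing the
    # loglikelihood the model assigns to each candidate continuation of the
    # prompt; `rf` is the request factory imported above.
    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        ll_maybe, _ = rf.loglikelihood(ctx, " maybe")
        return ll_yes, ll_no, ll_maybe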