title={Natural Questions: a Benchmark for Question Answering Research},
author={Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year={2019},
journal={Transactions of the Association of Computational Linguistics}
}
"""
class NaturalQs(Task):
    """Natural Questions QA task backed by the HuggingFace `natural_questions`
    dataset. Exposes train and validation splits; there is no test split.
    """

    VERSION = 0
    DATASET_PATH = "natural_questions"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Cache training for faster few-shot.
        # Data is too large to fit in memory.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]
deffewshot_examples(self,k,rnd):
deffewshot_examples(self,k,rnd):
# Data is too large to fit in memory. We just sample from the first bit.
# Data is too large to fit in memory. We just sample from the first bit.
title = "{PAWS}-{X}: A Cross-lingual Adversarial Dataset for Paraphrase Identification",
author = "Yang, Yinfei and
Zhang, Yuan and
Tar, Chris and
Baldridge, Jason",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D19-1382",
doi = "10.18653/v1/D19-1382",
pages = "3687--3692",
}"""
class PAWSXBase(Task):
    """Base class for PAWS-X paraphrase-identification tasks.

    Subclasses set DATASET_NAME to a language code and provide the
    language-specific YES / NO / QUESTION_WORD strings used to build
    the prompt and target.
    """

    VERSION = 0
    DATASET_PATH = "paws-x"
    DATASET_NAME = None  # e.g. 'en'
    YES = None  # e.g. 'Yes'
    NO = None  # e.g. 'No'
    QUESTION_WORD = None  # e.g. 'right'

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def test_docs(self):
        return self.dataset["test"]

    def doc_to_text(self, doc):
        # Same prompt format as in the mGPT paper.
        return (
            doc["sentence1"]
            + ", "
            + self.QUESTION_WORD
            + "? [MASK], "
            + doc["sentence2"]
        )

    def doc_to_target(self, doc):
        # Index [YES, NO] by the integer label (0 or 1).
        return " " + [self.YES, self.NO][doc["label"]]
defconstruct_requests(self,doc,ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or
test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question