Add citations and descriptions to all tasks

a1aceacd · Jonathan Tow · 663b781b · a1aceacd · a1aceacd · a1aceacd
Commit a1aceacd authored Feb 28, 2022 by Jonathan Tow
5 changed files
--- a/lm_eval/tasks/unscramble.py
+++ b/lm_eval/tasks/unscramble.py
+"""
+Language Models are Few-Shot Learners
+https://arxiv.org/pdf/2005.14165.pdf
+
+Unscramble is a small battery of 5 “character manipulation” tasks. Each task
+involves giving the model a word distorted by some combination of scrambling,
+addition, or deletion of characters, and asking it to recover the original word.
+
+Homepage: https://github.com/openai/gpt-3/tree/master/data
+
+@inproceedings{NEURIPS2020_1457c0d6,
+    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
+    booktitle = {Advances in Neural Information Processing Systems},
+    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
+    pages = {1877--1901},
+    publisher = {Curran Associates, Inc.},
+    title = {Language Models are Few-Shot Learners},
+    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
+    volume = {33},
+    year = {2020}
+}
+"""
 import gzip
 import json
 import shutil

--- a/lm_eval/tasks/webqs.py
+++ b/lm_eval/tasks/webqs.py
+"""
+Semantic Parsing on Freebase from Question-Answer Pairs
+https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf
+
+WebQuestions is a benchmark for question answering. The dataset consists of 6,642
+question/answer pairs. The questions are supposed to be answerable by Freebase, a
+large knowledge graph. The questions are mostly centered around a single named entity.
+The questions are popular ones asked on the web (at least in 2013).
+
+Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a
+
+@inproceedings{berant-etal-2013-semantic,
+    title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
+    author = "Berant, Jonathan  and
+      Chou, Andrew  and
+      Frostig, Roy  and
+      Liang, Percy",
+    booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
+    month = oct,
+    year = "2013",
+    address = "Seattle, Washington, USA",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/D13-1160",
+    pages = "1533--1544",
+}
+"""
 from . common import HFTask
 from lm_eval.base import rf
 from ..metrics import mean

--- a/lm_eval/tasks/wikitext.py
+++ b/lm_eval/tasks/wikitext.py
+"""
+Pointer Sentinel Mixture Models
+https://arxiv.org/pdf/1609.07843.pdf
+
+The WikiText language modeling dataset is a collection of over 100 million tokens 
+extracted from the set of verified Good and Featured articles on Wikipedia.
+
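+NOTE: This `Task` is based on WikiText-2, the smaller (roughly 2 million token)
+variant of the dataset, not the full 100+ million token WikiText-103.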
+
+Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
+
+@misc{merity2016pointer,
+      title={Pointer Sentinel Mixture Models}, 
+      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
+      year={2016},
+      eprint={1609.07843},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+"""
 import os
 import re
 from lm_eval.base import rf, PerplexityTask

--- a/lm_eval/tasks/winogrande.py
+++ b/lm_eval/tasks/winogrande.py
+"""
+WinoGrande: An Adversarial Winograd Schema Challenge at Scale
+https://arxiv.org/pdf/1907.10641.pdf
+
+WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge
+(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and
+robustness against the dataset-specific bias. Formulated as a fill-in-a-blank
+task with binary options, the goal is to choose the right option for a given
+sentence which requires commonsense reasoning.
+
+NOTE: This evaluation of WinoGrande uses partial evaluation as described by
+Trinh & Le in A Simple Method for Commonsense Reasoning (2018).
+See: https://arxiv.org/abs/1806.02847
+
+Homepage: https://leaderboard.allenai.org/winogrande/submissions/public
+
+@article{sakaguchi2019winogrande,
+    title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
+    author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
+    journal={arXiv preprint arXiv:1907.10641},
+    year={2019}
+}
+"""
 import numpy as np
 from . common import HFTask
 from lm_eval.base import rf
 from ..metrics import mean

-"""
-This evaluation of Winogrande uses partial evaluation as described by
-Trinh & Le in Simple Method for Commonsense Reasoning (2018).
-Reference: https://arxiv.org/abs/1806.02847
-"""
-

 class Winogrande(HFTask):
    VERSION = 0

--- a/lm_eval/tasks/wsc273.py
+++ b/lm_eval/tasks/wsc273.py
+"""
+The Winograd Schema Challenge
+http://commonsensereasoning.org/2011/papers/Levesque.pdf
+
+A Winograd schema is a pair of sentences that differ in only one or two words
+and that contain an ambiguity that is resolved in opposite ways in the two
+sentences and requires the use of world knowledge and reasoning for its resolution.
+The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
+
+NOTE: This evaluation of the Winograd Schema Challenge is based on `partial evaluation`
+as described by Trinh & Le in A Simple Method for Commonsense Reasoning (2018).
+See: https://arxiv.org/abs/1806.02847
+
+Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
+
+@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
+    title = "The winograd schema challenge",
+    abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Winograd schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
+    author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
+    year = "2012",
+    language = "English (US)",
+    isbn = "9781577355601",
+    series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
+    publisher = "Institute of Electrical and Electronics Engineers Inc.",
+    pages = "552--561",
+    booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
+    note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
+}
+"""
 import numpy as np
 import random
 from lm_eval.base import rf
 from ..metrics import mean
 from . common import HFTask

-"""
-NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
-as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
-See: https://arxiv.org/abs/1806.02847
-"""
-

 class WinogradSchemaChallenge273(HFTask):
    VERSION = 0