Merge pull request #292 from jon-tow/task-doc-update

Add citations and descriptions to all tasks

Merge pull request #292 from jon-tow/task-doc-update
Add citations and descriptions to all tasks
263eb48d · Leo Gao · GitHub · 663b781b · b82e1a87 · 263eb48d
Unverified Commit 263eb48d authored Mar 01, 2022 by Leo Gao Committed by GitHub Mar 01, 2022
6 changed files
--- a/lm_eval/tasks/truthfulqa.py
+++ b/lm_eval/tasks/truthfulqa.py
@@ -2,6 +2,13 @@
 TruthfulQA: Measuring How Models Mimic Human Falsehoods
 https://arxiv.org/pdf/2109.07958.pdf

+TruthfulQA is a benchmark to measure whether a language model is truthful in
+generating answers to questions. The benchmark comprises 817 questions that
+span 38 categories, including health, law, finance and politics. Questions are
+crafted so that some humans would answer falsely due to a false belief or
+misconception. To perform well, models must avoid generating false answers
+learned from imitating human texts.
+
 TODO: Add support for the automatic metrics, 'GPT-judge' and 'GPT-info', which
 predict human evaluation of truth and informativeness (respectively) through
 a fine-tuned GPT-3 model. NOTE: This requires access keys to the corresponding
@@ -10,14 +17,7 @@ provide the data used to fine-tune GPT-3 into `GPT-judge` and `GPT-info`, see
 https://github.com/sylinrl/TruthfulQA#Fine-tuning-GPT-3-for-evaluation. Maybe
 we could try this?

-@misc{lin2021truthfulqa,
-      title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
-      author={Stephanie Lin and Jacob Hilton and Owain Evans},
-      year={2021},
-      eprint={2109.07958},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
-}
+Homepage: https://github.com/sylinrl/TruthfulQA
 """
 import csv
 import json
@@ -31,6 +31,18 @@ from ..metrics import mean
 from datasets import load_metric


+_CITATION = """
+@misc{lin2021truthfulqa,
+    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+    author={Stephanie Lin and Jacob Hilton and Owain Evans},
+    year={2021},
+    eprint={2109.07958},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""
+
+
 # The default QA preset prompt for all models.
 QA_PROMPT = (
    "Q: What is human life expectancy in the United States?\n"

--- a/lm_eval/tasks/unscramble.py
+++ b/lm_eval/tasks/unscramble.py
+"""
+Language Models are Few-Shot Learners
+https://arxiv.org/pdf/2005.14165.pdf
+
+Unscramble is a small battery of 5 “character manipulation” tasks. Each task
+involves giving the model a word distorted by some combination of scrambling,
+addition, or deletion of characters, and asking it to recover the original word.
+
+Homepage: https://github.com/openai/gpt-3/tree/master/data
+"""
 import gzip
 import json
 import shutil
@@ -7,6 +17,21 @@ from lm_eval.base import Task, rf
 from lm_eval.metrics import mean


+_CITATION = """
+@inproceedings{NEURIPS2020_1457c0d6,
+    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
+    booktitle = {Advances in Neural Information Processing Systems},
+    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
+    pages = {1877--1901},
+    publisher = {Curran Associates, Inc.},
+    title = {Language Models are Few-Shot Learners},
+    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
+    volume = {33},
+    year = {2020}
+}
+"""
+
+
 def extract_gzip(gz, to):
    with gzip.open(gz, 'rb') as fin:
        with open(to, 'wb') as fout:

--- a/lm_eval/tasks/webqs.py
+++ b/lm_eval/tasks/webqs.py
+"""
+Semantic Parsing on Freebase from Question-Answer Pairs
+https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf
+
+WebQuestions is a benchmark for question answering. The dataset consists of 6,642
+question/answer pairs. The questions are supposed to be answerable by Freebase, a
+large knowledge graph. The questions are mostly centered around a single named entity.
+The questions are popular ones asked on the web (at least in 2013).
+
+Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a
+"""
 from . common import HFTask
 from lm_eval.base import rf
 from ..metrics import mean


+_CITATION = """
+@inproceedings{berant-etal-2013-semantic,
+    title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
+    author = "Berant, Jonathan  and
+      Chou, Andrew  and
+      Frostig, Roy  and
+      Liang, Percy",
+    booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
+    month = oct,
+    year = "2013",
+    address = "Seattle, Washington, USA",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/D13-1160",
+    pages = "1533--1544",
+}
+"""
+
+
 class WebQs(HFTask):
    VERSION = 0
    DATASET_PATH = "web_questions"

--- a/lm_eval/tasks/wikitext.py
+++ b/lm_eval/tasks/wikitext.py
+"""
+Pointer Sentinel Mixture Models
+https://arxiv.org/pdf/1609.07843.pdf
+
+The WikiText language modeling dataset is a collection of over 100 million tokens 
+extracted from the set of verified Good and Featured articles on Wikipedia.
+
+NOTE: This `Task` is based on WikiText-2.
+
+Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
+"""
 import os
 import re
 from lm_eval.base import rf, PerplexityTask
 from lm_eval.utils import sh
-
 from best_download import download_file


+_CITATION = """
+@misc{merity2016pointer,
+    title={Pointer Sentinel Mixture Models}, 
+    author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
+    year={2016},
+    eprint={1609.07843},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""
+
+
 def wikitext_detokenizer(string):
    # contractions
    string = string.replace("s '", "s'")

--- a/lm_eval/tasks/winogrande.py
+++ b/lm_eval/tasks/winogrande.py
+"""
+WinoGrande: An Adversarial Winograd Schema Challenge at Scale
+https://arxiv.org/pdf/1907.10641.pdf
+
+WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge
+(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and
+robustness against the dataset-specific bias. Formulated as a fill-in-a-blank
+task with binary options, the goal is to choose the right option for a given
+sentence which requires commonsense reasoning.
+
+NOTE: This evaluation of WinoGrande uses partial evaluation as described by
+Trinh & Le in Simple Method for Commonsense Reasoning (2018).
+See: https://arxiv.org/abs/1806.02847
+
+Homepage: https://leaderboard.allenai.org/winogrande/submissions/public
+"""
 import numpy as np
 from . common import HFTask
 from lm_eval.base import rf
 from ..metrics import mean

-"""
-This evaluation of Winogrande uses partial evaluation as described by
-Trinh & Le in Simple Method for Commonsense Reasoning (2018).
-Reference: https://arxiv.org/abs/1806.02847
+
+_CITATION = """
+@article{sakaguchi2019winogrande,
+    title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
+    author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
+    journal={arXiv preprint arXiv:1907.10641},
+    year={2019}
+}
 """



--- a/lm_eval/tasks/wsc273.py
+++ b/lm_eval/tasks/wsc273.py
+"""
+The Winograd Schema Challenge
+http://commonsensereasoning.org/2011/papers/Levesque.pdf
+
+A Winograd schema is a pair of sentences that differ in only one or two words
+and that contain an ambiguity that is resolved in opposite ways in the two
+sentences and requires the use of world knowledge and reasoning for its resolution.
+The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
+
+NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
+as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
+See: https://arxiv.org/abs/1806.02847
+
+Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
+"""
 import numpy as np
 import random
 from lm_eval.base import rf
 from ..metrics import mean
 from . common import HFTask

-"""
-NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
-as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
-See: https://arxiv.org/abs/1806.02847
+
+_CITATION = """
+@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
+    title = "The winograd schema challenge",
+    abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Winograd schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
+    author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
+    year = "2012",
+    language = "English (US)",
+    isbn = "9781577355601",
+    series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
+    publisher = "Institute of Electrical and Electronics Engineers Inc.",
+    pages = "552--561",
+    booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
+    note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
+}
 """