Commit 3f13d15f authored by Jonathan Tow

Make citations module-level constants

parent a1aceacd
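Every file in this diff applies the same refactor: the BibTeX entry is moved out of the task module's docstring and into a module-level `_CITATION` string constant defined after the imports. A minimal sketch of the resulting module layout (the task name, citation key, and authors below are illustrative placeholders, not taken from any file in this commit):

"""
Example Task
A one-line description of the benchmark.
Homepage: https://example.org/task
"""
from lm_eval.base import MultipleChoiceTask
from .common import HFTask

# The citation now lives in a module-level constant rather than in the docstring.
_CITATION = """
@misc{doe2021example,
    title={An Example Benchmark},
    author={Jane Doe and John Doe},
    year={2021}
}
"""


class ExampleTask(HFTask, MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "example"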
@@ -8,7 +8,14 @@ increase in difficulty and complexity, and each question-answer includes annotat
provided explanations.
Homepage: "https://github.com/facebookresearch/anli"
"""
import numpy as np
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
_CITATION = """
@inproceedings{nie-etal-2020-adversarial,
title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
author = "Nie, Yixin and
@@ -22,10 +29,6 @@ Homepage: "https://github.com/facebookresearch/anli"
publisher = "Association for Computational Linguistics",
}
"""
import numpy as np
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
class ANLIBase(HFTask):
...
@@ -11,7 +11,12 @@ into a Challenge Set of 2,590 “hard” questions (those that both a retrieval
a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions.
Homepage: https://allenai.org/data/arc
"""
from lm_eval.base import MultipleChoiceTask
from . common import HFTask
_CITATION = """
@article{Clark2018ThinkYH,
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
@@ -20,8 +25,6 @@ Homepage: https://allenai.org/data/arc
volume={abs/1803.05457}
}
"""
from lm_eval.base import MultipleChoiceTask
from . common import HFTask
class ARCEasy(HFTask, MultipleChoiceTask):
...
@@ -6,18 +6,6 @@ A small battery of 10 tests that involve asking language models a simple arithme
problem in natural language.
Homepage: https://github.com/openai/gpt-3/tree/master/data
@inproceedings{NEURIPS2020_1457c0d6,
author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
pages = {1877--1901},
publisher = {Curran Associates, Inc.},
title = {Language Models are Few-Shot Learners},
url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
volume = {33},
year = {2020}
}
""" """
import abc import abc
import json import json
...@@ -27,6 +15,22 @@ from lm_eval.base import Task, rf ...@@ -27,6 +15,22 @@ from lm_eval.base import Task, rf
from lm_eval.metrics import mean from lm_eval.metrics import mean
from best_download import download_file from best_download import download_file
_CITATION = """
@inproceedings{NEURIPS2020_1457c0d6,
author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
pages = {1877--1901},
publisher = {Curran Associates, Inc.},
title = {Language Models are Few-Shot Learners},
url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
volume = {33},
year = {2020}
}
"""
ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
...
@@ -13,15 +13,6 @@ level (for indicating the level of difficulty).
NOTE: We currently ignore formulas for answer generation.
Homepage: https://github.com/chaochun/nlu-asdiv-dataset
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
""" """
from lm_eval.base import Task from lm_eval.base import Task
from pathlib import Path from pathlib import Path
...@@ -34,6 +25,18 @@ from zipfile import ZipFile ...@@ -34,6 +25,18 @@ from zipfile import ZipFile
import os import os
_CITATION = """
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
"""
class Asdiv(Task):
    VERSION = 0
    DATASET_PATH = Path("data/asdiv")
...
@@ -9,7 +9,13 @@ or semantics. The data is automatically generated according to expert-crafted
grammars.
Homepage: https://github.com/alexwarstadt/blimp
"""
from lm_eval.base import rf
from lm_eval.metrics import mean
from .common import HFTask
_CITATION = """
@article{warstadt2019blimp,
author = {Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei and Wang, Sheng-Fu and Bowman, Samuel R.},
title = {BLiMP: The Benchmark of Linguistic Minimal Pairs for English},
@@ -25,10 +31,6 @@ Homepage: https://github.com/alexwarstadt/blimp
}
"""
from lm_eval.base import rf
from lm_eval.metrics import mean
from .common import HFTask
class BlimpTask(HFTask):
    VERSION = 0
...
@@ -11,15 +11,6 @@ NOTE: This evaluation is based on the (context + query) question-answering varia
used by the Recurrent Language Models described in the paper. See section 4.4.
Homepage: https://github.com/facebookresearch/ParlAI/tree/main/parlai/tasks/cbt
@misc{hill2016goldilocks,
title={The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations},
author={Felix Hill and Antoine Bordes and Sumit Chopra and Jason Weston},
year={2016},
eprint={1511.02301},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
""" """
import numpy as np import numpy as np
from lm_eval.base import rf from lm_eval.base import rf
...@@ -27,6 +18,18 @@ from lm_eval.metrics import mean ...@@ -27,6 +18,18 @@ from lm_eval.metrics import mean
from .common import HFTask from .common import HFTask
_CITATION = """
@misc{hill2016goldilocks,
title={The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations},
author={Felix Hill and Antoine Bordes and Sumit Chopra and Jason Weston},
year={2016},
eprint={1511.02301},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
class CBTBase(HFTask):
    VERSION = 0
    DATASET_PATH = "cbt"
...
@@ -8,7 +8,17 @@ understand a text passage and answer a series of interconnected questions that
appear in a conversation.
Homepage: https://stanfordnlp.github.io/coqa/
"""
import os
import json
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.base import Task, rf, mean
from ..utils import sh
from itertools import zip_longest
from best_download import download_file
_CITATION = """
@misc{reddy2018coqa,
title={CoQA: A Conversational Question Answering Challenge},
author={Siva Reddy and Danqi Chen and Christopher D. Manning},
@@ -18,13 +28,6 @@ Homepage: https://stanfordnlp.github.io/coqa/
primaryClass={cs.CL}
}
"""
import os
import json
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.base import Task, rf, mean
from ..utils import sh
from itertools import zip_longest
from best_download import download_file
class CoQA(Task):
...
@@ -11,15 +11,6 @@ Homepage: https://allenai.org/data/drop
Acknowledgement: This implementation is based on the official evaluation for `DROP`:
https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
@misc{dua2019drop,
title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
year={2019},
eprint={1903.00161},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
""" """
import json import json
import numpy as np import numpy as np
...@@ -32,6 +23,19 @@ from lm_eval.metrics import mean ...@@ -32,6 +23,19 @@ from lm_eval.metrics import mean
from pathlib import Path from pathlib import Path
from zipfile import ZipFile from zipfile import ZipFile
_CITATIONS = """
@misc{dua2019drop,
title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
year={2019},
eprint={1903.00161},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
...
@@ -12,7 +12,16 @@ sizes, text genres, and degrees of difficulty, and
respect to a wide range of linguistic phenomena found in natural language.
Homepage: https://gluebenchmark.com/
"""
import numpy as np
from lm_eval.base import rf
from ..metrics import mean, matthews_corrcoef, f1_score
from . common import HFTask, yesno
from ..utils import general_detokenize
# TODO(jon-tow): Add citations for the individual datasets/tasks that make up GLUE.
_CITATION = """
@inproceedings{wang-etal-2018-glue,
title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding",
author = "Wang, Alex and
@@ -32,11 +41,7 @@ Homepage: https://gluebenchmark.com/
abstract = "Human ability to understand language is \textit{general, flexible, and robust}. In contrast, most NLU models above the word level are designed for a specific task and struggle with out-of-domain data. If we aspire to develop models with understanding beyond the detection of superficial correspondences between inputs and outputs, then it is critical to develop a unified model that can execute a range of linguistic tasks across different domains. To facilitate research in this direction, we present the General Language Understanding Evaluation (GLUE, gluebenchmark.com): a benchmark of nine diverse NLU tasks, an auxiliary dataset for probing models for understanding of specific linguistic phenomena, and an online platform for evaluating and comparing models. For some benchmark tasks, training data is plentiful, but for others it is limited or does not match the genre of the test set. GLUE thus favors models that can represent linguistic knowledge in a way that facilitates sample-efficient learning and effective knowledge-transfer across tasks. While none of the datasets in GLUE were created from scratch for the benchmark, four of them feature privately-held test data, which is used to ensure that the benchmark is used fairly. We evaluate baselines that use ELMo (Peters et al., 2018), a powerful transfer learning technique, as well as state-of-the-art sentence representation models. The best models still achieve fairly low absolute scores. Analysis with our diagnostic dataset yields similarly weak performance over all phenomena tested, with some exceptions.",
}
"""
import numpy as np
from lm_eval.base import rf
from ..metrics import mean, matthews_corrcoef, f1_score
from . common import HFTask, yesno
from ..utils import general_detokenize
# Single-Sentence Tasks
...
@@ -15,7 +15,17 @@ for how to make use of the dataset's calculator annotations in your language
model's sample/generation function.
Homepage: https://github.com/openai/grade-school-math
"""
import json
import re
from best_download import download_file
from pathlib import Path
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
_CITATION = """
@misc{cobbe2021training,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
@@ -26,12 +36,6 @@ Homepage: https://github.com/openai/grade-school-math
}
"""
import json
import re
from best_download import download_file
from pathlib import Path
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"
...
@@ -7,18 +7,21 @@ access a specialized position in the Spanish healthcare system, and are challeng
even for highly specialized humans.
Homepage: https://aghie.github.io/head-qa/
"""
from . common import HFTask
from lm_eval.base import MultipleChoiceTask
_CITATION = """
@misc{liu2020interpretable,
title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering},
author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. Yu},
year={2020},
eprint={2008.02434},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
"""
from . common import HFTask
from lm_eval.base import MultipleChoiceTask
class HeadQABase(HFTask, MultipleChoiceTask):
...
@@ -12,7 +12,13 @@ zone wherein generated text is ridiculous to humans, yet often misclassified by
state-of-the-art models.
Homepage: https://rowanzellers.com/hellaswag/
"""
import re
from lm_eval.base import MultipleChoiceTask
from . common import HFTask
_CITATION = """
@inproceedings{zellers2019hellaswag,
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
@@ -20,9 +26,6 @@ Homepage: https://rowanzellers.com/hellaswag/
year={2019}
}
"""
import re
from lm_eval.base import MultipleChoiceTask
from . common import HFTask
class HellaSwag(HFTask, MultipleChoiceTask):
...
@@ -14,13 +14,6 @@ tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics
of the paper.
Homepage: https://github.com/hendrycks/ethics
@article{hendrycks2021ethics,
title={Aligning AI With Shared Human Values},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
""" """
import abc import abc
import csv import csv
...@@ -34,6 +27,16 @@ from .common import yesno ...@@ -34,6 +27,16 @@ from .common import yesno
from best_download import download_file from best_download import download_file
_CITATION = """
@article{hendrycks2021ethics,
title={Aligning AI With Shared Human Values},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
"""
class Ethics(Task):
    def download(self):
        if not os.path.exists('data/ethics/done'):
...
@@ -7,13 +7,6 @@ problem in Math has a full step-by-step solution which can be used to teach
models to generate answer derivations and explanations.
Homepage: https://github.com/hendrycks/math
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the Math Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
""" """
import abc import abc
import json import json
...@@ -24,6 +17,16 @@ from pathlib import Path ...@@ -24,6 +17,16 @@ from pathlib import Path
from best_download import download_file from best_download import download_file
_CITATION = """
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the Math Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
"""
class Math(Task):
    DATASET_PATH = Path('data/MATH')
...
@@ -11,13 +11,6 @@ Hendryck's Test can be used to analyze models across many tasks and to identify
important shortcomings.
Homepage: https://github.com/hendrycks/test
@article{hendryckstest2021,
title={Measuring Massive Multitask Language Understanding},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
""" """
import csv import csv
import random import random
...@@ -26,6 +19,17 @@ from ..utils import sh ...@@ -26,6 +19,17 @@ from ..utils import sh
from pathlib import Path from pathlib import Path
from best_download import download_file from best_download import download_file
_CITATION = """
@article{hendryckstest2021,
title={Measuring Massive Multitask Language Understanding},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
"""
SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
            'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
            'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
...
@@ -11,15 +11,6 @@ cannot simply rely on local context, but must be able to keep track of informati
in the broader discourse.
Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
@misc{
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
title={The LAMBADA dataset},
DOI={10.5281/zenodo.2630551},
publisher={Zenodo},
year={2016},
month={Aug}
}
""" """
import json import json
from lm_eval.base import Task, rf from lm_eval.base import Task, rf
...@@ -29,6 +20,18 @@ from best_download import download_file ...@@ -29,6 +20,18 @@ from best_download import download_file
import os import os
_CITATION = """
@misc{
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
title={The LAMBADA dataset},
DOI={10.5281/zenodo.2630551},
publisher={Zenodo},
year={2016},
month={Aug}
}
"""
class LAMBADA(Task):
    VERSION = 0
    def download(self):
...
"""
The LAMBADA dataset: Word prediction requiring a broad discourse context∗
https://arxiv.org/pdf/1606.06031.pdf
Cloze-style LAMBADA dataset.
LAMBADA is a dataset to evaluate the capabilities of computational models for text
understanding by means of a word prediction task. LAMBADA is a collection of narrative
passages sharing the characteristic that human subjects are able to guess their last
word if they are exposed to the whole passage, but not if they only see the last
sentence preceding the target word. To succeed on LAMBADA, computational models
cannot simply rely on local context, but must be able to keep track of information
in the broader discourse.
Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
"""
import json
from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
@@ -6,6 +21,18 @@ from lm_eval.tasks.lambada import LAMBADA
from best_download import download_file
_CITATION = """
@misc{
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
title={The LAMBADA dataset},
DOI={10.5281/zenodo.2630551},
publisher={Zenodo},
year={2016},
month={Aug}
}
"""
class LAMBADA_cloze(LAMBADA):
    VERSION = 0
    def doc_to_text(self, doc):
...
@@ -12,15 +12,6 @@ cannot simply rely on local context, but must be able to keep track of informati
in the broader discourse.
Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
@misc{
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
title={The LAMBADA dataset},
DOI={10.5281/zenodo.2630551},
publisher={Zenodo},
year={2016},
month={Aug}
}
""" """
from . import lambada from . import lambada
from lm_eval.base import Task, rf from lm_eval.base import Task, rf
...@@ -32,6 +23,18 @@ from functools import partial ...@@ -32,6 +23,18 @@ from functools import partial
import os import os
_CITATION = """
@misc{
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
title={The LAMBADA dataset},
DOI={10.5281/zenodo.2630551},
publisher={Zenodo},
year={2016},
month={Aug}
}
"""
LANGS = ["en", "fr", "de", "it", "es"]
CHECKSUMS = {"en": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226",
             "fr": "941ec6a73dba7dc91c860bf493eb66a527cd430148827a4753a4535a046bf362",
...
@@ -9,20 +9,24 @@ also serve as a benchmark for reinvestigating logical AI under the deep learning
NLP setting.
Homepage: https://github.com/lgw863/LogiQA-dataset
"""
@misc{liu2020logiqa,
title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
year={2020},
eprint={2007.08124},
archivePrefix={arXiv},
primaryClass={cs.CL}
}"""
from lm_eval.base import MultipleChoiceTask
from best_download import download_file
from pathlib import Path
_CITATION = """
@misc{liu2020logiqa,
title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
year={2020},
eprint={2007.08124},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
class LogiQA(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = Path("data/logiqa")
...
@@ -7,21 +7,24 @@ covering multiple math domain categories by modeling operation programs correspo
to word problems in the AQuA dataset (Ling et al., 2017).
Homepage: https://math-qa.github.io/math-QA/
@misc{amini2019mathqa,
title={MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms},
author={Aida Amini and Saadia Gabriel and Peter Lin and Rik Koncel-Kedziorski and Yejin Choi and Hannaneh Hajishirzi},
year={2019},
eprint={1905.13319},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
""" """
import re import re
from lm_eval.base import MultipleChoiceTask from lm_eval.base import MultipleChoiceTask
from . common import HFTask from . common import HFTask
_CITATION = """
@misc{amini2019mathqa,
title={MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms},
author={Aida Amini and Saadia Gabriel and Peter Lin and Rik Koncel-Kedziorski and Yejin Choi and Hannaneh Hajishirzi},
year={2019},
eprint={1905.13319},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
class MathQA(HFTask, MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "math_qa"
...