{"es":{"description":"HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n","citation":"@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n","homepage":"https://aghie.github.io/head-qa/","license":"MIT License","features":{"name":{"dtype":"string","id":null,"_type":"Value"},"year":{"dtype":"string","id":null,"_type":"Value"},"category":{"dtype":"string","id":null,"_type":"Value"},"qid":{"dtype":"int32","id":null,"_type":"Value"},"qtext":{"dtype":"string","id":null,"_type":"Value"},"ra":{"dtype":"int32","id":null,"_type":"Value"},"answers":[{"aid":{"dtype":"int32","id":null,"_type":"Value"},"atext":{"dtype":"string","id":null,"_type":"Value"}}]},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"head_qa","config_name":"es","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1196021,"num_examples":2657,"dataset_name":"head_qa"},"test":{"name":"test","num_bytes":1169819,"num_examples":2742,"dataset_name":"head_qa"},"validation":{"name":"validation","num_bytes":556924,"num_examples":1366,"dataset_name":"head_qa"}},"download_checksums":{"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t":{"num_bytes":79365502,"checksum":"6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}},"download_size":79365502,"post_processing_size":null,"dataset_size":2922764,"size_in_bytes":82288266},"en":{"description":"HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n","citation":"@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n","homepage":"https://aghie.github.io/head-qa/","license":"MIT License","features":{"name":{"dtype":"string","id":null,"_type":"Value"},"year":{"dtype":"string","id":null,"_type":"Value"},"category":{"dtype":"string","id":null,"_type":"Value"},"qid":{"dtype":"int32","id":null,"_type":"Value"},"qtext":{"dtype":"string","id":null,"_type":"Value"},"ra":{"dtype":"int32","id":null,"_type":"Value"},"answers":[{"aid":{"dtype":"int32","id":null,"_type":"Value"},"atext":{"dtype":"string","id":null,"_type":"Value"}}]},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"head_qa","config_name":"en","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1123151,"num_examples":2657,"dataset_name":"head_qa"},"test":{"name":"test","num_bytes":1097349,"num_examples":2742,"dataset_name":"head_qa"},"validation":{"name":"validation","num_bytes":523462,"num_examples":1366,"dataset_name":"head_qa"}},"download_checksums":{"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t":{"num_bytes":79365502,"checksum":"6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}},"download_size":79365502,"post_processing_size":null,"dataset_size":2743962,"size_in_bytes":82109464}}
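For orientation, a minimal sketch (not part of the original files) of reading one HEAD-QA record with the Hugging Face datasets library, assuming the "head_qa" builder described above is resolvable by that name; the field names (qtext, ra, answers with aid/atext) come from the feature schema in the info above.

# Sketch only: load the Spanish HEAD-QA config and print one question with its options.
# Assumes the "head_qa" builder above can be loaded by name (configs "es" and "en").
from datasets import load_dataset

ds = load_dataset("head_qa", "es", split="train")
example = ds[0]
print(example["qtext"])
for answer in example["answers"]:
    marker = "*" if answer["aid"] == example["ra"] else " "  # "ra" is the id of the right answer
    print(f"{marker} {answer['aid']}. {answer['atext']}")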
{"es":{"description":"HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n","citation":"@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n","homepage":"https://aghie.github.io/head-qa/","license":"MIT License","features":{"name":{"dtype":"string","id":null,"_type":"Value"},"year":{"dtype":"string","id":null,"_type":"Value"},"category":{"dtype":"string","id":null,"_type":"Value"},"qid":{"dtype":"int32","id":null,"_type":"Value"},"qtext":{"dtype":"string","id":null,"_type":"Value"},"ra":{"dtype":"int32","id":null,"_type":"Value"},"answers":[{"aid":{"dtype":"int32","id":null,"_type":"Value"},"atext":{"dtype":"string","id":null,"_type":"Value"}}]},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"head_qa","config_name":"es","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1196021,"num_examples":2657,"dataset_name":"head_qa"},"test":{"name":"test","num_bytes":1169819,"num_examples":2742,"dataset_name":"head_qa"},"validation":{"name":"validation","num_bytes":556924,"num_examples":1366,"dataset_name":"head_qa"}},"download_checksums":{"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t":{"num_bytes":79365502,"checksum":"6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}},"download_size":79365502,"post_processing_size":null,"dataset_size":2922764,"size_in_bytes":82288266},"en":{"description":"HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n","citation":"@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n","homepage":"https://aghie.github.io/head-qa/","license":"MIT License","features":{"name":{"dtype":"string","id":null,"_type":"Value"},"year":{"dtype":"string","id":null,"_type":"Value"},"category":{"dtype":"string","id":null,"_type":"Value"},"qid":{"dtype":"int32","id":null,"_type":"Value"},"qtext":{"dtype":"string","id":null,"_type":"Value"},"ra":{"dtype":"int32","id":null,"_type":"Value"},"answers":[{"aid":{"dtype":"int32","id":null,"_type":"Value"},"atext":{"dtype":"string","id":null,"_type":"Value"}}]},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"head_qa","config_name":"en","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1123151,"num_examples":2657,"dataset_name":"head_qa"},"test":{"name":"test","num_bytes":1097349,"num_examples":2742,"dataset_name":"head_qa"},"validation":{"name":"validation","num_bytes":523462,"num_examples":1366,"dataset_name":"head_qa"}},"download_checksums":{"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t":{"num_bytes":79365502,"checksum":"6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}},"download_size":79365502,"post_processing_size":null,"dataset_size":2743962,"size_in_bytes":82109464}}
{"commonsense":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"label":{"dtype":"int32","id":null,"_type":"Value"},"input":{"dtype":"string","id":null,"_type":"Value"},"is_short":{"dtype":"bool","id":null,"_type":"Value"},"edited":{"dtype":"bool","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"commonsense","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":14435215,"num_examples":13910,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":3150094,"num_examples":3885,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":17585309,"size_in_bytes":53170333},"deontology":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"},"excuse":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"deontology","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":1931475,"num_examples":18164,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":384602,"num_examples":3596,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2316077,"size_in_bytes":37901101},"justice":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"justice","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2516501,"num_examples":21791,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":309427,"num_examples":2704,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2825928,"size_in_bytes":38410952},"utilitarianism":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"activity":{"dtype":"string","id":null,"_type":"Value"},"baseline":{"dtype":"string","id":null,"_type":"Value"},"rating":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"utilitarianism","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2241770,"num_examples":13738,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":749768,"num_examples":4808,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2991538,"size_in_bytes":38576562},"virtue":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"},"trait":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"virtue","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2640328,"num_examples":28245,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":473473,"num_examples":4975,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":3113801,"size_in_bytes":38698825}}
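As an illustration of how the differing feature sets above might be consumed, here is a hedged sketch that flattens one example of each classification-style ETHICS config into a single text string; the exact prompt format used in the original paper and evaluation code may differ.

# Illustrative only: flatten an ETHICS example into one input string per config.
# Utilitarianism is excluded because it is a pairwise ranking task (activity vs. baseline).
def to_text(config_name: str, example: dict) -> str:
    if config_name == "commonsense":
        return example["input"]
    if config_name == "deontology":
        return f"{example['scenario']} {example['excuse']}"
    if config_name == "justice":
        return example["scenario"]
    if config_name == "virtue":
        return f"{example['scenario']} Trait: {example['trait']}"
    raise ValueError(f"unsupported config: {config_name}")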
{"commonsense":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"label":{"dtype":"int32","id":null,"_type":"Value"},"input":{"dtype":"string","id":null,"_type":"Value"},"is_short":{"dtype":"bool","id":null,"_type":"Value"},"edited":{"dtype":"bool","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"commonsense","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":14435215,"num_examples":13910,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":3150094,"num_examples":3885,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":17585309,"size_in_bytes":53170333},"deontology":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"},"excuse":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"deontology","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":1931475,"num_examples":18164,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":384602,"num_examples":3596,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2316077,"size_in_bytes":37901101},"justice":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"justice","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2516501,"num_examples":21791,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":309427,"num_examples":2704,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2825928,"size_in_bytes":38410952},"utilitarianism":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"activity":{"dtype":"string","id":null,"_type":"Value"},"baseline":{"dtype":"string","id":null,"_type":"Value"},"rating":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"utilitarianism","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2241770,"num_examples":13738,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":749768,"num_examples":4808,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2991538,"size_in_bytes":38576562},"virtue":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"},"trait":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"virtue","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2640328,"num_examples":28245,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":473473,"num_examples":4975,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":3113801,"size_in_bytes":38698825}}
...
@@ -71,54 +71,64 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
        EthicsConfig(
            name="commonsense",
            prefix="cm",
            features=datasets.Features(
                {
                    "label": datasets.Value("int32"),
                    "input": datasets.Value("string"),
                    "is_short": datasets.Value("bool"),
                    "edited": datasets.Value("bool"),
                }
            ),
            description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.",
        ),
        EthicsConfig(
            name="deontology",
            prefix="deontology",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                    "excuse": datasets.Value("string"),
                }
            ),
            description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints",
        ),
        EthicsConfig(
            name="justice",
            prefix="justice",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                }
            ),
            description="The Justice subset contains examples focusing on how a character treats another person",
        ),
        EthicsConfig(
            name="utilitarianism",
            prefix="util",
            features=datasets.Features(
                {
                    "activity": datasets.Value("string"),
                    "baseline": datasets.Value("string"),
                    "rating": datasets.Value("string"),  # Empty rating.
                }
            ),
            description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario",
        ),
        EthicsConfig(
            name="virtue",
            prefix="virtue",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                    "trait": datasets.Value("string"),
                }
            ),
            description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified",
        ),
    ]
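The hunk above only reformats the BUILDER_CONFIGS list; the EthicsConfig class itself lies outside the shown context. A minimal sketch of what such a config class typically looks like, consistent with the prefix and features arguments used above (the real class in the repository may differ in details such as version handling):

import datasets


class EthicsConfig(datasets.BuilderConfig):
    """BuilderConfig for one ETHICS subset (sketch; the actual definition may differ)."""

    def __init__(self, prefix, features, **kwargs):
        super().__init__(version=datasets.Version("0.0.1"), **kwargs)
        self.prefix = prefix      # file-name prefix inside ethics.tar, e.g. "cm" for cm_train.csv
        self.features = features  # datasets.Features schema for this subset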
...
@@ -140,7 +150,12 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
title={The LAMBADA dataset},
DOI={10.5281/zenodo.2630551},
publisher={Zenodo},
...
@@ -62,12 +62,34 @@ class Lambada(datasets.GeneratorBasedBuilder):
{"logiqa":{"description":"LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n","citation":"@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/lgw863/LogiQA-dataset","license":"","features":{"label":{"dtype":"string","id":null,"_type":"Value"},"context":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"logiqa","config_name":"logiqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":6419852,"num_examples":7376,"dataset_name":"logiqa"},"test":{"name":"test","num_bytes":571705,"num_examples":651,"dataset_name":"logiqa"},"validation":{"name":"validation","num_bytes":562437,"num_examples":651,"dataset_name":"logiqa"}},"download_checksums":{"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt":{"num_bytes":6281272,"checksum":"7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt":{"num_bytes":559060,"checksum":"359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt":{"num_bytes":550021,"checksum":"4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}},"download_size":7390353,"post_processing_size":null,"dataset_size":7553994,"size_in_bytes":14944347}}
{"logiqa":{"description":"LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n","citation":"@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/lgw863/LogiQA-dataset","license":"","features":{"label":{"dtype":"string","id":null,"_type":"Value"},"context":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"logiqa","config_name":"logiqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":6419852,"num_examples":7376,"dataset_name":"logiqa"},"test":{"name":"test","num_bytes":571705,"num_examples":651,"dataset_name":"logiqa"},"validation":{"name":"validation","num_bytes":562437,"num_examples":651,"dataset_name":"logiqa"}},"download_checksums":{"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt":{"num_bytes":6281272,"checksum":"7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt":{"num_bytes":559060,"checksum":"359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt":{"num_bytes":550021,"checksum":"4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}},"download_size":7390353,"post_processing_size":null,"dataset_size":7553994,"size_in_bytes":14944347}}
{"mutual":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":5141602,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":634396,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":624271,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6400269,"size_in_bytes":17398147},"mutual_plus":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual_plus","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":4921179,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":606620,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":597340,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6125139,"size_in_bytes":17123017}}
{"mutual":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":5141602,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":634396,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":624271,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6400269,"size_in_bytes":17398147},"mutual_plus":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual_plus","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":4921179,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":606620,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":597340,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6125139,"size_in_bytes":17123017}}
datasets.BuilderConfig(name="mutual_plus",version=VERSION,description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses."),
{"quac":{"description":"Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n","citation":"@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n","homepage":"https://quac.ai/","license":"","features":{"title":{"dtype":"string","id":null,"_type":"Value"},"section_title":{"dtype":"string","id":null,"_type":"Value"},"paragraph":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"quac","config_name":"quac","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":212391958,"num_examples":83568,"dataset_name":"quac"},"validation":{"name":"validation","num_bytes":20678483,"num_examples":7354,"dataset_name":"quac"}},"download_checksums":{"https://s3.amazonaws.com/my89public/quac/train_v0.2.json":{"num_bytes":68114819,"checksum":"ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"},"https://s3.amazonaws.com/my89public/quac/val_v0.2.json":{"num_bytes":8929167,"checksum":"09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}},"download_size":77043986,"post_processing_size":null,"dataset_size":233070441,"size_in_bytes":310114427}}
{"quac":{"description":"Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n","citation":"@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n","homepage":"https://quac.ai/","license":"","features":{"title":{"dtype":"string","id":null,"_type":"Value"},"section_title":{"dtype":"string","id":null,"_type":"Value"},"paragraph":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"quac","config_name":"quac","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":212391958,"num_examples":83568,"dataset_name":"quac"},"validation":{"name":"validation","num_bytes":20678483,"num_examples":7354,"dataset_name":"quac"}},"download_checksums":{"https://s3.amazonaws.com/my89public/quac/train_v0.2.json":{"num_bytes":68114819,"checksum":"ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"},"https://s3.amazonaws.com/my89public/quac/val_v0.2.json":{"num_bytes":8929167,"checksum":"09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}},"download_size":77043986,"post_processing_size":null,"dataset_size":233070441,"size_in_bytes":310114427}}
{"triviaqa":{"description":"TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n","citation":"@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n","homepage":"https://nlp.cs.washington.edu/triviaqa/","license":"Apache License 2.0","features":{"question_id":{"dtype":"string","id":null,"_type":"Value"},"question_source":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"aliases":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"value":{"dtype":"string","id":null,"_type":"Value"}},"search_results":{"feature":{"description":{"dtype":"string","id":null,"_type":"Value"},"filename":{"dtype":"string","id":null,"_type":"Value"},"rank":{"dtype":"int32","id":null,"_type":"Value"},"title":{"dtype":"string","id":null,"_type":"Value"},"url":{"dtype":"string","id":null,"_type":"Value"},"search_context":{"dtype":"string","id":null,"_type":"Value"}},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"triviaqa","config_name":"triviaqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":1271393601,"num_examples":87622,"dataset_name":"triviaqa"},"validation":{"name":"validation","num_bytes":163819509,"num_examples":11313,"dataset_name":"triviaqa"}},"download_checksums":{"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz":{"num_bytes":546481381,"checksum":"adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}},"download_size":546481381,"post_processing_size":null,"dataset_size":1435213110,"size_in_bytes":1981694491}}
{"triviaqa":{"description":"TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n","citation":"@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n","homepage":"https://nlp.cs.washington.edu/triviaqa/","license":"Apache License 2.0","features":{"question_id":{"dtype":"string","id":null,"_type":"Value"},"question_source":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"aliases":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"value":{"dtype":"string","id":null,"_type":"Value"}},"search_results":{"feature":{"description":{"dtype":"string","id":null,"_type":"Value"},"filename":{"dtype":"string","id":null,"_type":"Value"},"rank":{"dtype":"int32","id":null,"_type":"Value"},"title":{"dtype":"string","id":null,"_type":"Value"},"url":{"dtype":"string","id":null,"_type":"Value"},"search_context":{"dtype":"string","id":null,"_type":"Value"}},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"triviaqa","config_name":"triviaqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":1271393601,"num_examples":87622,"dataset_name":"triviaqa"},"validation":{"name":"validation","num_bytes":163819509,"num_examples":11313,"dataset_name":"triviaqa"}},"download_checksums":{"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz":{"num_bytes":546481381,"checksum":"adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}},"download_size":546481381,"post_processing_size":null,"dataset_size":1435213110,"size_in_bytes":1981694491}}
{"multiple_choice":{"description":"TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task","citation":"@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/sylinrl/TruthfulQA","license":"","features":{"question":{"dtype":"string","id":null,"_type":"Value"},"mc1_targets":{"choices":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"labels":{"feature":{"dtype":"int32","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"mc2_targets":{"choices":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"labels":{"feature":{"dtype":"int32","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"truthfulqa","config_name":"multiple_choice","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":610333,"num_examples":817,"dataset_name":"truthfulqa"}},"download_checksums":{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json":{"num_bytes":710607,"checksum":"6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}},"download_size":710607,"post_processing_size":null,"dataset_size":610333,"size_in_bytes":1320940},"generation":{"description":"TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. 
To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task","citation":"@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/sylinrl/TruthfulQA","license":"","features":{"category":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"best_answer":{"dtype":"string","id":null,"_type":"Value"},"correct_answers":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"incorrect_answers":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"source":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"truthfulqa","config_name":"generation","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":463860,"num_examples":817,"dataset_name":"truthfulqa"}},"download_checksums":{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv":{"num_bytes":443723,"checksum":"8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}},"download_size":443723,"post_processing_size":null,"dataset_size":463860,"size_in_bytes":907583}}
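Given the mc1_targets/mc2_targets schema above (parallel choices and 0/1 labels), a sketch of the two standard multiple-choice scores: MC1 checks whether the single highest-scoring choice is labeled correct, and MC2 is the normalized probability mass assigned to the correct choices. The per-choice scores are assumed to come from some model, e.g. log-likelihoods of each choice given the question.

import math
from typing import Sequence

def mc1(scores: Sequence[float], labels: Sequence[int]) -> float:
    # 1.0 if the top-scoring choice is labeled correct (label == 1), else 0.0.
    best = max(range(len(scores)), key=lambda i: scores[i])
    return float(labels[best] == 1)

def mc2(log_likelihoods: Sequence[float], labels: Sequence[int]) -> float:
    # Probability mass on the correct choices, normalized over all choices.
    probs = [math.exp(ll) for ll in log_likelihoods]
    correct = sum(p for p, y in zip(probs, labels) if y == 1)
    return correct / sum(probs)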
{"multiple_choice":{"description":"TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task","citation":"@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/sylinrl/TruthfulQA","license":"","features":{"question":{"dtype":"string","id":null,"_type":"Value"},"mc1_targets":{"choices":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"labels":{"feature":{"dtype":"int32","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"mc2_targets":{"choices":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"labels":{"feature":{"dtype":"int32","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"truthfulqa","config_name":"multiple_choice","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":610333,"num_examples":817,"dataset_name":"truthfulqa"}},"download_checksums":{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json":{"num_bytes":710607,"checksum":"6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}},"download_size":710607,"post_processing_size":null,"dataset_size":610333,"size_in_bytes":1320940},"generation":{"description":"TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. 
To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task","citation":"@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/sylinrl/TruthfulQA","license":"","features":{"category":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"best_answer":{"dtype":"string","id":null,"_type":"Value"},"correct_answers":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"incorrect_answers":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"source":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"truthfulqa","config_name":"generation","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":463860,"num_examples":817,"dataset_name":"truthfulqa"}},"download_checksums":{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv":{"num_bytes":443723,"checksum":"8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}},"download_size":443723,"post_processing_size":null,"dataset_size":463860,"size_in_bytes":907583}}