Commit 1f8a8c1d authored by jon-tow

Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into remove-dataset

parents b4c0275d b0acb337
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,7 +13,7 @@
# limitations under the License.
#
# NOTE: This is an exact copy of
# https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py
# with the exception of the `image` feature. This is to avoid adding `Pillow`
# as a dependency.
"""HEAD-QA: A Healthcare Dataset for Complex Reasoning."""
@@ -65,8 +64,12 @@ class HeadQA(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="es", version=VERSION, description="Spanish HEAD dataset"
        ),
        datasets.BuilderConfig(
            name="en", version=VERSION, description="English HEAD dataset"
        ),
    ]

    DEFAULT_CONFIG_NAME = "es"
@@ -106,15 +109,24 @@ class HeadQA(datasets.GeneratorBasedBuilder):
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_lang_dir, f"train_{dir}.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_lang_dir, f"test_{dir}.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json"),
                },
            ),
        ]
@@ -134,7 +146,9 @@ class HeadQA(datasets.GeneratorBasedBuilder):
        aids = [answer["aid"] for answer in question["answers"]]
        atexts = [answer["atext"].strip() for answer in question["answers"]]
        answers = [
            {"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)
        ]
        id_ = f"{exam_id}_{qid}"
        yield id_, {
...
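For reference, a minimal usage sketch of the builder above (the local script path is a placeholder, not part of this commit), loading one of the two configs defined in BUILDER_CONFIGS:

import datasets

# Hypothetical path to the HEAD-QA script shown in this diff.
head_qa = datasets.load_dataset("path/to/headqa.py", name="en")
print(head_qa["train"].num_rows)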
{"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}} {"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}}
\ No newline at end of file
@@ -71,54 +71,64 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
        EthicsConfig(
            name="commonsense",
            prefix="cm",
            features=datasets.Features(
                {
                    "label": datasets.Value("int32"),
                    "input": datasets.Value("string"),
                    "is_short": datasets.Value("bool"),
                    "edited": datasets.Value("bool"),
                }
            ),
            description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.",
        ),
        EthicsConfig(
            name="deontology",
            prefix="deontology",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                    "excuse": datasets.Value("string"),
                }
            ),
            description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints",
        ),
        EthicsConfig(
            name="justice",
            prefix="justice",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                }
            ),
            description="The Justice subset contains examples focusing on how a character treats another person",
        ),
        EthicsConfig(
            name="utilitarianism",
            prefix="util",
            features=datasets.Features(
                {
                    "activity": datasets.Value("string"),
                    "baseline": datasets.Value("string"),
                    "rating": datasets.Value("string"),  # Empty rating.
                }
            ),
            description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario",
        ),
        EthicsConfig(
            name="virtue",
            prefix="virtue",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                    "trait": datasets.Value("string"),
                }
            ),
            description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified",
        ),
    ]
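The list above uses a custom EthicsConfig rather than a plain BuilderConfig. A minimal sketch of what such a subclass typically looks like (an assumption; the real class is defined earlier in the file, outside this hunk):

import datasets

class EthicsConfig(datasets.BuilderConfig):
    def __init__(self, prefix, features, **kwargs):
        # `prefix` picks the CSV file stem (e.g. "cm" -> cm_train.csv);
        # `features` carries the per-subset schema consumed by _info().
        super().__init__(version=datasets.Version("0.0.1"), **kwargs)
        self.prefix = prefix
        self.features = features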
@@ -140,7 +150,12 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir,
                        "ethics",
                        self.config.name,
                        f"{self.config.prefix}_train.csv",
                    ),
                    "split": "train",
                },
            ),
@@ -148,18 +163,22 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir,
                        "ethics",
                        self.config.name,
                        f"{self.config.prefix}_test.csv",
                    ),
                    "split": "test",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        with open(filepath, newline="") as f:
            if self.config.name == "utilitarianism":
                contents = csv.DictReader(f, fieldnames=["activity", "baseline"])
            else:
                contents = csv.DictReader(f)
            # For subsets with grouped scenarios, tag them with an id.
...
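The fieldnames branch above exists because the utilitarianism CSV presumably ships without a header row, while the other subsets have one. A self-contained illustration of the difference (inline toy data, not the real files):

import csv
import io

# Headerless CSV: DictReader must be given the column names explicitly.
headerless = io.StringIO("I walked the dog.,I ignored the dog.\n")
print(next(csv.DictReader(headerless, fieldnames=["activity", "baseline"])))

# CSV with a header row: DictReader reads the field names from line one.
with_header = io.StringIO("label,scenario\n1,I lied to a friend.\n")
print(next(csv.DictReader(with_header)))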
@@ -44,13 +44,13 @@ _LICENSE = ""
_URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"

_NAMES = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
]
@@ -89,7 +89,9 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "basepath": os.path.join(
                        data_dir, "MATH", "train", self.config.name
                    ),
                    "split": "train",
                },
            ),
@@ -97,8 +99,10 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "basepath": os.path.join(
                        data_dir, "MATH", "test", self.config.name
                    ),
                    "split": "test",
                },
            ),
        ]
@@ -107,7 +111,7 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
    def _generate_examples(self, basepath, split):
        key = 0
        for file in sorted(pathlib.Path(basepath).iterdir()):
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)
            yield key, {
                "problem": data["problem"],
...
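Each MATH problem lives in its own JSON file, one file per example; a sketch of what json.load hands back to the generator (keys beyond "problem" are assumptions based on the published dataset):

import json

raw = '{"problem": "What is 2 + 2?", "level": "Level 1", "type": "Prealgebra", "solution": "4"}'
data = json.loads(raw)
print(data["problem"])  # the field the generator yields above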
@@ -22,7 +22,7 @@ import datasets
_CITATION = """\
@misc{
    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
    title={The LAMBADA dataset},
    DOI={10.5281/zenodo.2630551},
    publisher={Zenodo},
@@ -62,12 +62,34 @@ class Lambada(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="original", version=VERSION, description="The LAMBADA dataset"
        ),
        datasets.BuilderConfig(
            name="en",
            version=VERSION,
            description="The English translated LAMBADA dataset",
        ),
        datasets.BuilderConfig(
            name="fr",
            version=VERSION,
            description="The French translated LAMBADA dataset",
        ),
        datasets.BuilderConfig(
            name="de",
            version=VERSION,
            description="The German translated LAMBADA dataset",
        ),
        datasets.BuilderConfig(
            name="it",
            version=VERSION,
            description="The Italian translated LAMBADA dataset",
        ),
        datasets.BuilderConfig(
            name="es",
            version=VERSION,
            description="The Spanish translated LAMBADA dataset",
        ),
    ]

    DEFAULT_CONFIG_NAME = "original"
@@ -105,6 +127,4 @@ class Lambada(datasets.GeneratorBasedBuilder):
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                data = json.loads(row)
                yield key, {"text": data["text"]}
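The LAMBADA files are read as JSON Lines, one {"text": ...} object per line; the loop above is equivalent to this self-contained sketch over inline data:

import io
import json

f = io.StringIO('{"text": "first passage"}\n{"text": "second passage"}\n')
for key, row in enumerate(f):
    print(key, json.loads(row)["text"])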
{"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}} {"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. 
The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}}
\ No newline at end of file
@@ -19,7 +19,7 @@ import datasets
_CITATION = """\
@misc{liu2020logiqa,
    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
    author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
    year={2020},
    eprint={2007.08124},
@@ -54,7 +54,9 @@ class Logiqa(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="logiqa", version=VERSION, description="The LogiQA dataset."
        ),
    ]

    def _info(self):
@@ -63,9 +65,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
                "label": datasets.Value("string"),
                "context": datasets.Value("string"),
                "question": datasets.Value("string"),
                "options": datasets.features.Sequence(datasets.Value("string")),
            }
        )
        return datasets.DatasetInfo(
@@ -77,7 +77,11 @@ class Logiqa(datasets.GeneratorBasedBuilder):
        )

    def _split_generators(self, dl_manager):
        urls = {
            "train": _URLS["train"],
            "test": _URLS["test"],
            "validation": _URLS["validation"],
        }
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
@@ -91,10 +95,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": data_dir["test"], "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
@@ -110,6 +111,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
    def _generate_examples(self, filepath, split):
        def normalize(text):
            return text.replace(".", ". ").strip()

        with open(filepath, encoding="utf-8") as f:
            data = f.read().strip().split("\n\n")
            for key, row in enumerate(data):
...
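The split("\n\n") above works because the raw LogiQA files separate records with blank lines; a toy illustration (the exact per-record layout is an assumption, not shown in this hunk):

text = "a\nContext one.\nQuestion one?\nA. x\nB. y\n\nb\nContext two.\nQuestion two?\nA. p\nB. q"
for key, row in enumerate(text.strip().split("\n\n")):
    lines = row.split("\n")
    print(key, {"label": lines[0], "context": lines[1], "question": lines[2], "options": lines[3:]})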
{"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": 
"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}} {"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": 
"mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}}
\ No newline at end of file
@@ -50,8 +50,14 @@ class Mutual(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="mutual", version=VERSION, description="The MuTual dataset."
        ),
        datasets.BuilderConfig(
            name="mutual_plus",
            version=VERSION,
            description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.",
        ),
    ]

    def _info(self):
@@ -79,7 +85,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "basepath": os.path.join(
                        data_dir, "MuTual-master", "data", self.config.name, "train"
                    ),
                    "split": "train",
                },
            ),
@@ -87,7 +95,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "basepath": os.path.join(
                        data_dir, "MuTual-master", "data", self.config.name, "test"
                    ),
                    "split": "test",
                },
            ),
@@ -95,7 +105,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "basepath": os.path.join(
                        data_dir, "MuTual-master", "data", self.config.name, "dev"
                    ),
                    "split": "dev",
                },
            ),
@@ -109,7 +121,7 @@ class Mutual(datasets.GeneratorBasedBuilder):
            for file in sorted(Path(basepath).iterdir()):
                if file.suffix != ".txt":
                    continue
                with open(file, "r", encoding="utf-8") as f:
                    data_str = f.read()
                # Ignore the occasional empty file.
                if not data_str:
...
@@ -103,10 +103,7 @@ class Pile(datasets.GeneratorBasedBuilder):
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": data_dir["test"], "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
...
{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}} {"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. 
Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}}
\ No newline at end of file
@@ -30,7 +30,7 @@ _CITATION = """\
"""

_DESCRIPTION = """\
Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
participating in information seeking dialog. Data instances consist of an interactive
dialog between two crowd workers: (1) a student who poses a sequence of freeform
questions to learn as much as possible about a hidden Wikipedia text, and (2)
...@@ -54,7 +54,9 @@ class Quac(datasets.GeneratorBasedBuilder): ...@@ -54,7 +54,9 @@ class Quac(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("1.1.0") VERSION = datasets.Version("1.1.0")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name="quac", version=VERSION, description="The QuAC dataset"), datasets.BuilderConfig(
name="quac", version=VERSION, description="The QuAC dataset"
),
] ]
def _info(self): def _info(self):
...@@ -90,10 +92,7 @@ class Quac(datasets.GeneratorBasedBuilder): ...@@ -90,10 +92,7 @@ class Quac(datasets.GeneratorBasedBuilder):
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.VALIDATION, name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
"filepath": data_dir["validation"],
"split": "validation"
},
), ),
] ]
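For context on the gen_kwargs collapse above: datasets unpacks each SplitGenerator's gen_kwargs dict as keyword arguments when it calls the builder's _generate_examples, so the keys must match that method's parameter names. A minimal sketch of the matching signature, with parameter names taken from the hunk above and an illustrative body only:

def _generate_examples(self, filepath, split):
    # "filepath" and "split" mirror the gen_kwargs keys; datasets invokes
    # this method once per SplitGenerator with the dict unpacked.
    with open(filepath, encoding="utf-8") as f:
        ...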
...@@ -105,7 +104,7 @@ class Quac(datasets.GeneratorBasedBuilder): ...@@ -105,7 +104,7 @@ class Quac(datasets.GeneratorBasedBuilder):
for row in data: for row in data:
paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "") paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
qas = row["paragraphs"][0]["qas"] qas = row["paragraphs"][0]["qas"]
qa_pairs = [(qa['question'], qa['answers'][0]['text']) for qa in qas] qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas]
for (question, answer) in qa_pairs: for (question, answer) in qa_pairs:
# Yields examples as (key, example) tuples # Yields examples as (key, example) tuples
yield key, { yield key, {
......
...@@ -44,13 +44,16 @@ _LICENSE = "" ...@@ -44,13 +44,16 @@ _LICENSE = ""
class SatAnalogies(datasets.GeneratorBasedBuilder): class SatAnalogies(datasets.GeneratorBasedBuilder):
""" SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions. """ """SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions."""
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name="sat_analogies", version=VERSION, datasets.BuilderConfig(
description="The SAT Analogy Questions dataset"), name="sat_analogies",
version=VERSION,
description="The SAT Analogy Questions dataset",
),
] ]
@property @property
...@@ -58,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder): ...@@ -58,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
return ( return (
"To use SAT Analogy Questions you have to download it manually. Please " "To use SAT Analogy Questions you have to download it manually. Please "
"email Peter Turney to request the data (https://www.apperceptual.com). " "email Peter Turney to request the data (https://www.apperceptual.com). "
"Once you recieve a download link for the dataset, supply the local path " "Once you receive a download link for the dataset, supply the local path "
"as the `data_dir` arg: " "as the `data_dir` arg: "
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`" "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
) )
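The string above is the manual-download prompt datasets shows when the files are absent; a minimal usage sketch following those instructions (the data_dir path is a placeholder, not a real location):

import datasets

# Request the data from Peter Turney first, then point data_dir at the
# folder you received; the path below is illustrative only.
sat = datasets.load_dataset("sat_analogies", data_dir="path/to/folder/folder_name")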
...@@ -68,9 +71,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder): ...@@ -68,9 +71,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
{ {
"source": datasets.Value("string"), "source": datasets.Value("string"),
"stem": datasets.Value("string"), "stem": datasets.Value("string"),
"choices": datasets.features.Sequence( "choices": datasets.features.Sequence(datasets.Value("string")),
datasets.Value("string")
),
"solution": datasets.Value("string"), "solution": datasets.Value("string"),
} }
) )
...@@ -108,7 +109,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder): ...@@ -108,7 +109,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
if len(line) == 0 and record: if len(line) == 0 and record:
data.append(record) data.append(record)
record = [] record = []
elif len(line) > 0 and line[0] == '#': elif len(line) > 0 and line[0] == "#":
# Skip comments. # Skip comments.
continue continue
else: else:
...@@ -120,8 +121,8 @@ class SatAnalogies(datasets.GeneratorBasedBuilder): ...@@ -120,8 +121,8 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
choices = record[-6:-1] choices = record[-6:-1]
solution = record[-1] solution = record[-1]
yield key, { yield key, {
'source': source, "source": source,
'stem': stem, "stem": stem,
'choices': choices, "choices": choices,
'solution': solution, "solution": solution,
} }
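To make the negative-index slicing above concrete, here is a toy record under the layout the parser assumes (five choices followed by a one-letter solution; the strings are invented for illustration):

record = [
    "ostrich:bird",    # stem (any earlier fields, such as the source line, omitted here)
    "lion:cat",        # choice a
    "goose:flock",     # choice b
    "ewe:sheep",       # choice c
    "cub:bear",        # choice d
    "primate:monkey",  # choice e
    "a",               # solution letter
]
choices = record[-6:-1]  # the five answer options
solution = record[-1]    # "a"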
{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}} {"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}
\ No newline at end of file
...@@ -50,13 +50,14 @@ _URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz" ...@@ -50,13 +50,14 @@ _URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz"
class Triviaqa(datasets.GeneratorBasedBuilder): class Triviaqa(datasets.GeneratorBasedBuilder):
""" TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples """ """TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples"""
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig( datasets.BuilderConfig(
name="triviaqa", version=VERSION, description="The TriviaQA dataset"), name="triviaqa", version=VERSION, description="The TriviaQA dataset"
),
] ]
def _info(self): def _info(self):
...@@ -66,10 +67,10 @@ class Triviaqa(datasets.GeneratorBasedBuilder): ...@@ -66,10 +67,10 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
"question_source": datasets.Value("string"), "question_source": datasets.Value("string"),
"question": datasets.Value("string"), "question": datasets.Value("string"),
"answer": { "answer": {
"aliases": datasets.features.Sequence( "aliases": datasets.features.Sequence(
datasets.Value("string"), datasets.Value("string"),
), ),
"value": datasets.Value("string") "value": datasets.Value("string"),
}, },
"search_results": datasets.features.Sequence( "search_results": datasets.features.Sequence(
{ {
...@@ -120,12 +121,24 @@ class Triviaqa(datasets.GeneratorBasedBuilder): ...@@ -120,12 +121,24 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
for search_result in data["SearchResults"]: for search_result in data["SearchResults"]:
search_results.append( search_results.append(
{ {
"description": search_result["Description"] if "Description" in search_result else "", "description": search_result["Description"]
"filename": search_result["Filename"] if "Filename" in search_result else "", if "Description" in search_result
"rank": search_result["Rank"] if "Rank" in search_result else -1, else "",
"title": search_result["Title"] if "Title" in search_result else "", "filename": search_result["Filename"]
"url": search_result["Url"] if "Url" in search_result else "", if "Filename" in search_result
"search_context": search_result["SearchContext"] if "SearchContext" in search_result else "", else "",
"rank": search_result["Rank"]
if "Rank" in search_result
else -1,
"title": search_result["Title"]
if "Title" in search_result
else "",
"url": search_result["Url"]
if "Url" in search_result
else "",
"search_context": search_result["SearchContext"]
if "SearchContext" in search_result
else "",
} }
) )
yield key, { yield key, {
......
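The reflowed conditionals in the hunk above only substitute defaults for missing keys; a behaviorally equivalent sketch using dict.get, shown purely for illustration (the helper name is hypothetical, not part of the commit):

def normalize_search_result(search_result):
    # dict.get returns the fallback when the key is absent, matching the
    # "value if key in dict else default" pattern in the hunk above.
    return {
        "description": search_result.get("Description", ""),
        "filename": search_result.get("Filename", ""),
        "rank": search_result.get("Rank", -1),
        "title": search_result.get("Title", ""),
        "url": search_result.get("Url", ""),
        "search_context": search_result.get("SearchContext", ""),
    }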
...@@ -64,8 +64,9 @@ class Unscramble(datasets.GeneratorBasedBuilder): ...@@ -64,8 +64,9 @@ class Unscramble(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name=name, version=version, datasets.BuilderConfig(
description=_DESCRIPTIONS[name]) name=name, version=version, description=_DESCRIPTIONS[name]
)
for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
] ]
......
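Since every config built by the comprehension above shares the same VERSION, zipping _NAMES against [VERSION] * len(_NAMES) pairs each task name with that one version; a sketch of an equivalent simpler form, assuming _NAMES and _DESCRIPTIONS as defined earlier in the file:

BUILDER_CONFIGS = [
    datasets.BuilderConfig(name=name, version=VERSION, description=_DESCRIPTIONS[name])
    for name in _NAMES
]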