"vscode:/vscode.git/clone" did not exist on "b04cd3d48794950c2b3f5d48a0ef75fedef8931d"
Commit a6b358ca authored by Rayyyyy

update version

parent ed53d51c
{"default": {"description": "\nHellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.\n", "citation": "@inproceedings{zellers2019hellaswag,\n title={HellaSwag: Can a Machine Really Finish Your Sentence?},\n author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},\n booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},\n year={2019}\n}\n", "homepage": "https://rowanzellers.com/hellaswag/", "license": "", "features": {"ind": {"dtype": "int32", "id": null, "_type": "Value"}, "activity_label": {"dtype": "string", "id": null, "_type": "Value"}, "ctx_a": {"dtype": "string", "id": null, "_type": "Value"}, "ctx_b": {"dtype": "string", "id": null, "_type": "Value"}, "ctx": {"dtype": "string", "id": null, "_type": "Value"}, "endings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_id": {"dtype": "string", "id": null, "_type": "Value"}, "split": {"dtype": "string", "id": null, "_type": "Value"}, "split_type": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hellaswag", "config_name": "default", "version": {"version_str": "0.1.0", "description": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 43232624, "num_examples": 39905, "dataset_name": "hellaswag"}, "test": {"name": "test", "num_bytes": 10791853, "num_examples": 10003, "dataset_name": "hellaswag"}, "validation": {"name": "validation", "num_bytes": 11175717, "num_examples": 10042, "dataset_name": "hellaswag"}}, "download_checksums": {"https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl": {"num_bytes": 47496131, "checksum": "dae5e69249868cb9fe4e23ff925c60b66169564cfb7072d793cd7356a2b69f8d"}, "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_test.jsonl": {"num_bytes": 11752147, "checksum": "da082b00543e422b8d25394614d102944586986def4de5cd1bd36d86bcb76261"}, "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl": {"num_bytes": 12246618, "checksum": "0aa3b88843990f3f10a97b9575c94d7b71fb2205240ba04ae4884d9e9c992588"}}, "download_size": 71494896, "post_processing_size": null, "dataset_size": 65200194, "size_in_bytes": 136695090}}
\ No newline at end of file
"""TODO(hellaswag): Add a description here."""
import json
import datasets
# TODO(hellaswag): BibTeX citation
_CITATION = """\
@inproceedings{zellers2019hellaswag,
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
year={2019}
}
"""
_DESCRIPTION = """
HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.
"""
# _URL = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/"
_URL = "https://github.com/rowanz/hellaswag/raw/master/data/"
_URLS = {
"train": _URL + "hellaswag_train.jsonl",
"test": _URL + "hellaswag_test.jsonl",
"dev": _URL + "hellaswag_val.jsonl",
}
class Hellaswag(datasets.GeneratorBasedBuilder):
"""TODO(hellaswag): Short description of my dataset."""
# TODO(hellaswag): Set up version.
VERSION = datasets.Version("0.1.0")
def _info(self):
# TODO(hellaswag): Specifies the datasets.DatasetInfo object
return datasets.DatasetInfo(
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# datasets.features.FeatureConnectors
features=datasets.Features(
{
# These are the features of your dataset like images, labels ...
"ind": datasets.Value("int32"),
"activity_label": datasets.Value("string"),
"ctx_a": datasets.Value("string"),
"ctx_b": datasets.Value("string"),
"ctx": datasets.Value("string"),
"endings": datasets.features.Sequence(datasets.Value("string")),
"source_id": datasets.Value("string"),
"split": datasets.Value("string"),
"split_type": datasets.Value("string"),
"label": datasets.Value("string"),
}
),
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
# Homepage of the dataset for documentation
homepage="https://rowanzellers.com/hellaswag/",
citation=_CITATION,
)
def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
# TODO(hellaswag): Downloads the data and defines the splits
# dl_manager is a datasets.download.DownloadManager that can be used to
# download and extract URLs
urls_to_download = _URLS
dl_dir = dl_manager.download_and_extract(urls_to_download)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": dl_dir["train"]},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": dl_dir["test"]},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": dl_dir["dev"]},
),
]
def _generate_examples(self, filepath):
"""Yields examples."""
# TODO(hellaswag): Yields (key, example) tuples from the dataset
try:
with open(filepath, encoding="utf-8") as f:
for id_, row in enumerate(f):
data = json.loads(row)
yield id_, {
"ind": int(data["ind"]),
"activity_label": data["activity_label"],
"ctx_a": data.get("ctx_a", ""),
"ctx_b": data.get("ctx_b", ""),
"ctx": data["ctx"],
"endings": data.get("endings", []),
"source_id": data["source_id"],
"split": data["split"],
"split_type": data["split_type"],
"label": str(data.get("label", "")),
}
except Exception as e:
print("### error in _generate_examples:", e)
# Re-raise so malformed rows are not silently swallowed and the split is not truncated.
raise
{"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ETHICS dataset."""
# TODO: Add the `hard` dataset splits.
import csv
import os
import datasets
_CITATION = """\
@article{hendrycks2021ethics,
title={Aligning AI With Shared Human Values},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
"""
_DESCRIPTION = """\
The ETHICS dataset is a benchmark that spans concepts in justice, well-being,
duties, virtues, and commonsense morality. Models predict widespread moral
judgments about diverse text scenarios. This requires connecting physical and
social world knowledge to value judgements, a capability that may enable us
to steer chatbot outputs or eventually regularize open-ended reinforcement
learning agents.
"""
_HOMEPAGE = "https://github.com/hendrycks/ethics"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = "https://people.eecs.berkeley.edu/~hendrycks/ethics.tar"
class EthicsConfig(datasets.BuilderConfig):
"""BuilderConfig for Hendrycks ETHICS."""
def __init__(self, prefix, features, **kwargs):
"""BuilderConfig for Hendrycks ETHICS.
Args:
prefix: *string*, prefix to add to the dataset name for path location.
features: *list[string]*, list of the features that will appear in the
feature dict.
"""
# Version history:
super().__init__(version=datasets.Version("0.0.1"), **kwargs)
self.prefix = prefix
self.features = features
class HendrycksEthics(datasets.GeneratorBasedBuilder):
"""The ETHICS dataset is a benchmark that spans concepts in justice, well-being, duties, virtues, and commonsense morality."""
BUILDER_CONFIGS = [
EthicsConfig(
name="commonsense",
prefix="cm",
features=datasets.Features(
{
"label": datasets.Value("int32"),
"input": datasets.Value("string"),
"is_short": datasets.Value("bool"),
"edited": datasets.Value("bool"),
}
),
description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.",
),
EthicsConfig(
name="deontology",
prefix="deontology",
features=datasets.Features(
{
"group_id": datasets.Value("int32"),
"label": datasets.Value("int32"),
"scenario": datasets.Value("string"),
"excuse": datasets.Value("string"),
}
),
description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints",
),
EthicsConfig(
name="justice",
prefix="justice",
features=datasets.Features(
{
"group_id": datasets.Value("int32"),
"label": datasets.Value("int32"),
"scenario": datasets.Value("string"),
}
),
description="The Justice subset contains examples focusing on how a character treats another person",
),
EthicsConfig(
name="utilitarianism",
prefix="util",
features=datasets.Features(
{
"activity": datasets.Value("string"),
"baseline": datasets.Value("string"),
"rating": datasets.Value("string"), # Empty rating.
}
),
description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario",
),
EthicsConfig(
name="virtue",
prefix="virtue",
features=datasets.Features(
{
"group_id": datasets.Value("int32"),
"label": datasets.Value("int32"),
"scenario": datasets.Value("string"),
"trait": datasets.Value("string"),
}
),
description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified",
),
]
def _info(self):
return datasets.DatasetInfo(
description=f"{_DESCRIPTION}\n{self.config.description}",
features=self.config.features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = _URLS
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": os.path.join(
data_dir,
"ethics",
self.config.name,
f"{self.config.prefix}_train.csv",
),
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": os.path.join(
data_dir,
"ethics",
self.config.name,
f"{self.config.prefix}_test.csv",
),
"split": "test",
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, newline="") as f:
if self.config.name == "utilitarianism":
contents = csv.DictReader(f, fieldnames=["activity", "baseline"])
else:
contents = csv.DictReader(f)
# For subsets with grouped scenarios, tag them with an id.
group_id = 0
for key, row in enumerate(contents):
if self.config.name == "deontology":
# Scenarios come in groups of 4.
if key % 4 == 0 and key != 0:
group_id += 1
yield key, {
"group_id": group_id,
"label": row["label"],
"scenario": row["scenario"],
"excuse": row["excuse"],
}
elif self.config.name == "justice":
# Scenarios come in groups of 4.
if key % 4 == 0 and key != 0:
group_id += 1
yield key, {
"group_id": group_id,
"label": row["label"],
"scenario": row["scenario"],
}
elif self.config.name == "commonsense":
yield key, {
"label": row["label"],
"input": row["input"],
"is_short": row["is_short"],
"edited": row["edited"],
}
elif self.config.name == "virtue":
# Scenarios come in groups of 5.
if key % 5 == 0 and key != 0:
group_id += 1
scenario, trait = row["scenario"].split(" [SEP] ")
yield key, {
"group_id": group_id,
"label": row["label"],
"scenario": scenario,
"trait": trait,
}
elif self.config.name == "utilitarianism":
yield key, {
"activity": row["activity"],
"baseline": row["baseline"],
"rating": "",
}
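A minimal usage sketch for the ETHICS builder above, assuming the script is saved locally as hendrycks_ethics.py; "commonsense" is one of the BUILDER_CONFIGS defined in the class, and the other config names can be substituted the same way.

from datasets import load_dataset

# Load the commonsense subset's train split through the builder above.
ethics = load_dataset("./hendrycks_ethics.py", "commonsense", split="train")
# Commonsense rows expose the scenario text and a binary acceptability label.
print(ethics[0]["input"], ethics[0]["label"])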
{"algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 955021, "num_examples": 1744, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 648291, "num_examples": 1187, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1603312, "size_in_bytes": 21931248}, "counting_and_probability": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "counting_and_probability", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 667385, "num_examples": 771, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 353803, "num_examples": 474, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1021188, "size_in_bytes": 21349124}, "geometry": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "geometry", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1077241, "num_examples": 870, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 523126, "num_examples": 479, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1600367, "size_in_bytes": 21928303}, "intermediate_algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "intermediate_algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1157476, "num_examples": 1295, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 795070, "num_examples": 903, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1952546, "size_in_bytes": 22280482}, "number_theory": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "number_theory", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 595793, "num_examples": 869, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 349455, "num_examples": 540, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 945248, "size_in_bytes": 21273184}, "prealgebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "prealgebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 715611, "num_examples": 1205, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 510195, "num_examples": 871, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1225806, "size_in_bytes": 21553742}, "precalculus": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "precalculus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 816245, "num_examples": 746, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 552893, "num_examples": 546, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1369138, "size_in_bytes": 21697074}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MATH dataset."""
import json
import os
import pathlib
import datasets
_CITATION = """\
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
"""
_DESCRIPTION = """\
MATH is a dataset of 12,500 challenging competition mathematics problems. Each
problem in MATH has a full step-by-step solution which can be used to teach
models to generate answer derivations and explanations.
"""
_HOMEPAGE = "https://github.com/hendrycks/math"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
_NAMES = [
"algebra",
"counting_and_probability",
"geometry",
"intermediate_algebra",
"number_theory",
"prealgebra",
"precalculus",
]
class HendrycksMath(datasets.GeneratorBasedBuilder):
"""MATH is a dataset of 12,500 challenging competition mathematics problems."""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(name=name, version=version, description=name)
for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
]
def _info(self):
features = datasets.Features(
{
"problem": datasets.Value("string"),
"level": datasets.Value("string"),
"type": datasets.Value("string"),
"solution": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = _URLS
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"basepath": os.path.join(
data_dir, "MATH", "train", self.config.name
),
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"basepath": os.path.join(
data_dir, "MATH", "test", self.config.name
),
"split": "test",
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, basepath, split):
key = 0
for file in sorted(pathlib.Path(basepath).iterdir()):
with open(file, "r", encoding="utf-8") as f:
data = json.load(f)
yield key, {
"problem": data["problem"],
"level": data["level"],
"type": data["type"],
"solution": data["solution"],
}
key += 1
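A minimal usage sketch for the MATH builder above, assuming the script is saved locally as hendrycks_math.py; "algebra" is one of the seven subject configs listed in _NAMES.

from datasets import load_dataset

# Load the algebra test split; each record is one competition problem with its worked solution.
math_algebra = load_dataset("./hendrycks_math.py", "algebra", split="test")
print(math_algebra[0]["problem"])
print(math_algebra[0]["solution"])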
{"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LogiQA dataset."""
import datasets
_CITATION = """\
@misc{liu2020logiqa,
title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
year={2020},
eprint={2007.08124},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_DESCRIPTION = """\
LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA
instances, covering multiple types of deductive reasoning. Results show that
state-of-the-art neural models perform far worse than the human ceiling. The
dataset can also serve as a benchmark for reinvestigating logical AI under the
deep learning NLP setting.
"""
_HOMEPAGE = "https://github.com/lgw863/LogiQA-dataset"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = {
"train": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt",
"validation": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt",
"test": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt",
}
class Logiqa(datasets.GeneratorBasedBuilder):
"""LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning"""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="logiqa", version=VERSION, description="The LogiQA dataset."
),
]
def _info(self):
features = datasets.Features(
{
"label": datasets.Value("string"),
"context": datasets.Value("string"),
"question": datasets.Value("string"),
"options": datasets.features.Sequence(datasets.Value("string")),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = {
"train": _URLS["train"],
"test": _URLS["test"],
"validation": _URLS["validation"],
}
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir["train"],
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": data_dir["test"], "split": "test"},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir["validation"],
"split": "validation",
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
def normalize(text):
return text.replace(".", ". ").strip()
with open(filepath, encoding="utf-8") as f:
data = f.read().strip().split("\n\n")
for key, row in enumerate(data):
example = row.split("\n")
yield key, {
"label": example[0].strip(),
"context": normalize(example[1]),
"question": normalize(example[2]),
"options": [normalize(option[2:]) for option in example[3:]],
}
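A small self-contained sketch of the record format the parser above expects: blank-line-separated blocks whose lines are the answer label, the context, the question, and four options prefixed with "A." through "D." (which is what the option[2:] slice strips). The sample record below is invented for illustration; only the parsing mirrors the builder.

# A fabricated LogiQA-style record, formatted like one block of Train.txt.
record = (
    "a\n"
    "All metals conduct electricity.Copper is a metal.\n"
    "Which conclusion follows from the passage?\n"
    "A.Copper conducts electricity.\n"
    "B.Copper does not conduct electricity.\n"
    "C.Some conductors are not metals.\n"
    "D.None of the above."
)

def normalize(text):
    # Same normalization as the builder: pad every period with a trailing space.
    return text.replace(".", ". ").strip()

lines = record.split("\n")
example = {
    "label": lines[0].strip(),
    "context": normalize(lines[1]),
    "question": normalize(lines[2]),
    "options": [normalize(option[2:]) for option in lines[3:]],
}
print(example)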
{"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": 
"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MuTual dataset."""
import json
import os
from pathlib import Path
import datasets
_CITATION = """\
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
"""
_DESCRIPTION = """\
MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is
modified from Chinese high school English listening comprehension test data.
"""
_HOMEPAGE = "https://github.com/Nealcly/MuTual"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = "https://github.com/Nealcly/MuTual/archive/master.zip"
class Mutual(datasets.GeneratorBasedBuilder):
"""MuTual: A Dataset for Multi-Turn Dialogue Reasoning"""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="mutual", version=VERSION, description="The MuTual dataset."
),
datasets.BuilderConfig(
name="mutual_plus",
version=VERSION,
description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.",
),
]
def _info(self):
features = datasets.Features(
{
"answers": datasets.Value("string"),
"options": datasets.features.Sequence(datasets.Value("string")),
"article": datasets.Value("string"),
"id": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=f"{_DESCRIPTION}\n{self.config.description}",
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = _URLS
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"basepath": os.path.join(
data_dir, "MuTual-master", "data", self.config.name, "train"
),
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"basepath": os.path.join(
data_dir, "MuTual-master", "data", self.config.name, "test"
),
"split": "test",
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"basepath": os.path.join(
data_dir, "MuTual-master", "data", self.config.name, "dev"
),
"split": "dev",
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, basepath, split):
# TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
# The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example.
key = 0
for file in sorted(Path(basepath).iterdir()):
if file.suffix != ".txt":
continue
with open(file, "r", encoding="utf-8") as f:
data_str = f.read()
# Ignore the occasional empty file.
if not data_str:
continue
data = json.loads(data_str)
yield key, {
"answers": data["answers"],
"options": data["options"],
"article": data["article"],
"id": data["id"],
}
key += 1
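A minimal usage sketch for the MuTual builder above, assuming the script is saved locally as mutual.py; the "mutual_plus" config name can be swapped in for the harder variant.

from datasets import load_dataset

# Load the dev split of the standard MuTual config through the builder above.
mutual = load_dataset("./mutual.py", "mutual", split="validation")
# Each dialogue comes with four candidate responses and the correct answer key ("A"-"D").
print(mutual[0]["article"])
print(mutual[0]["options"], mutual[0]["answers"])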
{"pile_arxiv": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_arxiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 113218251, "num_examples": 2407, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 115653720, "num_examples": 2434, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 228871971, "size_in_bytes": 1160030307}, "pile_books3": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_books3", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 150095743, "num_examples": 269, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 177359876, "num_examples": 301, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 327455619, "size_in_bytes": 1258613955}, "pile_bookcorpus2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_bookcorpus2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 9680652, "num_examples": 28, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9776271, "num_examples": 26, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 19456923, "size_in_bytes": 950615259}, "pile_dm-mathematics": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_dm-mathematics", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 15756556, "num_examples": 1922, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 16453386, "num_examples": 2007, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 32209942, "size_in_bytes": 963368278}, "pile_enron": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_enron", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 1638859, "num_examples": 1010, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 1556487, "num_examples": 947, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 3195346, "size_in_bytes": 934353682}, "pile_europarl": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_europarl", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8789652, "num_examples": 157, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9111791, "num_examples": 133, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17901443, "size_in_bytes": 949059779}, "pile_freelaw": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_freelaw", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 80808693, "num_examples": 5101, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 80363814, "num_examples": 5094, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 161172507, "size_in_bytes": 1092330843}, "pile_github": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_github", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 95654706, "num_examples": 18195, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 97179576, "num_examples": 18337, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 192834282, "size_in_bytes": 1123992618}, "pile_gutenberg": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_gutenberg", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 30243176, "num_examples": 80, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 24685980, "num_examples": 60, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 54929156, "size_in_bytes": 986087492}, "pile_hackernews": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_hackernews", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8124255, "num_examples": 1632, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9803822, "num_examples": 1619, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17928077, "size_in_bytes": 949086413}, "pile_nih-exporter": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_nih-exporter", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 3928804, "num_examples": 1884, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 3927967, "num_examples": 1825, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 7856771, "size_in_bytes": 939015107}, "pile_opensubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_opensubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 21008996, "num_examples": 642, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 19622904, "num_examples": 621, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 40631900, "size_in_bytes": 971790236}, "pile_openwebtext2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_openwebtext2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 128624303, "num_examples": 32925, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 131554302, "num_examples": 33400, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 260178605, "size_in_bytes": 1191336941}, "pile_philpapers": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_philpapers", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5090158, "num_examples": 68, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 6499078, "num_examples": 64, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 11589236, "size_in_bytes": 942747572}, "pile_pile-cc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pile-cc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 235004043, "num_examples": 52790, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 233535650, "num_examples": 52792, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 468539693, "size_in_bytes": 1399698029}, "pile_pubmed-abstracts": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-abstracts", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 39908950, "num_examples": 29895, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 40008336, "num_examples": 29871, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 79917286, "size_in_bytes": 1011075622}, "pile_pubmed-central": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-central", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 187251519, "num_examples": 5911, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 184791818, "num_examples": 5977, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 372043337, "size_in_bytes": 1303201673}, "pile_stackexchange": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_stackexchange", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 66441557, "num_examples": 30378, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 66011397, "num_examples": 29950, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 132452954, "size_in_bytes": 1063611290}, "pile_upsto": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_upsto", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 47345405, "num_examples": 11415, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 48122320, "num_examples": 11387, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 95467725, "size_in_bytes": 1026626061}, "pile_ubuntu-irc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_ubuntu-irc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5694218, "num_examples": 22, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 7410104, "num_examples": 21, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 13104322, "size_in_bytes": 944262658}, "pile_wikipedia": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_wikipedia", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 52166968, "num_examples": 17511, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 53186137, "num_examples": 17478, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 105353105, "size_in_bytes": 1036511441}, "pile_youtubesubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_youtubesubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 7377448, "num_examples": 342, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 8937546, "num_examples": 326, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 16314994, "size_in_bytes": 947473330}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pile dataset."""
import json
import datasets
_CITATION = """\
@article{pile,
title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
journal={arXiv preprint arXiv:2101.00027},
year={2020}
}
"""
_DESCRIPTION = """\
The Pile is a 825 GiB diverse, open source language modeling data set that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.
"""
_HOMEPAGE = "https://pile.eleuther.ai/"
# TODO: Add the license for the dataset here if you can find it
_LICENSE = ""
_URLS = {
"validation": "https://the-eye.eu/public/AI/pile/val.jsonl.zst",
"test": "https://the-eye.eu/public/AI/pile/test.jsonl.zst",
}
_NAMES = {
"pile_arxiv": "ArXiv",
"pile_books3": "Books3",
"pile_bookcorpus2": "BookCorpus2",
"pile_dm-mathematics": "DM Mathematics",
"pile_enron": "Enron Emails",
"pile_europarl": "EuroParl",
"pile_freelaw": "FreeLaw",
"pile_github": "Github",
"pile_gutenberg": "Gutenberg (PG-19)",
"pile_hackernews": "HackerNews",
"pile_nih-exporter": "NIH ExPorter",
"pile_opensubtitles": "OpenSubtitles",
"pile_openwebtext2": "OpenWebText2",
"pile_philpapers": "PhilPapers",
"pile_pile-cc": "Pile-CC",
"pile_pubmed-abstracts": "PubMed Abstracts",
"pile_pubmed-central": "PubMed Central",
"pile_stackexchange": "StackExchange",
"pile_upsto": "USPTO Backgrounds",
"pile_ubuntu-irc": "Ubuntu IRC",
"pile_wikipedia": "Wikipedia (en)",
"pile_youtubesubtitles": "YoutubeSubtitles",
}
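# Each key above is a builder config name; each value is the exact "pile_set_name"
# string stored in the per-document metadata of the downloaded JSON-lines shards.
# All configs share the same validation/test files and are distinguished only by
# the metadata filter applied in `_generate_examples` below.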
class Pile(datasets.GeneratorBasedBuilder):
"""The Pile is a 825 GiB diverse, open source language modeling dataset."""
VERSION = datasets.Version("0.0.1")
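    # In the comprehension below, VERSION is passed in through the zipped iterable
    # (which is evaluated in the class scope) because names defined in a class body
    # are not visible inside the body of a comprehension.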
BUILDER_CONFIGS = [
datasets.BuilderConfig(name=name, version=version, description=_NAMES[name])
for name, version in zip(_NAMES.keys(), [VERSION] * len(_NAMES))
]
def _info(self):
features = datasets.Features(
{
"text": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=f"{_DESCRIPTION}\n{self.config.description}",
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = {"validation": _URLS["validation"], "test": _URLS["test"]}
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": data_dir["test"], "split": "test"},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir["validation"],
"split": "validation",
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
for key, row in enumerate(f):
data = json.loads(row)
if data["meta"]["pile_set_name"] == _NAMES[self.config.name]:
yield key, {
"text": data["text"],
}
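# Illustrative usage only (not part of the loading logic), assuming this script is
# saved locally as pile.py; the config name and split are examples. Note that the
# loader downloads both compressed shards (~0.9 GB total) before filtering, and
# decompressing the .zst files typically requires the `zstandard` package.
if __name__ == "__main__":
    enron_test = datasets.load_dataset("./pile.py", "pile_enron", split="test")
    print(enron_test)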
{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}}