Commit 6e092c81 authored by Rayyyyy's avatar Rayyyyy
Browse files

Add hellaswag datasets

parent a6b358ca
...@@ -21,6 +21,7 @@ from typing import ( ...@@ -21,6 +21,7 @@ from typing import (
) )
import datasets import datasets
import inspect
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
...@@ -707,7 +708,7 @@ class ConfigurableTask(Task): ...@@ -707,7 +708,7 @@ class ConfigurableTask(Task):
self.OUTPUT_TYPE = self.config.output_type self.OUTPUT_TYPE = self.config.output_type
if self.config.dataset_path is not None: if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path self.DATASET_PATH = inspect.getfile(self.config.dataset_path)
if self.config.dataset_name is not None: if self.config.dataset_name is not None:
self.DATASET_NAME = self.config.dataset_name self.DATASET_NAME = self.config.dataset_name
......
{"default": {"description": "\nHellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.\n", "citation": "@inproceedings{zellers2019hellaswag,\n title={HellaSwag: Can a Machine Really Finish Your Sentence?},\n author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},\n booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},\n year={2019}\n}\n", "homepage": "https://rowanzellers.com/hellaswag/", "license": "", "features": {"ind": {"dtype": "int32", "id": null, "_type": "Value"}, "activity_label": {"dtype": "string", "id": null, "_type": "Value"}, "ctx_a": {"dtype": "string", "id": null, "_type": "Value"}, "ctx_b": {"dtype": "string", "id": null, "_type": "Value"}, "ctx": {"dtype": "string", "id": null, "_type": "Value"}, "endings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_id": {"dtype": "string", "id": null, "_type": "Value"}, "split": {"dtype": "string", "id": null, "_type": "Value"}, "split_type": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hellaswag", "config_name": "default", "version": {"version_str": "0.1.0", "description": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 43232624, "num_examples": 39905, "dataset_name": "hellaswag"}, "test": {"name": "test", "num_bytes": 10791853, "num_examples": 10003, "dataset_name": "hellaswag"}, "validation": {"name": "validation", "num_bytes": 11175717, "num_examples": 10042, "dataset_name": "hellaswag"}}, "download_checksums": {"https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl": {"num_bytes": 47496131, "checksum": 
"dae5e69249868cb9fe4e23ff925c60b66169564cfb7072d793cd7356a2b69f8d"}, "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_test.jsonl": {"num_bytes": 11752147, "checksum": "da082b00543e422b8d25394614d102944586986def4de5cd1bd36d86bcb76261"}, "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl": {"num_bytes": 12246618, "checksum": "0aa3b88843990f3f10a97b9575c94d7b71fb2205240ba04ae4884d9e9c992588"}}, "download_size": 71494896, "post_processing_size": null, "dataset_size": 65200194, "size_in_bytes": 136695090}}
\ No newline at end of file
"""TODO(hellaswag): Add a description here."""
import json
import datasets
# BibTeX entry for the HellaSwag paper (ACL 2019).
_CITATION = """\
@inproceedings{zellers2019hellaswag,
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
year={2019}
}
"""

# Short human-readable summary of the dataset.
_DESCRIPTION = """
HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.
"""

# Common URL prefix under which the raw JSON Lines files are hosted.
_URL = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/"

# Split key -> download URL. The "dev" key maps to the upstream "*_val" file.
_URLS = {
    split: f"{_URL}hellaswag_{stem}.jsonl"
    for split, stem in (("train", "train"), ("test", "test"), ("dev", "val"))
}
class Hellaswag(datasets.GeneratorBasedBuilder):
    """Dataset builder for HellaSwag, a commonsense NLI dataset.

    Downloads the JSON Lines files listed in ``_URLS`` and exposes them as the
    train, test and validation splits. Each row becomes a flat record holding
    the context strings, the candidate endings and the label.
    """

    VERSION = datasets.Version("0.1.0")

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing schema and metadata."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "ind": datasets.Value("int32"),
                    "activity_label": datasets.Value("string"),
                    "ctx_a": datasets.Value("string"),
                    "ctx_b": datasets.Value("string"),
                    "ctx": datasets.Value("string"),
                    "endings": datasets.features.Sequence(datasets.Value("string")),
                    "source_id": datasets.Value("string"),
                    "split": datasets.Value("string"),
                    "split_type": datasets.Value("string"),
                    # Kept as a string; rows without a label get "".
                    "label": datasets.Value("string"),
                }
            ),
            # No (input, target) pair is declared for as_supervised=True.
            supervised_keys=None,
            homepage="https://rowanzellers.com/hellaswag/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the data files and return one SplitGenerator per split.

        Args:
            dl_manager: download manager whose ``download_and_extract`` is
                called with the ``_URLS`` mapping; the result is indexed by
                the same keys ("train", "test", "dev").

        Returns:
            A list of ``datasets.SplitGenerator`` for TRAIN, TEST and
            VALIDATION, each forwarding its local ``filepath`` to
            ``_generate_examples``.
        """
        dl_dir = dl_manager.download_and_extract(_URLS)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": dl_dir["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": dl_dir["test"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # The validation split is stored under the "dev" key of _URLS.
                gen_kwargs={"filepath": dl_dir["dev"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from a JSON Lines file.

        Args:
            filepath: path of a UTF-8 file containing one JSON object per line.

        Yields:
            Tuples of the zero-based line index and a dict matching the
            features declared in ``_info``. Optional fields fall back to an
            empty string (``ctx_a``, ``ctx_b``, ``label``) or an empty list
            (``endings``).

        Raises:
            KeyError: if a row lacks one of the required fields.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        with open(filepath, encoding="utf-8") as f:
            for id_, row in enumerate(f):
                # Blank lines (e.g. extra trailing newlines) carry no record;
                # skip them instead of failing in json.loads. Keys stay tied
                # to the physical line index, so existing rows keep their ids.
                if not row.strip():
                    continue
                data = json.loads(row)
                yield id_, {
                    "ind": int(data["ind"]),
                    "activity_label": data["activity_label"],
                    "ctx_a": data.get("ctx_a", ""),
                    "ctx_b": data.get("ctx_b", ""),
                    "ctx": data["ctx"],
                    "endings": data.get("endings", []),
                    "source_id": data["source_id"],
                    "split": data["split"],
                    "split_type": data["split_type"],
                    # Label may be absent (presumably for unlabeled rows such
                    # as the test split -- confirm); always store a string.
                    "label": str(data.get("label", "")),
                }
group: group:
- multiple_choice - multiple_choice
task: hellaswag task: hellaswag
dataset_path: hellaswag dataset_path: lm_eval.datasets.hellaswag.hellaswag
dataset_name: null dataset_name: hellaswag
output_type: multiple_choice output_type: multiple_choice
training_split: train training_split: train
validation_split: validation validation_split: validation
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment