Commit d2a9b759 authored by haileyschoelkopf's avatar haileyschoelkopf
Browse files

in-place replace main with lm-eval2, keeping old git history

parent 814940e8
# Task configuration for the `cb` subset of the `super_glue` dataset.
# NOTE(review): the meaning of each key is inferred from its name — confirm against the task-config loader.
dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
# Jinja-style prompt template; `premise` and `hypothesis` are presumably columns of each dataset row.
doc_to_text: "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe?"
# Renders the answer string selected by the integer `label` (0 -> Yes, 1 -> No, 2 -> Maybe).
doc_to_target: "{% set answer_choices = ['Yes', 'No', 'Maybe'] %}{{answer_choices[label]}}"
# Each entry looks like [metric, aggregation, higher_is_better] — TODO confirm the expected order.
metric_list: [
[exact_match, mean, true]
]
# Response filters, currently disabled.
# filters: [
# ["none", ["take_first"]]
# ]
from . import metrics
# Maps a metric's name (string) to the function in `metrics` that implements it.
METRIC_REGISTRY = {
    "matthews_corrcoef": metrics.matthews_corrcoef,
    "f1_score": metrics.f1_score,
    "perplexity": metrics.perplexity,
    "bleu": metrics.bleu,
    "chrf": metrics.chrf,
    "ter": metrics.ter,
}

# Maps an aggregation's name (string) to the function in `metrics` that implements it.
AGGREGATION_REGISTRY = {
    "mean": metrics.mean,
    "median": metrics.median,
}
\ No newline at end of file
from dataclasses import dataclass
from typing import List
from lm_eval.api.instance import Instance
class Filter:
    """
    Base class for post-processing steps that work at the level of a whole task.

    A filter is handed the model outputs of every instance of a task at once
    (`instance.resps` for all `task.instances`) and returns the processed outputs.
    Any number of separate filters, or lists of filters, can be configured in a single run.
    """

    def __init__(self):
        """
        Hook for subclasses whose individual instantiations need state; the base class keeps none.
        """

    def apply(self, resps):
        """
        Process a list of the `inst.resps` properties of `Instance` objects.

        The base implementation is the identity and hands `resps` back untouched.
        Overrides should return one (filtered) response list per instance,
        *in the same order as they were input*: given
        [<inst.resps for instance 0>, <inst.resps for instance 1>] return
        [<filtered resps for instance 0>, <filtered resps for instance 1>].
        """
        filtered = resps
        return filtered
@dataclass
class FilterEnsemble:
    """
    A named pipeline that runs several filters one after another.

    It is meant for stacking multiple post-processing steps in a fixed order.
    `task.apply_filters` should keep a list of FilterEnsemble objects and apply
    each pipeline separately.
    """

    name: str
    filters: List[Filter]

    def apply(self, instances: List[Instance]):
        """
        Run every filter in order over the responses of `instances` and store
        the final result on each instance under `filtered_resps[self.name]`.
        """
        # the filters only ever see the model responses, not the instances themselves
        current = [instance.resps for instance in instances]

        # feed each filter the output of the previous one
        # TODO: handle the case where a filter returns multiple "buckets"
        for step in self.filters:
            current = step.apply(current)

        # attach the end result to its source instance, keyed by `self.name`:
        # each FilterEnsemble applied in a given run should use a different name.
        for instance, filtered in zip(instances, current):
            instance.filtered_resps[self.name] = filtered
from dataclasses import dataclass, field
@dataclass
class Instance:
    """
    One request to a language model, bundled with its bookkeeping data.

    Fields:
        request_type: name of the request kind; subclasses set it to values
            such as "loglikelihood" or "greedy_until".
        doc: the source document the request was built from.
        arguments: the request arguments (see the `args` property).
        id_: index of this request within its document's requests.
        metadata: optional `(task_name, doc_id, repeats)` triple; when given it
            overwrites the three fields of the same names.
        resps: model responses collected for this request.
        filtered_resps: post-processed responses, keyed by filter-pipeline name.
        task_name, doc_id, repeats: normally populated from `metadata`.
    """

    request_type: str = None  # TODO: make this an enum?
    doc: dict = None
    arguments: tuple = None
    id_: int = None
    metadata: tuple = None  # TODO: better typehints here
    resps: list = field(default_factory=list)
    filtered_resps: dict = field(default_factory=dict)
    # The three fields below are overwritten from `metadata` in __post_init__
    # whenever `metadata` is provided.
    # NOTE(review): `doc_id` / `repeats` look like integers (doc index, repeat
    # count) despite the `str` annotations — confirm before tightening the types.
    task_name: str = None
    doc_id: str = None
    repeats: str = None

    def __post_init__(self):
        # `metadata` defaults to None; unpacking None would raise TypeError and
        # make the all-defaults constructor unusable, so only unpack when set.
        if self.metadata is not None:
            self.task_name, self.doc_id, self.repeats = self.metadata

    @property
    def args(self):
        """
        Returns the request arguments as a tuple; a lone non-tuple value
        (e.g. the string to calculate loglikelihood over) is wrapped as `(value,)`.
        """
        if isinstance(self.arguments, tuple):
            return self.arguments
        return (self.arguments,)
# import abc
# class Instance(abc.ABC):
# """
# A class used to bind together all necessary information and metadata for
# running forward pass of a model on a specific datapoint.
# """
# # all Instance subclasses have an attribute which is the name of the LM() class function they call to get outputs.
# request_type = None
# def __init__(self, doc, arguments=None, id_=None, metadata=("", None, None)):
# self.doc = doc # store the document which we're using. this is a dict
# self.arguments = arguments
# # need: task name, doc idx, num. repeats
# self.task_name, self.doc_id, self.repeats = metadata
# # id_ = idx within a doc's requests
# self.id_ = id_
# # handle repeats internally. should be able to run K times on exact same input/output pair
# # self.repeats = repeats
# # list containing the returns from each call of the model on this particular set of arguments.
# self.resps = []
# # filtered_resps should end up a dict, with a different key for each set of filters to apply. calculate results against each key in filtered_resps
# self.filtered_resps = {}
# #TODO: add more info as needed for detailed logging
# def __repr__(self):
# return f"Req_{self.request_type}{self.args}{self.id_}"
@dataclass
class LoglikelihoodInstance(Instance):
    """An `Instance` whose `request_type` defaults to "loglikelihood"."""

    request_type: str = "loglikelihood"
@dataclass
class RollingLoglikelihoodInstance(Instance):
    """An `Instance` whose `request_type` defaults to "loglikelihood_rolling"."""

    request_type: str = "loglikelihood_rolling"
@dataclass
class GenerationInstance(Instance):
    """An `Instance` whose `request_type` defaults to "greedy_until"."""

    request_type: str = "greedy_until"
import abc
from lm_eval import utils
class LM(abc.ABC):
    """Abstract interface that all LM subclasses should implement.

    LMs are assumed to take text (strings) as input and yield strings as
    output (inputs/outputs should be tokenization-agnostic).
    """

    def __init__(self):
        """Base initializer; it sets up no state of its own."""

    @abc.abstractmethod
    def loglikelihood(self, requests):
        """Compute the log-likelihood of generating a continuation from a context.

        Downstream tasks should attempt to use loglikelihood instead of other
        LM calls whenever possible.

        :param requests: list
            A list of pairs (context, continuation)
            context: str
                Context string. Implementations of LM must be able to handle
                an empty context string.
            continuation: str
                The continuation over which log likelihood will be calculated.
                If there is a word boundary, the space should be in the
                continuation. For example, context="hello" continuation=" world"
                is correct.
        :return: list
            A list of pairs (logprob, isgreedy)
            logprob: float
                The log probability of `continuation`
            isgreedy:
                Whether `continuation` would be generated by greedy sampling
                from `context`
        """

    @abc.abstractmethod
    def loglikelihood_rolling(self, requests):
        """Compute the full log-likelihood of a string, with no truncation, for perplexity computation.

        - The full max context length of the model is used.
        - Inputs that exceed the max context length are tokenized and divided
          into chunks of up to the max context length.
        - IMPORTANT: Each document's loglikelihood/perplexity is computed
          *separately*, unlike other implementations which may simply
          concatenate multiple documents together.
        - IMPORTANT: The amount of context for each prediction is maximized.
          Specifically, for inputs that are broken into multiple chunks, the
          last input will still be a full-sized context.

        Example:
            Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
            Prefix: EOT
            Max context length: 4
            Resulting input/prediction pairs:

                INPUT:  EOT   0   1   2
                PRED:     0   1   2   3

                INPUT:    3   4   5   6
                PRED:     4   5   6   7

                INPUT:    5   6   7   8
                PRED:             8   9

            Observe that:
              1. Each token is predicted exactly once
              2. For the last pair, we provide the full context, but only
                 score the last two tokens

        :param requests: list
            A list of strings
            string: str
                String for which we are computing per-token loglikelihood
        :return: list
            A list of pairs (logprob, isgreedy)
            logprob: float
                The log probability of `continuation`
            isgreedy:
                Whether `continuation` would be generated by greedy sampling
                from `context`
            NOTE(review): this return description mirrors `loglikelihood`,
            although the requests here are plain strings rather than
            (context, continuation) pairs — confirm the intended return shape.
        """

    # TODO: Add an optional max length
    @abc.abstractmethod
    def greedy_until(self, requests):
        """Generate greedily until a stopping sequence is produced.

        :param requests: list
            A list of pairs (context, until)
            context: str
                Context string
            until: [str]
                The string sequences to generate until. These string sequences
                may each span across multiple tokens, or may be part of one
                token.
        :return: list
            A list of strings continuation
            continuation: str
                The generated continuation.
        """

    @classmethod
    def create_from_arg_string(cls, arg_string, additional_config=None):
        """Alternate constructor driven by an argument string.

        `arg_string` is parsed by `utils.simple_parse_args_string` into keyword
        arguments. Entries of `additional_config` whose value is not None are
        passed to the constructor as further keyword arguments; a key supplied
        by both sources makes the constructor call raise TypeError.
        """
        if additional_config is None:
            additional_config = {}
        parsed_kwargs = utils.simple_parse_args_string(arg_string)
        extra_kwargs = {
            key: value for key, value in additional_config.items() if value is not None
        }
        return cls(**parsed_kwargs, **extra_kwargs)
class Sampler:  # TODO: make this abstract class?
    """
    Draws few-shot examples for a task and renders them into a context string.
    """

    def __init__(self, docs, task, fewshot_indices=None, rnd=None):
        """
        Store the task, its config and delimiter, the random source `rnd`
        (required), and the few-shot docs, optionally restricted to
        `fewshot_indices` via `docs.select(...)`.
        """
        self.rnd = rnd
        assert self.rnd, "must pass rnd to FewShotSampler!"

        self.task = task
        self.config = task._config
        self.delimiter = self.config.delimiter

        # HF dataset split, provided by task._fewshot_docs();
        # optionally subset the few-shot docs to the given indices
        self.docs = docs.select(fewshot_indices) if fewshot_indices else docs

    def get_context(self, doc, num_fewshot):
        """
        Build the few-shot context for `doc`: up to `num_fewshot` sampled
        examples, each rendered as text + target, joined by the delimiter and
        followed by one trailing delimiter.
        """
        # draw one extra sample when the few-shot split is also the test split,
        # so enough examples remain after the evaluated doc is filtered out below
        draw_count = num_fewshot
        if self.config.fewshot_split == self.config.test_split:
            draw_count += 1
        candidates = self.sample(draw_count)

        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
        kept = []
        for candidate in candidates:
            if candidate != doc:
                kept.append(candidate)
        kept = kept[:num_fewshot]

        rendered = [
            self.task.doc_to_text(example) + self.task.doc_to_target(example)
            for example in kept
        ]

        # only returns the fewshot context! Does not append the document, do this outside the object
        return self.delimiter.join(rendered) + self.delimiter

    def sample(self, n):
        """
        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
        """
        population = self.docs
        return self.rnd.sample(population, n)
class BalancedSampler(Sampler):
    """Placeholder for a sampler that draws class-balanced few-shot examples."""

    def sample(self, n):
        """
        TODO: this should return approximately class-balanced samples from our fewshot examples.
        TODO: what order should they be in?

        Raises:
            NotImplementedError: always, until class-balanced sampling is implemented.
        """
        # Fail loudly: silently returning None would only surface later as an
        # unrelated TypeError when the inherited `get_context` iterates the samples.
        raise NotImplementedError(
            "BalancedSampler.sample is not implemented yet"
        )
class ManualSampler(Sampler):
    """Placeholder sampler (presumably for hand-specified few-shot examples — TODO confirm)."""

    def sample(self, n):
        """
        Not implemented yet.

        Raises:
            NotImplementedError: always, until this sampler is implemented.
        """
        # Fail loudly: silently returning None would only surface later as an
        # unrelated TypeError when the inherited `get_context` iterates the samples.
        raise NotImplementedError(
            "ManualSampler.sample is not implemented yet"
        )
# TODO: how should we design this? It might be better to have a single sampler and pass more kwargs at init.
# It depends on what is easier for new users to extend with their own functionality.
# types of sampler:
# - class-balanced, randomly shuffled
# - class-balanced, one particular set of fewshot examples for all evaled instances
# - hand-specify number of fewshot examples per class?
# - random, varies per example (check that this is curr. default in old repo)
# - random, unified per example
# - enforce a specific fixed fewshot string! (or should we not use this, in favor of including it in prompt template directly)
# - user-specified doc indices to restrict fewshot doc options to
# - user specifies split to use for drawing fewshot instances (TODO: manually prevent this from being same split you eval!)
# - user specifies a prepended "description"/string to add in front of the (prompted) input
# - user specifies a location to draw fewshot samples from? DO THIS IN TASK CLASS
This diff is collapsed.
# datasets
This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not.
__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path.
__WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the HuggingFace Hub or a HuggingFace organization repo. We will remove such scripts once they have been pushed.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ASDIV dataset."""
import os
import xml.etree.ElementTree as ET
import datasets
# BibTeX entry for the ASDiv paper; passed to `datasets.DatasetInfo` as `citation`.
_CITATION = """\
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
"""

# Human-readable summary of the dataset; passed to `datasets.DatasetInfo` as `description`.
_DESCRIPTION = """\
ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
patterns and problem types) English math word problem (MWP) corpus for evaluating
the capability of various MWP solvers. Existing MWP corpora for studying AI progress
remain limited either in language usage patterns or in problem types. We thus present
a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
types taught in elementary school. Each MWP is annotated with its problem type and grade
level (for indicating the level of difficulty).
"""

# Upstream repository of the dataset.
_HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# Despite the plural name this is a single URL: a zip archive of the upstream
# repository pinned to one commit hash.
_URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
class ASDiv(datasets.GeneratorBasedBuilder):
    """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers"""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="asdiv",
            version=VERSION,
            description="A diverse corpus for evaluating and developing english math word problem solvers",
        )
    ]

    def _info(self):
        """Describe the dataset: five string columns plus the module-level metadata."""
        string_columns = ("body", "question", "solution_type", "answer", "formula")
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {column: datasets.Value("string") for column in string_columns}
            ),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive; expose its XML file as the validation split."""
        extracted_root = dl_manager.download_and_extract(_URLS)
        xml_path = os.path.join(
            extracted_root,
            "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50",
            "dataset",
            "ASDiv.xml",
        )
        validation = datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "filepath": xml_path,
                "split": datasets.Split.VALIDATION,
            },
        )
        return [validation]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield `(index, example)` for every `Problem` element of the XML file.

        `split` is accepted because it is part of `gen_kwargs`; it is not used here.
        """
        # (output column, XML child tag) pairs, in output order
        column_tags = (
            ("body", "Body"),
            ("question", "Question"),
            ("solution_type", "Solution-Type"),
            ("answer", "Answer"),
            ("formula", "Formula"),
        )
        problems = ET.parse(filepath).getroot().iter("Problem")
        for index, node in enumerate(problems):
            yield index, {column: node.find(tag).text for column, tag in column_tags}
{"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CoQA dataset.
This `CoQA` adds the "additional_answers" feature that's missing in the original
datasets version:
https://github.com/huggingface/datasets/blob/master/datasets/coqa/coqa.py
"""
import json
import datasets
# BibTeX entry for the CoQA paper; passed to `datasets.DatasetInfo` as `citation`.
_CITATION = """\
@misc{reddy2018coqa,
title={CoQA: A Conversational Question Answering Challenge},
author={Siva Reddy and Danqi Chen and Christopher D. Manning},
year={2018},
eprint={1808.07042},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

# Human-readable summary of the dataset; passed to `datasets.DatasetInfo` as `description`.
_DESCRIPTION = """\
CoQA is a large-scale dataset for building Conversational Question Answering
systems. The goal of the CoQA challenge is to measure the ability of machines to
understand a text passage and answer a series of interconnected questions that
appear in a conversation.
"""

_HOMEPAGE = "https://stanfordnlp.github.io/coqa/"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# Download URL of the JSON file for each split.
_URLS = {
    "train": "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json",
    "validation": "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json",
}
# The train set has no `additional_answers`, so train rows get this placeholder
# instead: one dummy answer under each of the keys "0", "1" and "2", shaped
# like a real answer entry.
_EMPTY_ADDITIONAL_ANSWER = {
    annotator: [
        {
            "span_start": -1,
            "span_end": -1,
            "span_text": "",
            "input_text": "",
            "turn_id": -1,
        }
    ]
    for annotator in ("0", "1", "2")
}
class Coqa(datasets.GeneratorBasedBuilder):
    """CoQA is a large-scale dataset for building Conversational Question Answering systems."""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="coqa", version=VERSION, description="The CoQA dataset."
        ),
    ]

    def _info(self):
        """Describe the dataset schema and attach the module-level metadata."""

        def answer_sequence():
            # every call builds fresh feature objects, as the spelled-out schema would
            return datasets.features.Sequence(
                {
                    "span_start": datasets.Value("int32"),
                    "span_end": datasets.Value("int32"),
                    "span_text": datasets.Value("string"),
                    "input_text": datasets.Value("string"),
                    "turn_id": datasets.Value("int32"),
                }
            )

        schema = {
            "id": datasets.Value("string"),
            "source": datasets.Value("string"),
            "story": datasets.Value("string"),
            "questions": datasets.features.Sequence(
                {
                    "input_text": datasets.Value("string"),
                    "turn_id": datasets.Value("int32"),
                }
            ),
            "answers": answer_sequence(),
            "additional_answers": {
                "0": answer_sequence(),
                "1": answer_sequence(),
                "2": answer_sequence(),
            },
        }
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(schema),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download both JSON files and declare the train and validation splits."""
        sources = {"train": _URLS["train"], "validation": _URLS["validation"]}
        local_paths = dl_manager.download_and_extract(sources)

        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "filepath": local_paths["train"],
                "split": datasets.Split.TRAIN,
            },
        )
        validation_split = datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "filepath": local_paths["validation"],
                "split": datasets.Split.VALIDATION,
            },
        )
        return [train_split, validation_split]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield `(row id, example)` for every entry under the file's "data" key.

        Train rows receive `_EMPTY_ADDITIONAL_ANSWER` as their
        `additional_answers`; other splits copy the answers stored under the
        keys "0", "1" and "2" of the row's `additional_answers`.
        """
        answer_keys = ("span_start", "span_end", "span_text", "input_text", "turn_id")

        def pick_answers(raw_answers):
            # keep exactly the schema's answer keys, in schema order
            return [{key: entry[key] for key in answer_keys} for entry in raw_answers]

        is_train = split == datasets.Split.TRAIN
        with open(filepath, encoding="utf-8") as handle:
            payload = json.load(handle)
            for row in payload["data"]:
                row_id = row["id"]
                source = row["source"]
                story = row["story"]
                questions = [
                    {"input_text": turn["input_text"], "turn_id": turn["turn_id"]}
                    for turn in row["questions"]
                ]
                answers = pick_answers(row["answers"])
                if is_train:
                    additional_answers = _EMPTY_ADDITIONAL_ANSWER
                else:
                    additional_answers = {
                        annotator: pick_answers(row["additional_answers"][annotator])
                        for annotator in ("0", "1", "2")
                    }
                yield row["id"], {
                    "id": row_id,
                    "story": story,
                    "source": source,
                    "questions": questions,
                    "answers": answers,
                    "additional_answers": additional_answers,
                }
{"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, 
"_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}}
{"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, 
"hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
# even if there are multiple types of answers for the same question.
"""DROP dataset."""
import json
import os
import datasets
# BibTeX entry for the DROP paper; passed to `datasets.DatasetInfo` as `citation`.
_CITATION = """\
@misc{dua2019drop,
title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
year={2019},
eprint={1903.00161},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

# Human-readable summary of the dataset; passed to `datasets.DatasetInfo` as `description`.
_DESCRIPTION = """\
DROP is a QA dataset which tests comprehensive understanding of paragraphs. In
this crowdsourced, adversarially-created, 96k question-answering benchmark, a
system must resolve multiple references in a question, map them onto a paragraph,
and perform discrete operations over them (such as addition, counting, or sorting).
"""

_HOMEPAGE = "https://allenai.org/data/drop"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# Download URL of the dataset archive, keyed by builder config name.
_URLS = {
    "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip",
}

# Placeholder used as `validated_answers` for question-answer pairs that carry
# none: a single dummy answer with empty fields.
_EMPTY_VALIDATED_ANSWER = [
    {
        "number": "",
        "date": {
            "day": "",
            "month": "",
            "year": "",
        },
        "spans": [],
        "worker_id": "",
        "hit_id": "",
    }
]
class Drop(datasets.GeneratorBasedBuilder):
    """DROP is a QA dataset which tests comprehensive understanding of paragraphs."""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="drop", version=VERSION, description="The DROP dataset."
        ),
    ]

    def _info(self):
        """Describe the dataset schema and attach the module-level metadata."""

        def answer_schema():
            # `answer` and each entry of `validated_answers` share this shape;
            # build fresh feature objects for every use.
            return {
                "number": datasets.Value("string"),
                "date": {
                    "day": datasets.Value("string"),
                    "month": datasets.Value("string"),
                    "year": datasets.Value("string"),
                },
                "spans": datasets.features.Sequence(datasets.Value("string")),
                "worker_id": datasets.Value("string"),
                "hit_id": datasets.Value("string"),
            }

        features = datasets.Features(
            {
                "section_id": datasets.Value("string"),
                "passage": datasets.Value("string"),
                "question": datasets.Value("string"),
                "query_id": datasets.Value("string"),
                "answer": answer_schema(),
                "validated_answers": datasets.features.Sequence(answer_schema()),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive; declare the train and validation splits."""
        urls = _URLS[self.config.name]
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir, "drop_dataset", "drop_dataset_train.json"
                    ),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir, "drop_dataset", "drop_dataset_dev.json"
                    ),
                    "split": "validation",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield `(key, example)` for every question-answer pair in the file.

        The file maps a section id to a passage with its `qa_pairs`; `key` is a
        running counter over all pairs. Pairs without a `validated_answers`
        entry receive `_EMPTY_VALIDATED_ANSWER`. `split` is accepted because it
        is part of `gen_kwargs`; it is not used here.
        """
        # The JSON is fully loaded here, so the file need not stay open while
        # the generator is being consumed.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        key = 0
        for section_id, example in data.items():
            # Each example (passage) has multiple sub-question-answer pairs.
            for qa in example["qa_pairs"]:
                # Build answer.
                answer = qa["answer"]
                answer = {
                    "number": answer["number"],
                    "date": {
                        "day": answer["date"].get("day", ""),
                        "month": answer["date"].get("month", ""),
                        "year": answer["date"].get("year", ""),
                    },
                    "spans": answer["spans"],
                    "worker_id": answer.get("worker_id", ""),
                    "hit_id": answer.get("hit_id", ""),
                }

                if "validated_answers" in qa:
                    validated_answers = []
                    for validated_answer in qa["validated_answers"]:
                        # Every field is optional; a missing date is treated
                        # like a date with no components.
                        date = validated_answer.get("date", {})
                        validated_answers.append(
                            {
                                "number": validated_answer.get("number", ""),
                                "date": {
                                    "day": date.get("day", ""),
                                    "month": date.get("month", ""),
                                    "year": date.get("year", ""),
                                },
                                # `spans` is a sequence of strings in the schema,
                                # so its default must be an empty list (as in
                                # `_EMPTY_VALIDATED_ANSWER`), not an empty string.
                                "spans": validated_answer.get("spans", []),
                                "worker_id": validated_answer.get("worker_id", ""),
                                "hit_id": validated_answer.get("hit_id", ""),
                            }
                        )
                else:
                    validated_answers = _EMPTY_VALIDATED_ANSWER

                yield key, {
                    "section_id": section_id,
                    "passage": example["passage"],
                    "question": qa["question"],
                    "query_id": qa["query_id"],
                    "answer": answer,
                    "validated_answers": validated_answers,
                }
                key += 1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment