Commit d2a9b759 authored by haileyschoelkopf's avatar haileyschoelkopf
Browse files

in-place replace main with lm-eval2, keeping old git history

parent 814940e8
dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
doc_to_text: "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe?"
doc_to_target: "{% set answer_choices = ['Yes', 'No', 'Maybe'] %}{{answer_choices[label]}}"
metric_list: [
[exact_match, mean, true]
]
# filters: [
# ["none", ["take_first"]]
# ]
from . import metrics
METRIC_REGISTRY = {
"matthews_corrcoef": metrics.matthews_corrcoef,
"f1_score": metrics.f1_score,
"perplexity": metrics.perplexity,
"bleu": metrics.bleu,
"chrf": metrics.chrf,
"ter": metrics.ter,
}
AGGREGATION_REGISTRY = {
"mean": metrics.mean,
"median": metrics.median
}
\ No newline at end of file
from dataclasses import dataclass
from typing import List
from lm_eval.api.instance import Instance
class Filter:
"""
Filter classes operate on a per-task level.
They take all model outputs (`instance.resps` for all `task.instances`)
across all instances of a task, and perform operations.
In a single run, one can configure any number of separate filters or lists of filters.
"""
def __init__(self):
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
"""
Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
if pass in [<inst.resps for instance 0>, <inst.resps for instance 1>] should return
[<filtered resps for instance 0>, <filtered resps for instance 1>]
"""
return resps
@dataclass
class FilterEnsemble:
"""
FilterEnsemble creates a pipeline applying multiple filters.
Its intended usage is to stack multiple post-processing steps in order.
`task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
pipeline separately.
"""
name: str
filters: List[Filter]
def apply(self, instances: List[Instance]):
resps = [inst.resps for inst in instances] # operate just on the model responses
for f in self.filters:
# apply filters in sequence
out = f.apply(resps)
resps = out # TODO: handle the case where a filter returns multiple "buckets"
# add the end results after filtering to filtered_requests of their respective source instances.
# has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
for inst, resp in zip(instances, resps):
inst.filtered_resps[self.name] = resp
from dataclasses import dataclass, field
@dataclass
class Instance:
request_type: str = None # TODO: make this an enum?
doc: dict = None
arguments: tuple = None
id_: int = None
metadata: tuple = None # TODO: better typehints here
resps: list = field(default_factory=list)
filtered_resps: dict = field(default_factory=dict)
task_name: str = None
doc_id: str = None
repeats: str = None
def __post_init__(self):
self.task_name, self.doc_id, self.repeats = self.metadata
@property
def args(self):
"""
Returns (string,) where `string` is the string to calculate loglikelihood over
"""
return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
# import abc
# class Instance(abc.ABC):
# """
# A class used to bind together all necessary information and metadata for
# running forward pass of a model on a specific datapoint.
# """
# # all Instance subclasses have an attribute which is the name of the LM() class function they call to get outputs.
# request_type = None
# def __init__(self, doc, arguments=None, id_=None, metadata=("", None, None)):
# self.doc = doc # store the document which we're using. this is a dict
# self.arguments = arguments
# # need: task name, doc idx, num. repeats
# self.task_name, self.doc_id, self.repeats = metadata
# # id_ = idx within a doc's requests
# self.id_ = id_
# # handle repeats internally. should be able to run K times on exact same input/output pair
# # self.repeats = repeats
# # list containing the returns from each call of the model on this particular set of arguments.
# self.resps = []
# # filtered_resps should end up a dict, with a different key for each set of filters to apply. calculate results against each key in filtered_resps
# self.filtered_resps = {}
# #TODO: add more info as needed for detailed logging
# def __repr__(self):
# return f"Req_{self.request_type}{self.args}{self.id_}"
@dataclass
class LoglikelihoodInstance(Instance):
request_type: str = "loglikelihood"
@dataclass
class RollingLoglikelihoodInstance(Instance):
request_type: str = "loglikelihood_rolling"
@dataclass
class GenerationInstance(Instance):
request_type: str = "greedy_until"
import abc
from lm_eval import utils
class LM(abc.ABC):
def __init__(self):
"""Defines the interface that should be implemented by all LM subclasses.
LMs are assumed to take text (strings) as input and yield strings as output
(inputs/outputs should be tokenization-agnostic.)
"""
@abc.abstractmethod
def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
LM calls whenever possible.
:param requests: list
A list of pairs (context, continuation)
context: str
Context string. Implementations of LM must be able to handle an
empty context string.
continuation: str
The continuation over which log likelihood will be calculated. If
there is a word boundary, the space should be in the continuation.
For example, context="hello" continuation=" world" is correct.
:return: list
A list of pairs (logprob, isgreedy)
logprob: float
The log probability of `continuation`
isgreedy:
Whether `continuation` would be generated by greedy sampling from `context`
"""
pass
@abc.abstractmethod
def loglikelihood_rolling(self, requests):
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model.
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
the max context length.
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
which may simply concatenate multiple documents together.
- IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
multiple chunks, the last input will still a full-sized context.
Example:
Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
Prefix: EOT
Max context length: 4
Resulting input/prediction pairs:
INPUT: EOT 0 1 2
PRED: 0 1 2 3
INPUT: 3 4 5 6
PRED: 4 5 6 7
INPUT: 5 6 7 8
PRED: 8 9
Observe that:
1. Each token is predicted exactly once
2. For the last pair, we provide the full context, but only score the last two tokens
:param requests: list
A list of strings
string: str
String for which we are computing per-token loglikelihood
:return: list
A list of pairs (logprob, isgreedy)
logprob: float
The log probability of `continuation`
isgreedy:
Whether `continuation` would be generated by greedy sampling from `context`
"""
pass
# TODO: Add an optional max length
@abc.abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
:param requests: list
A list of pairs (context, until)
context: str
Context string
until: [str]
The string sequences to generate until. These string sequences
may each span across multiple tokens, or may be part of one token.
:return: list
A list of strings continuation
continuation: str
The generated continuation.
"""
pass
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
class Sampler: # TODO: make this abstract class?
def __init__(self, docs, task, fewshot_indices=None, rnd=None):
self.rnd = rnd
assert self.rnd, "must pass rnd to FewShotSampler!"
self.task = task
self.config = task._config
self.delimiter = self.config.delimiter
self.docs = docs # HF dataset split, provided by task._fewshot_docs()
if fewshot_indices: # subset few-shot docs from
self.docs = self.docs.select(fewshot_indices)
def get_context(self, doc, num_fewshot):
# draw an extra fewshot sample if
n_samples = num_fewshot + 1 if self.config.fewshot_split == self.config.test_split else num_fewshot
fewshotex = self.sample(n_samples)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
labeled_examples = (
self.delimiter.join(
[
self.task.doc_to_text(doc) + self.task.doc_to_target(doc)
for doc in selected_docs
]
)
+ self.delimiter
)
# only returns the fewshot context! Does not append the document, do this outside the object
return labeled_examples
def sample(self, n):
"""
Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
"""
return self.rnd.sample(self.docs, n)
class BalancedSampler(Sampler):
def sample(self, n):
"""
TODO: this should return approximately class-balanced samples from our fewshot examples.
TODO: what order should they be in?
"""
pass
class ManualSampler(Sampler):
def sample(self, n):
"""
"""
pass
# TODO: how should we do design here? might be better to have a single sampler and pass more kwargs at init.
# Depends what's easier for new user to add own functionality on top of
# types of sampler:
# - class-balanced, randomly shuffled
# - class-balanced, one particular set of fewshot examples for all evaled instances
# - hand-specify number of fewshot examples per class?
# - random, varies per example (check that this is curr. default in old repo)
# - random, unified per example
# - enforce a specific fixed fewshot string! (or should we not use this, in favor of including it in prompt template directly)
# - user-specified doc indices to restrict fewshot doc options to
# - user specifies split to use for drawing fewshot instances (TODO: manually prevent this from being same split you eval!)
# - user specifies a prepended "description"/string to add in front of the (prompted) input
# - user specifies a location to draw fewshot samples from? DO THIS IN TASK CLASS
import abc
from typing import Iterable
import numpy as np
import random
from dataclasses import dataclass
import re
import os
import json
import hashlib
import evaluate
import random
import itertools
import datasets
from sqlitedict import SqliteDict
from tqdm import tqdm
import torch
import torch.nn.functional as F
import numpy as np
from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
from lm_eval.api import METRIC_REGISTRY, AGGREGATION_REGISTRY
from lm_eval.api.instance import LoglikelihoodInstance, RollingLoglikelihoodInstance, GenerationInstance
from lm_eval.api.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
from lm_eval import utils
from abc import abstractmethod
class LM(abc.ABC):
def __init__(self):
self.cache_hook = CacheHook(None)
@abstractmethod
def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
LM calls whenever possible.
:param requests: list
A list of pairs (context, continuation)
context: str
Context string. Implementations of LM must be able to handle an
empty context string.
continuation: str
The continuation over which log likelihood will be calculated. If
there is a word boundary, the space should be in the continuation.
For example, context="hello" continuation=" world" is correct.
:return: list
A list of pairs (logprob, isgreedy)
logprob: float
The log probability of `continuation`
isgreedy:
Whether `continuation` would be generated by greedy sampling from `context`
"""
pass
@abstractmethod
def loglikelihood_rolling(self, requests):
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model.
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
the max context length.
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
which may simply concatenate multiple documents together.
- IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
multiple chunks, the last input will still a full-sized context.
Example:
Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
Prefix: EOT
Max context length: 4
Resulting input/prediction pairs:
INPUT: EOT 0 1 2
PRED: 0 1 2 3
INPUT: 3 4 5 6
PRED: 4 5 6 7
INPUT: 5 6 7 8
PRED: 8 9
Observe that:
1. Each token is predicted exactly once
2. For the last pair, we provide the full context, but only score the last two tokens
:param requests: list
A list of strings
string: str
String for which we are computing per-toke loglikelihood
:return: list
A list of pairs (logprob, isgreedy)
logprob: float
The log probability of `continuation`
isgreedy:
Whether `continuation` would be generated by greedy sampling from `context`
"""
pass
# TODO: Add an optional max length
@abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
:param requests: list
A list of pairs (context, until)
context: str
Context string
until: [str]
The string sequences to generate until. These string sequences
may each span across multiple tokens, or may be part of one token.
:return: list
A list of strings continuation
continuation: str
The generated continuation.
"""
pass
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
class BaseLM(LM):
@property
@abstractmethod
def eot_token_id(self):
pass
@property
@abstractmethod
def max_length(self):
pass
@property
@abstractmethod
def max_gen_toks(self):
pass
@property
@abstractmethod
def batch_size(self):
pass
@property
@abstractmethod
def device(self):
pass
@abstractmethod
def tok_encode(self, string: str):
pass
@abstractmethod
def tok_decode(self, tokens: Iterable[int]):
pass
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id):
pass
@abstractmethod
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
pass
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for (string,) in tqdm(requests):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
string_nll = self._loglikelihood_tokens(
rolling_token_windows, disable_tqdm=True
)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
# TODO: automatic (variable) batch size detection for vectorization
re_ord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(
tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
):
inps = []
cont_toks_list = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
).to(self.device)
(inplen,) = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = (
padding_length if padding_length is not None else inplen
)
# pad length from seq to padding_length
inp = torch.cat(
[
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(
inp.device
), # [padding_length - seq]
],
dim=0,
)
inps.append(inp.unsqueeze(0)) # [1, padding_length]
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length
multi_logits = F.log_softmax(
self._model_call(batched_inps), dim=-1
).cpu() # [batch, padding_length, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(
chunk, multi_logits, inps, inplens, cont_toks_list
):
# Slice to original seq length
contlen = len(cont_toks)
logits = logits[inplen - contlen : inplen].unsqueeze(
0
) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(
0
) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-1
) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return re_ord.get_original(res)
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles until that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
res = []
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
re_ord = utils.Reorderer(requests, _collate)
for context, until in tqdm(re_ord.get_reordered()):
if isinstance(until, str):
until = [until]
(primary_until,) = self.tok_encode(until[0])
context_enc = torch.tensor(
[self.tok_encode(context)[self.max_gen_toks - self.max_length :]]
).to(self.device)
cont = self._model_generate(
context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until
)
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return re_ord.get_original(res)
from lm_eval.filters import build_filter_ensemble
from lm_eval.api import samplers
@dataclass
class TaskConfig(dict):
task_name: str = None
dataset_path: str = None
dataset_name: str = None
training_split: str = None
validation_split: str = None
test_split: str = None
fewshot_split: str = None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
# TODO: add this as more jinja2 appended to start of jinja2 templates. Should allow users to set vars
# s.t. they can define e.g. {% set question = query %} to map dataset columns to "canonical" names in prompts.
template_vars: str = None
doc_to_text: str = None
doc_to_target: str = None
# aggregation: dict = None # TODO: remove, I think these 2 are obsolete w/ current metric_list impl.
# higher_is_better: dict = None
num_fewshot: int = 0
batch_size: int = 1
metric_list: str = None
gold_alias: str = None
output_type: str = "greedy_until"
delimiter: str = "\n\n"
filters: str = None #TODO: need to make this typehint `list`?
normalization: str = None # TODO: add length-normalization of various types, mutual info
stop_sequences: list = None # TODO: allow passing of stop sequences to greedy gen.
def __getitem__(self, item):
return getattr(self, item)
class Task(abc.ABC):
......@@ -379,6 +61,7 @@ class Task(abc.ABC):
{"question": ..., question, answer)
"""
VERSION = None
# The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
# or a path to a custom `datasets` loading script.
DATASET_PATH: str = None
......@@ -386,7 +69,14 @@ class Task(abc.ABC):
# The name of a subset within `DATASET_PATH`.
DATASET_NAME: str = None
def __init__(self, data_dir=None, cache_dir=None, download_mode=None):
OUTPUT_TYPE: str = None
def __init__(
self,
data_dir=None,
cache_dir=None,
download_mode=None,
config=None,
):
"""
:param data_dir: str
Stores the path to a local folder containing the `Task`'s data files.
......@@ -412,6 +102,17 @@ class Task(abc.ABC):
self.download(data_dir, cache_dir, download_mode)
self._training_docs = None
self._fewshot_docs = None
self._instances = None
self._config = TaskConfig(**config) if config else {}
if not hasattr(self, "_filters"):
self._filters = []
for name, components in self._config.get("filters", [["none", ["take_first"]]]):
filter_pipeline = build_filter_ensemble(name, components)
self._filters.append(filter_pipeline)
self.sampler = samplers.Sampler(self.training_docs(), self, rnd=random.Random()) # TODO: pass the correct docs in here
def download(self, data_dir=None, cache_dir=None, download_mode=None):
"""Downloads and returns the task dataset.
......@@ -446,21 +147,17 @@ class Task(abc.ABC):
download_mode=download_mode,
)
def should_decontaminate(self):
"""Whether this task supports decontamination against model training set."""
return False
@abstractmethod
@abc.abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abstractmethod
@abc.abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abstractmethod
@abc.abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
......@@ -497,6 +194,13 @@ class Task(abc.ABC):
"""
return doc
@property
def instances(self):
"""After calling `task.build_all_requests()`, tasks
maintain a list of the dataset instances which will be evaluated.
"""
return self._instances
def fewshot_examples(self, k, rnd):
if self._training_docs is None:
self._training_docs = list(self.training_docs())
......@@ -509,16 +213,46 @@ class Task(abc.ABC):
)
assert False
@abstractmethod
@abc.abstractmethod
def doc_to_text(self, doc):
pass
@abstractmethod
@abc.abstractmethod
def doc_to_target(self, doc):
pass
@abstractmethod
def construct_requests(self, doc, ctx):
def build_all_requests(self, limit=None):
"""Build a set of Instances for a task, and store them in task.instances"""
if self.has_test_docs():
docs = self.test_docs()
elif self.has_validation_docs():
docs = self.validation_docs()
else:
assert (
False
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
instances = []
for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
# sample fewshot context
fewshot_ctx = self.fewshot_context(
doc, self._config.num_fewshot, rnd=random.Random()
)
# TODO: hardcoded for now: # of runs on each input to be 2. # TODO: we should override this if doing greedy gen so users don't waste time+compute
inst = self.construct_requests(doc=doc, ctx=fewshot_ctx, metadata=(self._config["task_name"], doc_id, 2))
if not isinstance(inst, list):
inst = [inst]
instances.extend(inst)
self._instances = instances
assert len(self._instances) != 0, "task.build_requests() did not find any docs!"
@abc.abstractmethod
def construct_requests(self, doc, ctx, **kwargs):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......@@ -528,10 +262,17 @@ class Task(abc.ABC):
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
:param doc_idx: int
The index of a document within `self.test_docs()` or `self.validation_docs()`,
whichever is the main split used.
:param repeats: int
TODO: update this docstring
The number of times each instance in a dataset is inferred on. Defaults to 1,
can be increased for techniques like majority voting.
"""
pass
@abstractmethod
@abc.abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
......@@ -544,7 +285,7 @@ class Task(abc.ABC):
"""
pass
@abstractmethod
@abc.abstractmethod
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
......@@ -553,7 +294,7 @@ class Task(abc.ABC):
"""
pass
@abstractmethod
@abc.abstractmethod
def higher_is_better(self):
"""
:returns: {str: bool}
......@@ -562,20 +303,8 @@ class Task(abc.ABC):
"""
pass
def fewshot_description(self):
import warnings
warnings.warn(
"`fewshot_description` will be removed in futures versions. Pass "
"any custom descriptions to the `evaluate` function instead.",
DeprecationWarning,
)
return ""
@utils.positional_deprecated
def fewshot_context(
self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
def fewshot_context(self, doc, num_fewshot, rnd=None):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
......@@ -583,35 +312,22 @@ class Task(abc.ABC):
The document as returned from training_docs, validation_docs, or test_docs.
:param num_fewshot: int
The number of fewshot examples to provide in the returned context string.
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param rnd: random.Random
The pseudo-random number generator used to randomly sample examples.
WARNING: This is currently a required arg although it's optionalized with a default `None`.
:param description: str
The task's description that will be prepended to the fewshot examples.
:returns: str
The fewshot context.
"""
assert (
rnd is not None
), "A `random.Random` generator argument must be provided to `rnd`"
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
"`description` arg."
)
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
description = description + "\n\n" if description else ""
if num_fewshot == 0:
labeled_examples = ""
else:
# labeled_examples = self.sampler.get_context(doc, self._config.num_fewshot)
# for sets with no training docs, draw from other set *but ensure no overlap with current doc*
if self.has_training_docs():
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
......@@ -639,21 +355,164 @@ class Task(abc.ABC):
)
example = self.doc_to_text(doc)
return description + labeled_examples + example
return labeled_examples + example
def apply_filters(self):
for f in self._filters:
f.apply(self._instances)
class ConfigurableTask(Task):
VERSION = "2.0"
OUTPUT_TYPE = "greedy_until"
def __init__(
self, data_dir=None, cache_dir=None, download_mode=None, config: dict = None
):
self._config = TaskConfig(**config)
if self._config.dataset_path is not None:
self.DATASET_PATH = self._config.dataset_path
if self._config.dataset_name is not None:
self.DATASET_NAME = self._config.dataset_name
if self._config.metric_list is not None:
self._metric_list = {}
self._aggregation_list = {}
self._higher_is_better = {}
for (metric_name, aggregation, higher_is_better) in self._config.metric_list:
self._aggregation_list[metric_name] = AGGREGATION_REGISTRY[aggregation]
self._higher_is_better[metric_name] = higher_is_better
if metric_name in METRIC_REGISTRY.keys():
self._metric_list[metric_name] = METRIC_REGISTRY[metric_name]
else:
try:
metric_object = evaluate.load(metric_name)
self._metric_list[metric_name] = metric_object
except Exception as ex:
raise Warning(
"{} not found in the evaluate library!".format(metric_name),
"Please check https://huggingface.co/evaluate-metric",
)
self.download(data_dir, cache_dir, download_mode)
self._training_docs = None
self._fewshot_docs = None
self._filters = []
for name, components in self._config.get("filters", [["none", ["take_first"]]]):
filter_pipeline = build_filter_ensemble(name, components)
self._filters.append(filter_pipeline)
def has_training_docs(self):
if self._config.training_split is not None:
return True
else:
return False
def has_validation_docs(self):
if self._config.validation_split is not None:
return True
else:
return False
def has_test_docs(self):
if self._config.test_split is not None:
return True
else:
return False
def training_docs(self):
if self._config.training_split is not None:
return self.dataset[self._config.training_split]
def validation_docs(self):
if self._config.validation_split is not None:
return self.dataset[self._config.validation_split]
def test_docs(self):
if self._config.test_split is not None:
return self.dataset[self._config.test_split]
def _process_doc(self, doc):
"""
Override this to process (detokenize, strip, replace, etc.) individual
documents. This can be used in a map over documents of a data split.
E.g. `map(self._process_doc, self.dataset["validation"])`
:return: dict
The processed version of the specified `doc`.
"""
return doc
def doc_to_text(self, doc):
return utils.apply_template(self._config.doc_to_text, doc)
def doc_to_target(self, doc):
return utils.apply_template(self._config.doc_to_target, doc)
def construct_requests(self, doc, ctx, **kwargs):
if self.OUTPUT_TYPE == "greedy_until":
return GenerationInstance(doc=doc, arguments=(ctx, "\n\n"), id_=0, **kwargs)
def process_results(self, doc, results):
if self._config.gold_alias is not None:
gold = doc[self._config.gold_alias]
else:
gold = self.doc_to_target(doc)
result_dict = {}
for key, result in zip(self._metric_list.keys(), results):
_dict = self._metric_list[key].compute(
references=[gold],
predictions=[result],
)
result_dict[key] = _dict[key]
return result_dict
def aggregation(self):
return self._aggregation_list
def higher_is_better(self):
return self._higher_is_better_list
class MultipleChoiceTask(Task):
OUTPUT_TYPE: str = "loglikelihood"
def doc_to_target(self, doc):
return " " + doc["choices"][doc["gold"]]
def construct_requests(self, doc, ctx):
lls = [
rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in doc["choices"]
]
def construct_requests(self, doc, ctx, **kwargs):
return [LoglikelihoodInstance(
doc=doc,
arguments=(ctx, " {}".format(choice)),
id_=i,
**kwargs,
)
for i, choice in enumerate(doc["choices"])]
#lls = [
# rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in doc["choices"]
# ]
return lls
# return lls
def process_results(self, doc, results):
results = [res[0] for res in results] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere?
gold = doc["gold"]
acc = 1.0 if np.argmax(results) == gold else 0.0
......@@ -679,9 +538,8 @@ class MultipleChoiceTask(Task):
class PerplexityTask(Task, abc.ABC):
def should_decontaminate(self):
"""Whether this task supports decontamination against model training set."""
return True
OUTPUT_TYPE = "loglikelihood_rolling"
def has_training_docs(self):
return False
......@@ -691,7 +549,7 @@ class PerplexityTask(Task, abc.ABC):
return []
def fewshot_context(
self, doc, num_fewshot, provide_description=None, rnd=None, description=None
self, doc, num_fewshot, rnd=None
):
assert (
num_fewshot == 0
......@@ -699,16 +557,6 @@ class PerplexityTask(Task, abc.ABC):
assert (
rnd is not None
), "A `random.Random` generator argument must be provided to `rnd`."
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
"`description` arg."
)
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
return ""
......@@ -728,10 +576,12 @@ class PerplexityTask(Task, abc.ABC):
def doc_to_target(self, doc):
return doc
def construct_requests(self, doc, ctx):
def construct_requests(self, doc, ctx, **kwargs):
assert not ctx
req = rf.loglikelihood_rolling(self.doc_to_target(doc))
return req
return RollingLoglikelihoodInstance(doc=doc, arguments=(self.doc_to_target(doc),), id_=0, **kwargs)
# req = rf.loglikelihood_rolling(self.doc_to_target(doc))
# return req
def process_results(self, doc, results):
(loglikelihood,) = results
......@@ -758,134 +608,3 @@ class PerplexityTask(Task, abc.ABC):
def count_words(cls, doc):
"""Downstream tasks with custom word boundaries should override this!"""
return len(re.split(r"\s+", doc))
def hash_args(attr, args):
dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode("utf-8")).hexdigest()
class CacheHook:
def __init__(self, cachinglm):
if cachinglm is None:
self.dbdict = None
return
self.dbdict = cachinglm.dbdict
def add_partial(self, attr, req, res):
if self.dbdict is None:
return
hsh = hash_args(attr, req)
self.dbdict[hsh] = res
class CachingLM:
def __init__(self, lm, cache_db):
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
Underlying LM
:param cache_db: str
Path to cache db
"""
self.lm = lm
self.cache_db = cache_db
if os.path.dirname(cache_db):
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True)
# add hook to lm
lm.set_cache_hook(self.get_cache_hook())
def __getattr__(self, attr):
def fn(requests):
res = []
remaining_reqs = []
# figure out which ones are cached and which ones are new
for req in requests:
hsh = hash_args(attr, req)
if hsh in self.dbdict:
ob = self.dbdict[hsh]
assert ob is not None
res.append(ob)
else:
res.append(None)
remaining_reqs.append(req)
# actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
# stick the new ones back into the list and also cache any of the new ones
resptr = 0
for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None:
resptr += 1
res[resptr] = r
# caching
hsh = hash_args(attr, req)
self.dbdict[hsh] = r
self.dbdict.commit()
return res
return fn
def get_cache_hook(self):
return CacheHook(self)
REQUEST_RETURN_LENGTHS = {
"loglikelihood": 2,
"greedy_until": None,
"loglikelihood_rolling": None,
}
class Request:
def __init__(self, request_type, args, index=None):
if request_type not in REQUEST_RETURN_LENGTHS.keys():
raise NotImplementedError(
"The request type {} is not implemented!".format(request_type)
)
self.request_type = request_type
self.args = args
self.index = index
def __iter__(self):
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError("This request type does not return multiple arguments!")
for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
yield Request(self.request_type, self.args, i)
def __getitem__(self, i):
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError("This request type does not return multiple arguments!")
return Request(self.request_type, self.args, i)
def __eq__(self, other):
return (
self.request_type == other.request_type
and self.args == other.args
and self.index == other.index
)
def __repr__(self):
return f"Req_{self.request_type}{self.args}[{self.index}]\n"
class RequestFactory:
def __getattr__(self, attr):
def fn(*args):
return Request(attr, args)
return fn
rf = RequestFactory()
# datasets
This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not.
__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path.
__WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ASDIV dataset."""
import os
import xml.etree.ElementTree as ET
import datasets
_CITATION = """\
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
"""
_DESCRIPTION = """\
ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
patterns and problem types) English math word problem (MWP) corpus for evaluating
the capability of various MWP solvers. Existing MWP corpora for studying AI progress
remain limited either in language usage patterns or in problem types. We thus present
a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
types taught in elementary school. Each MWP is annotated with its problem type and grade
level (for indicating the level of difficulty).
"""
_HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
class ASDiv(datasets.GeneratorBasedBuilder):
"""ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers"""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="asdiv",
version=VERSION,
description="A diverse corpus for evaluating and developing english math word problem solvers",
)
]
def _info(self):
features = datasets.Features(
{
"body": datasets.Value("string"),
"question": datasets.Value("string"),
"solution_type": datasets.Value("string"),
"answer": datasets.Value("string"),
"formula": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = _URLS
data_dir = dl_manager.download_and_extract(urls)
base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50"
return [
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": os.path.join(
data_dir, base_filepath, "dataset", "ASDiv.xml"
),
"split": datasets.Split.VALIDATION,
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
tree = ET.parse(filepath)
root = tree.getroot()
for key, problem in enumerate(root.iter("Problem")):
yield key, {
"body": problem.find("Body").text,
"question": problem.find("Question").text,
"solution_type": problem.find("Solution-Type").text,
"answer": problem.find("Answer").text,
"formula": problem.find("Formula").text,
}
{"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CoQA dataset.
This `CoQA` adds the "additional_answers" feature that's missing in the original
datasets version:
https://github.com/huggingface/datasets/blob/master/datasets/coqa/coqa.py
"""
import json
import datasets
_CITATION = """\
@misc{reddy2018coqa,
title={CoQA: A Conversational Question Answering Challenge},
author={Siva Reddy and Danqi Chen and Christopher D. Manning},
year={2018},
eprint={1808.07042},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_DESCRIPTION = """\
CoQA is a large-scale dataset for building Conversational Question Answering
systems. The goal of the CoQA challenge is to measure the ability of machines to
understand a text passage and answer a series of interconnected questions that
appear in a conversation.
"""
_HOMEPAGE = "https://stanfordnlp.github.io/coqa/"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = {
"train": "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json",
"validation": "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json",
}
# `additional_answers` are not available in the train set so we fill them with
# empty dicts of the same form.
_EMPTY_ADDITIONAL_ANSWER = {
"0": [
{
"span_start": -1,
"span_end": -1,
"span_text": "",
"input_text": "",
"turn_id": -1,
}
],
"1": [
{
"span_start": -1,
"span_end": -1,
"span_text": "",
"input_text": "",
"turn_id": -1,
}
],
"2": [
{
"span_start": -1,
"span_end": -1,
"span_text": "",
"input_text": "",
"turn_id": -1,
}
],
}
class Coqa(datasets.GeneratorBasedBuilder):
"""CoQA is a large-scale dataset for building Conversational Question Answering systems."""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="coqa", version=VERSION, description="The CoQA dataset."
),
]
def _info(self):
features = datasets.Features(
{
"id": datasets.Value("string"),
"source": datasets.Value("string"),
"story": datasets.Value("string"),
"questions": datasets.features.Sequence(
{
"input_text": datasets.Value("string"),
"turn_id": datasets.Value("int32"),
}
),
"answers": datasets.features.Sequence(
{
"span_start": datasets.Value("int32"),
"span_end": datasets.Value("int32"),
"span_text": datasets.Value("string"),
"input_text": datasets.Value("string"),
"turn_id": datasets.Value("int32"),
}
),
"additional_answers": {
"0": datasets.features.Sequence(
{
"span_start": datasets.Value("int32"),
"span_end": datasets.Value("int32"),
"span_text": datasets.Value("string"),
"input_text": datasets.Value("string"),
"turn_id": datasets.Value("int32"),
}
),
"1": datasets.features.Sequence(
{
"span_start": datasets.Value("int32"),
"span_end": datasets.Value("int32"),
"span_text": datasets.Value("string"),
"input_text": datasets.Value("string"),
"turn_id": datasets.Value("int32"),
}
),
"2": datasets.features.Sequence(
{
"span_start": datasets.Value("int32"),
"span_end": datasets.Value("int32"),
"span_text": datasets.Value("string"),
"input_text": datasets.Value("string"),
"turn_id": datasets.Value("int32"),
}
),
},
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = {"train": _URLS["train"], "validation": _URLS["validation"]}
data_dirs = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dirs["train"],
"split": datasets.Split.TRAIN,
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dirs["validation"],
"split": datasets.Split.VALIDATION,
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
data = json.load(f)
for row in data["data"]:
id = row["id"]
source = row["source"]
story = row["story"]
questions = [
{"input_text": q["input_text"], "turn_id": q["turn_id"]}
for q in row["questions"]
]
answers = [
{
"span_start": a["span_start"],
"span_end": a["span_end"],
"span_text": a["span_text"],
"input_text": a["input_text"],
"turn_id": a["turn_id"],
}
for a in row["answers"]
]
if split == datasets.Split.TRAIN:
additional_answers = _EMPTY_ADDITIONAL_ANSWER
else:
additional_answers = {
"0": [
{
"span_start": a0["span_start"],
"span_end": a0["span_end"],
"span_text": a0["span_text"],
"input_text": a0["input_text"],
"turn_id": a0["turn_id"],
}
for a0 in row["additional_answers"]["0"]
],
"1": [
{
"span_start": a1["span_start"],
"span_end": a1["span_end"],
"span_text": a1["span_text"],
"input_text": a1["input_text"],
"turn_id": a1["turn_id"],
}
for a1 in row["additional_answers"]["1"]
],
"2": [
{
"span_start": a2["span_start"],
"span_end": a2["span_end"],
"span_text": a2["span_text"],
"input_text": a2["input_text"],
"turn_id": a2["turn_id"],
}
for a2 in row["additional_answers"]["2"]
],
}
yield row["id"], {
"id": id,
"story": story,
"source": source,
"questions": questions,
"answers": answers,
"additional_answers": additional_answers,
}
{"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}}
{"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
# even if there are multiple types of answers for the same question.
"""DROP dataset."""
import json
import os
import datasets
_CITATION = """\
@misc{dua2019drop,
title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
year={2019},
eprint={1903.00161},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_DESCRIPTION = """\
DROP is a QA dataset which tests comprehensive understanding of paragraphs. In
this crowdsourced, adversarially-created, 96k question-answering benchmark, a
system must resolve multiple references in a question, map them onto a paragraph,
and perform discrete operations over them (such as addition, counting, or sorting).
"""
_HOMEPAGE = "https://allenai.org/data/drop"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = {
"drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip",
}
_EMPTY_VALIDATED_ANSWER = [
{
"number": "",
"date": {
"day": "",
"month": "",
"year": "",
},
"spans": [],
"worker_id": "",
"hit_id": "",
}
]
class Drop(datasets.GeneratorBasedBuilder):
"""DROP is a QA dataset which tests comprehensive understanding of paragraphs."""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="drop", version=VERSION, description="The DROP dataset."
),
]
def _info(self):
features = datasets.Features(
{
"section_id": datasets.Value("string"),
"passage": datasets.Value("string"),
"question": datasets.Value("string"),
"query_id": datasets.Value("string"),
"answer": {
"number": datasets.Value("string"),
"date": {
"day": datasets.Value("string"),
"month": datasets.Value("string"),
"year": datasets.Value("string"),
},
"spans": datasets.features.Sequence(datasets.Value("string")),
"worker_id": datasets.Value("string"),
"hit_id": datasets.Value("string"),
},
"validated_answers": datasets.features.Sequence(
{
"number": datasets.Value("string"),
"date": {
"day": datasets.Value("string"),
"month": datasets.Value("string"),
"year": datasets.Value("string"),
},
"spans": datasets.features.Sequence(datasets.Value("string")),
"worker_id": datasets.Value("string"),
"hit_id": datasets.Value("string"),
}
),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = _URLS[self.config.name]
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": os.path.join(
data_dir, "drop_dataset", "drop_dataset_train.json"
),
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": os.path.join(
data_dir, "drop_dataset", "drop_dataset_dev.json"
),
"split": "validation",
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
data = json.load(f)
key = 0
for section_id, example in data.items():
# Each example (passage) has multiple sub-question-answer pairs.
for qa in example["qa_pairs"]:
# Build answer.
answer = qa["answer"]
answer = {
"number": answer["number"],
"date": {
"day": answer["date"].get("day", ""),
"month": answer["date"].get("month", ""),
"year": answer["date"].get("year", ""),
},
"spans": answer["spans"],
"worker_id": answer.get("worker_id", ""),
"hit_id": answer.get("hit_id", ""),
}
validated_answers = []
if "validated_answers" in qa:
for validated_answer in qa["validated_answers"]:
va = {
"number": validated_answer.get("number", ""),
"date": {
"day": validated_answer["date"].get("day", ""),
"month": validated_answer["date"].get("month", ""),
"year": validated_answer["date"].get("year", ""),
},
"spans": validated_answer.get("spans", ""),
"worker_id": validated_answer.get("worker_id", ""),
"hit_id": validated_answer.get("hit_id", ""),
}
validated_answers.append(va)
else:
validated_answers = _EMPTY_VALIDATED_ANSWER
yield key, {
"section_id": section_id,
"passage": example["passage"],
"question": qa["question"],
"query_id": qa["query_id"],
"answer": answer,
"validated_answers": validated_answers,
}
key += 1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment