Commit 6ac42518 authored by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into openai_completions
parents 9c3ba7d4 e3644fcc
import os
import yaml
from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
TASK_REGISTRY,
GROUP_REGISTRY,
ALL_TASKS,
)
def include_benchmarks(task_dir: str) -> None:
for root, subdirs, file_list in os.walk(task_dir):
if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
try:
benchmark_path = os.path.join(root, f)
with open(benchmark_path, "rb") as file:
yaml_config = yaml.full_load(file)
assert "group" in yaml_config
group = yaml_config["group"]
all_task_list = yaml_config["task"]
config_list = [
task for task in all_task_list if type(task) != str
]
task_list = [
task for task in all_task_list if type(task) == str
]
for task_config in config_list:
var_configs = check_prompt_config(
{
**task_config,
**{"group": group},
}
)
for config in var_configs:
register_configurable_task(config)
task_names = utils.pattern_match(task_list, ALL_TASKS)
for task in task_names:
if task in TASK_REGISTRY:
if group in GROUP_REGISTRY:
GROUP_REGISTRY[group].append(task)
else:
GROUP_REGISTRY[group] = [task]
ALL_TASKS.add(group)
except Exception as error:
eval_logger.warning(
"Failed to load benchmark in\n"
f" {benchmark_path}\n"
" Benchmark will not be added to registry\n"
f" Error: {error}"
)
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
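For reference, a sketch of what `include_benchmarks` expects a benchmark YAML to contain once `yaml.full_load` returns it (the group and task names below are purely illustrative): a `group` name plus a `task` list whose entries are either already-registered task names or inline task configs.

```
# illustrative only; not part of this commit
yaml_config = {
    "group": "my_benchmark",           # hypothetical group name
    "task": [
        "my_registered_task",          # str entries are pattern-matched against ALL_TASKS
        {                              # dict entries are passed through check_prompt_config()
            "task": "my_inline_task",
            "dataset_path": "some/dataset",
        },
    ],
}
```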
......@@ -3,7 +3,7 @@ import string
import pickle
import traceback
from pprint import pprint
from typing import Iterator, Sequence, TypeVar
from typing import Iterator, Sequence, TypeVar, List, Tuple
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
......@@ -21,7 +21,7 @@ T = TypeVar("T")
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[tuple[T, ...]]:
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
history = []
while n > 1:
# PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
......@@ -70,14 +70,14 @@ def word_ngrams(s: str, n: int) -> Iterator[str]:
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s: str) -> Iterator[tuple[str, tuple[int, int]]]:
def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string on whitespaces and records the indices of each in the original string.
:return: generator((word, (start_idx, end_idx)), ...)
"""
return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
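# For example (illustrative): list(split_indices("a bc")) == [("a", (0, 0)), ("bc", (2, 3))]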
def word_ngrams_indices(s: str, n: int) -> Iterator[tuple[str, tuple[int, int]]]:
def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string into pairs of (ngram words, their start/end indices)"""
tokens_with_indices = split_indices(s)
......@@ -157,7 +157,7 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.register_contaminant_python(dirt_string)
def clean(self, dirty_string: str) -> list[str]:
def clean(self, dirty_string: str) -> List[str]:
"""Clean a string (e.g. a training set) by removing all ngrams previously
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
......@@ -168,8 +168,8 @@ class Janitor:
return self.clean_python(dirty_string)
def _split_chunks(
self, dirty_string: str, dirty_parts: Sequence[tuple]
) -> list[str]:
self, dirty_string: str, dirty_parts: Sequence[Tuple]
) -> List[str]:
clean_chunks = []
splice_idx = 0
end = -1
......@@ -197,7 +197,7 @@ class Janitor:
janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
)
def clean_cpp(self, dirty_string: str) -> list[str]:
def clean_cpp(self, dirty_string: str) -> List[str]:
contamination_indices = janitor_util.clean_ngram_with_indices(
dirty_string, self.delete_chars, self.ngram_n
)
......@@ -215,7 +215,7 @@ class Janitor:
word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
)
def clean_python(self, dirty_string: str) -> list[str]:
def clean_python(self, dirty_string: str) -> List[str]:
contamination_indices = (
(None, *idx_pair)
for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
......
......@@ -2,7 +2,6 @@ import random
import itertools
import json
import collections
import logging
import sys
import torch
......@@ -11,7 +10,6 @@ import numpy as np
import lm_eval.api
import lm_eval.tasks
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
......@@ -22,14 +20,9 @@ from lm_eval.utils import (
make_table,
create_iterator,
get_git_commit_hash,
eval_logger,
)
from lm_eval.logger import eval_logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))
@positional_deprecated
def simple_evaluate(
......@@ -120,6 +113,8 @@ def simple_evaluate(
task_obj = task_dict[task_name]
if type(task_obj) == tuple:
group, task_obj = task_obj
if task_obj is None:
continue
config = task_obj._config
if num_fewshot is not None:
......@@ -184,7 +179,7 @@ def evaluate(
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
Dictionary of tasks. Tasks will be taken to have name type(task).config.task.
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
......@@ -209,27 +204,46 @@ def evaluate(
samples = collections.defaultdict(list)
# tracks all Instances/requests a model must generate output on.
requests = collections.defaultdict(list)
# Stores task scores based on task grouping.
aggregate = collections.defaultdict(dict)
# tracks if a task was chosen via user selecting a group containing it
task_groups = collections.defaultdict(dict)
# Aggregated task scores presented with groups
results_agg = collections.defaultdict(dict)
# Aggregated groups scores only
groups_agg = collections.defaultdict(dict)
# stores the amount to pad out reqs per req. type so that
# number of fwd passes per distributed rank is equal
padding_requests = collections.defaultdict(int)
# Stores group related keys and values for group-aggregation
task_groups = collections.defaultdict(dict)
# store the hierarchy to do proper ordering
task_hierarchy = collections.defaultdict(list)
# store the ordering of tasks and groups
task_order = collections.defaultdict(int)
task_group_alias = collections.defaultdict(dict)
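# e.g. after the loop below, task_hierarchy might hold {"glue": ["cola", "sst2"]} for a
# user-selected group and {"lambada": []} for a standalone task (illustrative names)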
# get lists of each type of request
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
task_groups[task_name] = group
aggregate[task_name] = {}
group_name, task = task
task_hierarchy[group_name].append(task_name)
versions[group_name] = "N/A"
else:
group_name = None
task_hierarchy[task_name] = []
if task is None:
continue
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
if "task_alias" in configs[task_name]:
task_group_alias[task_name] = configs[task_name]["task_alias"]
if (
("group_alias" in configs[task_name])
and (group_name not in task_group_alias)
and (group_name is not None)
):
task_group_alias[group_name] = configs[task_name]["group_alias"]
if limit is not None:
if task.has_test_docs():
task_docs = task.test_docs()
......@@ -241,7 +255,7 @@ def evaluate(
task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
eval_logger.info(
eval_logger.debug(
f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
)
......@@ -256,12 +270,9 @@ def evaluate(
eval_logger.info(f"Request: {str(inst)}")
# aggregate Instances by LM method requested to get output.
reqtype = (
"loglikelihood"
if task.OUTPUT_TYPE == "multiple_choice"
else task.OUTPUT_TYPE
) # TODO: this is hacky, fix in task.py
requests[reqtype].extend(task.instances)
for instance in task.instances:
reqtype = instance.request_type
requests[reqtype].append(instance)
if lm.world_size > 1:
instances_rnk = torch.tensor(len(task._instances), device=lm.device)
......@@ -301,6 +312,8 @@ def evaluate(
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
if task is None:
continue
task.apply_filters()
### Collect values of metrics on all datapoints ###
......@@ -310,6 +323,8 @@ def evaluate(
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
if task is None:
continue
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
......@@ -396,34 +411,56 @@ def evaluate(
vals = vals_torch
if lm.rank == 0:
### Get task ordering for correct sample-wide aggregation
group_to_task = {}
for group in task_hierarchy.keys():
if group not in task_order:
task_order[group] = 0
if len(task_hierarchy[group]) > 0:
group_to_task[group] = task_hierarchy[group].copy()
for task in task_hierarchy[group]:
if task in task_order:
task_order[task] += 1
else:
task_order[task] = 1 + task_order[group]
if task in task_hierarchy:
group_to_task[group].remove(task)
group_to_task[group].extend(task_hierarchy[task])
task_to_group = {}
for group in group_to_task:
for task in group_to_task[group]:
if task in task_to_group:
task_to_group[task].append(group)
else:
task_to_group[task] = [group]
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
metric_key = metric + "," + key
if type(task) == tuple:
group, task = task
task_score = task.aggregation()[metric](items)
results[task_name][metric + "," + key] = task_score
# Need to put back in results
# pythia | acc
# | perplexity
# | word_perplexity
# | byte_perplexity
# | bits_per_byte
if task_name in task_groups:
group_name = task_groups[task_name]
if metric in list(aggregate[group_name].keys()):
aggregate[group_name][metric].append(task_score)
else:
aggregate[group_name][metric] = [task_score]
group_name, task = task
else:
group_name = None
agg_fn = task.aggregation()[metric]
results[task_name][metric_key] = agg_fn(items)
results[task_name]["samples"] = len(items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. still looking for a cleaner way to do this
if False: # bootstrap_iters > 0:
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
......@@ -431,19 +468,136 @@ def evaluate(
if stderr is not None:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
if bool(aggregate):
for group in aggregate.keys():
for metric in aggregate[group].keys():
aggregate[group][metric] = np.average(aggregate[group][metric])
versions[group] = "N/A"
if bool(results):
for group, task_list in reversed(task_hierarchy.items()):
if task_list == []:
total_size = results[group]["samples"]
else:
total_size = 0
for task in task_list:
metrics = results[task]
current_size = metrics.pop("samples")
# TODO: There should be a way for users
# to toggle between weighted and
# unweighted averaging
# For unweighted averaging, use:
# current_size = 1
all_stderr = []
for metric in [
key for key in metrics.keys() if "_stderr" not in key
]:
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
var_score = stderr_score**2
metric_score = results[task][metric]
all_stderr.append(stderr)
if metric in results[group]:
results[group][metric] = (
results[group][metric] * total_size
+ metric_score * current_size
) / (total_size + current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
results[group][stderr] = (
(total_size - 1) * results[group][stderr]
+ (current_size - 1) * var_score
) / (
total_size + current_size - 1
) + total_size * current_size / (
(total_size + current_size)
* (total_size + current_size - 1)
) * (
results[group][metric] - metric_score
) ** 2
else:
results[group][metric] = metric_score
results[group][stderr] = var_score
total_size += current_size
for stderr in all_stderr:
results[group][stderr] = np.sqrt(results[group][stderr])
results[group]["samples"] = total_size
def print_tasks(task_hierarchy, task_order, task_version, task_group_alias):
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
for group_name, task_list in task_hierarchy.items():
order = task_order[group_name]
results_agg[group_name] = results[group_name].copy()
results_agg[group_name]["tab"] = order
if (order < max(task_order.values())) and (len(task_list) > 0):
groups_agg[group_name] = results[group_name].copy()
groups_agg[group_name]["tab"] = order
if task_list != []:
for task in sorted(task_list):
if task in task_hierarchy:
_task_hierarchy = {task: task_hierarchy[task]}
else:
_task_hierarchy = {task: []}
_results_agg, _groups_agg, task_version = print_tasks(
_task_hierarchy, task_order, task_version, task_group_alias
)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
return results_agg, groups_agg, task_version
results_agg, groups_agg, versions = print_tasks(
task_hierarchy, task_order, versions, task_group_alias
)
for task in results_agg:
task_results = results_agg[task]
if "samples" in task_results:
task_results.pop("samples")
tab_string = ""
if "tab" in task_results:
tab = task_results.pop("tab")
tab_string = " " * tab + "- " if tab > 0 else ""
if task in task_group_alias:
task_alias = task_group_alias[task]
results_agg[task]["alias"] = tab_string + task_alias
else:
results_agg[task]["alias"] = tab_string + task
for group in groups_agg:
group_results = groups_agg[group]
if "samples" in group_results:
group_results.pop("samples")
tab_string = ""
if "tab" in group_results:
tab = group_results.pop("tab")
tab_string = " " * tab + "- " if tab > 0 else ""
if group in task_group_alias:
group_alias = task_group_alias[group]
groups_agg[group]["alias"] = tab_string + group_alias
else:
groups_agg[group]["alias"] = tab_string + group
results_dict = {
"results": dict(sorted(results.items())),
**(
{"aggregate": dict(sorted(aggregate.items()))}
if bool(aggregate)
else {}
),
"results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
}
......
from lm_eval.api.filter import FilterEnsemble
from . import selection
from . import extraction
from . import transformation
FILTER_REGISTRY = {
......@@ -9,6 +10,9 @@ FILTER_REGISTRY = {
"majority_vote": selection.MajorityVoteFilter,
"take_first_k": selection.TakeKFilter,
"remove_whitespace": extraction.WhitespaceFilter,
"lowercase": transformation.LowercaseFilter,
"uppercase": transformation.UppercaseFilter,
"map": transformation.MapFilter,
# TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
......
from lm_eval.api.filter import Filter
class LowercaseFilter(Filter):
def __init__(self) -> None:
pass
def apply(self, resps, docs):
def filter_set(inst):
return [resp.lower() for resp in inst]
return [filter_set(resp) for resp in resps]
class UppercaseFilter(Filter):
def __init__(self) -> None:
pass
def apply(self, resps, docs):
def filter_set(inst):
return [resp.upper() for resp in inst]
return [filter_set(resp) for resp in resps]
class MapFilter(Filter):
def __init__(self, mapping_dict: dict = {}, default_value=None) -> None:
"""
Initializes the MapFilter with a given mapping dictionary and default value.
Args:
- mapping_dict (dict): A dictionary containing the key-value mappings.
Default is an empty dictionary.
- default_value (Any): The value to be returned when a key is not found in the mapping_dict.
Default is None.
Example:
mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
"""
assert isinstance(
mapping_dict, dict
), "Provided mapping_dict is not a dictionary"
self.mapping_dict = mapping_dict
self.default_value = default_value
def apply(self, resps, docs):
def filter_set(inst):
return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
return [filter_set(resp) for resp in resps]
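A minimal usage sketch for the new `MapFilter` (the responses and mapping below are hypothetical, and the import path assumes the `lm_eval.filters.transformation` module added in this diff). It is registered as `"map"` in `FILTER_REGISTRY`.

```
from lm_eval.filters.transformation import MapFilter

# remap yes/no style responses onto integer labels; unseen responses fall back to -1
mapper = MapFilter({"yes": 1, "no": 0}, default_value=-1)

resps = [["yes", "no", "maybe"]]       # one document with three sampled responses
print(mapper.apply(resps, docs=None))  # [[1, 0, -1]]
```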
import logging
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
......@@ -2,9 +2,11 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from tqdm import tqdm
import time
from lm_eval.logger import eval_logger
from lm_eval import utils
from typing import List, Any, Tuple
eval_logger = utils.eval_logger
def anthropic_completion(
client, #: anthropic.Anthropic,
......@@ -138,7 +140,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def greedy_until(self, requests) -> List[str]:
def generate_until(self, requests) -> List[str]:
if not requests:
return []
......@@ -164,7 +166,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
)
res.append(response)
self.cache_hook.add_partial("greedy_until", request, response)
self.cache_hook.add_partial("generate_until", request, response)
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
......@@ -179,7 +181,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
......
......@@ -20,7 +20,7 @@ class DummyLM(LM):
return res
def greedy_until(self, requests):
def generate_until(self, requests):
res = []
for ctx, _ in requests:
......
......@@ -16,7 +16,6 @@ from pathlib import Path
import torch.nn.functional as F
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
......@@ -25,6 +24,8 @@ from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union
eval_logger = utils.eval_logger
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
......@@ -107,17 +108,20 @@ class HFLM(LM):
if not (parallelize or accelerator.num_processes > 1):
# use user-passed device
device_list = set(
["cuda", "cpu", "mps"]
["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+ ["mps", "mps:0"]
)
if device:
if device not in device_list:
device = int(device)
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
if device == "mps":
if device in ("mps", "mps:0") and "dev" not in torch.__version__:
eval_logger.info(
"MPS is still in beta and only supports float32; setting dtype to float32."
"MPS: Setting dtype to float32. To use float16 with MPS, please install a nightly build of "
"PyTorch: pip3 install --pre torch torchvision torchaudio --index-url "
"https://download.pytorch.org/whl/nightly/cpu"
)
else:
eval_logger.info("Device not specified")
......@@ -416,12 +420,13 @@ class HFLM(LM):
utils.clear_torch_cache()
return batch_size
def tok_encode(self, string: str, left_truncate_len=None):
def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None):
""" """
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
......@@ -505,7 +510,7 @@ class HFLM(LM):
self.tokenizer, stop, 1, context.shape[0]
)
return self.model.generate(
context,
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.eot_token_id,
......@@ -536,8 +541,12 @@ class HFLM(LM):
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
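# e.g. an ("Answer: ", "yes") pair is re-split here as ("Answer:", " yes") so that the
# trailing space is tokenized with the continuation (illustrative strings)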
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
whole_enc = self.tok_encode(context + continuation, add_special_tokens=False)
context_enc = self.tok_encode(context, add_special_tokens=False)
# whole_enc = self.tok_encode(context + continuation)
# context_enc = self.tok_encode(context, add_special_tokens=False)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
......@@ -613,6 +622,23 @@ class HFLM(LM):
return loglikelihoods
def _batch_scheduler(self, pos, n_reordered_requests):
sched = pos // int(len(n_reordered_requests) / self.batch_schedule)
if sched in self.batch_sizes:
return self.batch_sizes[sched]
if (len(self.batch_sizes) > 1) and (
self.batch_sizes[sched - 1] == self.max_batch_size
):
# if previous batch size is already maximal, skip recomputation
self.batch_sizes[sched] = self.max_batch_size
return self.batch_sizes[sched]
print(
f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
)
self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos)
print(f"Determined largest batch size: {self.batch_sizes[sched]}")
return self.batch_sizes[sched]
def _loglikelihood_tokens(
self, requests, disable_tqdm: bool = False, override_bs=None
):
......@@ -636,38 +662,22 @@ class HFLM(LM):
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
def _batch_scheduler(pos):
sched = pos // int(n_reordered_requests / self.batch_schedule)
if sched in self.batch_sizes:
return self.batch_sizes[sched]
if (len(self.batch_sizes) > 1) and (
self.batch_sizes[sched - 1] == self.max_batch_size
):
# if previous batch size is already maximal, skip recomputation
self.batch_sizes[sched] = self.max_batch_size
return self.batch_sizes[sched]
print(
f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
)
self.batch_sizes[sched] = self._detect_batch_size(
re_ord.get_reordered(), pos
)
print(f"Determined largest batch size: {self.batch_sizes[sched]}")
return self.batch_sizes[sched]
for chunk in utils.chunks(
tqdm(re_ord.get_reordered(), disable=(disable_tqdm or (self.rank != 0))),
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size
if self.batch_size != "auto"
else override_bs
if override_bs is not None
else 0,
fn=_batch_scheduler
fn=self._batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None,
):
)
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for chunk in chunks:
inps = []
cont_toks_list = []
inplens = []
......@@ -804,10 +814,13 @@ class HFLM(LM):
res.append(answer)
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def greedy_until(self, requests):
def generate_until(self, requests):
res = defaultdict(list)
re_ords = {}
......@@ -830,13 +843,26 @@ class HFLM(LM):
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
if self.batch_size == "auto":
# using rolling window with maximum context
print("Passed argument batch_size = auto. Detecting largest batch size")
batch_size = self._detect_batch_size()
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
for chunk in utils.chunks(
chunks = utils.chunks(
re_ord.get_reordered(),
self.batch_size,
):
n=self.batch_size
if self.batch_size != "auto"
else adaptive_batch_size
if adaptive_batch_size is not None
else 0,
fn=self._batch_scheduler
if self.batch_size == "auto" and not adaptive_batch_size
else None,
)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
......@@ -912,7 +938,7 @@ class HFLM(LM):
res[key].append(s)
self.cache_hook.add_partial(
"greedy_until", (context, gen_kwargs), s
"generate_until", (context, gen_kwargs), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
......
......@@ -58,7 +58,7 @@ class TextSynthLM(LM):
@property
def eot_token_id(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
@property
......@@ -72,20 +72,20 @@ class TextSynthLM(LM):
@property
def batch_size(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def tok_encode(self, string: str):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def tok_decode(self, tokens):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
......@@ -122,7 +122,7 @@ class TextSynthLM(LM):
"input tokenization support from TextSynth."
)
def greedy_until(self, requests):
def generate_until(self, requests):
if not requests:
return []
......@@ -146,7 +146,7 @@ class TextSynthLM(LM):
s = resp["text"]
res.append(s)
self.cache_hook.add_partial("greedy_until", (inp, request_args), s)
self.cache_hook.add_partial("generate_until", (inp, request_args), s)
else:
logger.error(
f"The following response does not contain generated `text`. "
......@@ -160,5 +160,5 @@ class TextSynthLM(LM):
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
# Isn't used because we override generate_until
raise NotImplementedError()
import os
import ast
from typing import Dict
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
# This allows us to access prompts by a "category_name:prompt_name" identifier.
PROMPT_REGISTRY: dict[str, dict[str, str]] = {
PROMPT_REGISTRY: Dict[str, Dict[str, str]] = {
"qa-basic": {
"question-newline-answer": "Question: {{question}}\nAnswer:",
"q-newline-a": "Q: {{question}}\nA:",
......@@ -44,6 +48,14 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None
raise ValueError(
f"{prompt_name} not in prompt list {prompts.all_template_names}"
)
elif ".yaml" in category_name:
import yaml
with open(category_name, "rb") as file:
prompt_yaml_file = yaml.full_load(file)
prompt_string = prompt_yaml_file["prompts"][prompt_name]
return PromptString(prompt_string)
else:
try:
return PROMPT_REGISTRY[category_name][prompt_name]
......@@ -54,15 +66,62 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None
)
def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwargs):
def load_prompt_list(
use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs
):
from promptsource.templates import DatasetTemplates
category_name, prompt_name = use_prompt.split(":")
if subset_name is None:
prompts = DatasetTemplates(dataset_name=dataset_name)
else:
prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name)
if category_name == "promptsource":
from promptsource.templates import DatasetTemplates
category_name, prompt_name = use_prompt.split(":")
prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
if subset_name is None:
prompts = DatasetTemplates(dataset_name=dataset_name)
else:
prompts = DatasetTemplates(
dataset_name=dataset_name, subset_name=subset_name
)
prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
elif ".yaml" in category_name:
import yaml
if yaml_path is not None:
category_name = os.path.realpath(os.path.join(yaml_path, category_name))
with open(category_name, "rb") as file:
prompt_yaml_file = yaml.full_load(file)
prompt_list = utils.pattern_match(
prompt_name, prompt_yaml_file["prompts"].keys()
)
# category_name, *prompt_name = use_prompt.split(":")
# TODO allow to multiple prompt naming
# if len(prompt_name) > 1:
# prompt_list = []
# for prompt in prompt_name:
# prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names))
# else:
# prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
return [":".join([category_name, prompt]) for prompt in prompt_list]
class PromptString:
def __init__(self, prompt_string):
self.prompt_string = prompt_string
def apply(self, doc):
doc_to_text = self.prompt_string["doc_to_text"]
doc_to_target = self.prompt_string["doc_to_target"]
# TODO need a way to process doc_to_choice
if "doc_to_choice" in self.prompt_string:
raise "Not yet implemented to accept doc_to_choice"
text_string = utils.apply_template(doc_to_text, doc)
target_string = utils.apply_template(doc_to_target, doc)
return [text_string, target_string]
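A sketch of the prompt-YAML structure these `.yaml` branches expect (the prompt name and templates below are hypothetical), shown as the Python object `yaml.full_load` would return, plus how `PromptString` consumes one entry.

```
# assuming the lm_eval.prompts module shown in this diff
from lm_eval.prompts import PromptString

prompt_yaml_file = {
    "prompts": {
        "qa_basic": {
            "doc_to_text": "Question: {{question}}\nAnswer:",
            "doc_to_target": "{{answer}}",
        }
    }
}

ps = PromptString(prompt_yaml_file["prompts"]["qa_basic"])
text, target = ps.apply({"question": "What is 2 + 2?", "answer": "4"})
```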
......@@ -16,7 +16,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [ ] QASPER
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
......@@ -36,7 +36,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [ ] MuTual
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
......@@ -59,6 +59,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
......
import os
import yaml
from typing import List, Union
from typing import List, Union, Dict
from lm_eval import utils
from lm_eval import prompts
from lm_eval.logger import eval_logger
from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
from lm_eval.api.registry import (
register_task,
......@@ -14,8 +13,23 @@ from lm_eval.api.registry import (
ALL_TASKS,
)
import logging
def register_configurable_task(config: dict[str, str]) -> int:
# import python tasks
from .squadv2.task import SQuAD2
from .scrolls.task import (
QuALITY,
NarrativeQA,
ContractNLI,
GovReport,
SummScreenFD,
QMSum,
)
eval_logger = utils.eval_logger
def register_configurable_task(config: Dict[str, str]) -> int:
SubClass = type(
config["task"] + "ConfigurableTask",
(ConfigurableTask,),
......@@ -27,7 +41,9 @@ def register_configurable_task(config: dict[str, str]) -> int:
register_task(task_name)(SubClass)
if "group" in config:
if type(config["group"]) == str:
if config["group"] == config["task"]:
raise ValueError("task and group name cannot be the same")
elif type(config["group"]) == str:
group_name = [config["group"]]
else:
group_name = config["group"]
......@@ -38,13 +54,46 @@ def register_configurable_task(config: dict[str, str]) -> int:
return 0
def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -> int:
group = config["group"]
all_task_list = config["task"]
config_list = [task for task in all_task_list if type(task) != str]
task_list = [task for task in all_task_list if type(task) == str]
for task_config in config_list:
task_config = utils.load_yaml_config(yaml_path, task_config)
var_configs = check_prompt_config(
{
**task_config,
**{"group": group},
},
yaml_path=os.path.dirname(yaml_path),
)
for config in var_configs:
register_configurable_task(config)
task_names = utils.pattern_match(task_list, ALL_TASKS)
for task in task_names:
if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
if group in GROUP_REGISTRY:
GROUP_REGISTRY[group].append(task)
else:
GROUP_REGISTRY[group] = [task]
ALL_TASKS.add(group)
return 0
def check_prompt_config(
config: Dict[str, str], yaml_path: str = None
) -> List[Dict[str, str]]:
all_configs = []
if "use_prompt" in config:
prompt_list = prompts.load_prompt_list(
use_prompt=config["use_prompt"],
dataset_name=config["dataset_path"],
subset_name=config["dataset_name"] if "dataset_name" in config else None,
yaml_path=yaml_path,
)
for idx, prompt_variation in enumerate(prompt_list):
all_configs.append(
......@@ -57,11 +106,13 @@ def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
config["task"]
if "task" in config
else get_task_name_from_config(config),
prompt_variation,
prompt_variation.split("/")[-1]
if ".yaml" in prompt_variation
else prompt_variation,
]
)
},
**{"output_type": "greedy_until"},
**{"output_type": "generate_until"},
}
)
else:
......@@ -69,39 +120,71 @@ def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
return all_configs
def get_task_name_from_config(task_config: dict[str, str]) -> str:
def get_task_name_from_config(task_config: Dict[str, str]) -> str:
if "dataset_name" in task_config:
return "{dataset_path}_{dataset_name}".format(**task_config)
else:
return "{dataset_path}".format(**task_config)
def include_task_folder(task_dir: str) -> None:
def include_task_folder(task_dir: str, register_task: bool = True) -> None:
"""
Walks `task_dir` and registers every YAML config found there: individual tasks when `register_task` is True, benchmark groups otherwise.
"""
for root, subdirs, file_list in os.walk(task_dir):
if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
try:
config = utils.load_yaml_config(yaml_path)
all_configs = check_prompt_config(config)
for config in all_configs:
register_configurable_task(config)
except Exception as error:
eval_logger.warning(
"Failed to load config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
f" Error: {error}"
)
# if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
try:
config = utils.load_yaml_config(yaml_path)
if "task" not in config:
continue
all_configs = check_prompt_config(
config, yaml_path=os.path.dirname(yaml_path)
)
for config in all_configs:
if register_task:
if type(config["task"]) == str:
register_configurable_task(config)
else:
if type(config["task"]) == list:
register_configurable_group(config, yaml_path)
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_task_folder(task_dir)
# Log this silently and show it only when
# the user defines the appropriate verbosity.
except ModuleNotFoundError as e:
eval_logger.debug(
f"{yaml_path}: {e}. Config will not be added to registry."
)
except Exception as error:
import traceback
eval_logger.debug(
"Failed to load config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
f" Error: {error}\n"
f" Traceback: {traceback.format_exc()}"
)
return 0
def include_path(task_dir):
include_task_folder(task_dir)
# Register Benchmarks after all tasks have been added
include_task_folder(task_dir, register_task=False)
return 0
def initialize_tasks(verbosity="INFO"):
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_path(task_dir)
def get_task(task_name, config):
......@@ -128,27 +211,36 @@ def get_task_name_from_object(task_object):
# TODO: pass num_fewshot and other cmdline overrides in a better way
def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
config = {**kwargs}
task_name_from_registry_dict = {}
task_name_from_config_dict = {}
task_name_from_object_dict = {}
if type(task_name_list) != list:
task_name_list = [task_name_list]
for task_element in task_name_list:
if isinstance(task_element, str):
if task_element in GROUP_REGISTRY:
group_name = task_element
for task_name in GROUP_REGISTRY[task_element]:
if task_name not in task_name_from_registry_dict:
task_obj = get_task_dict(task_name)
if task_name in task_obj.keys():
task_dict = {
task_name: (group_name, task_obj[task_name]),
}
else:
task_dict = {
task_name: (group_name, None),
**task_obj,
}
task_name_from_registry_dict = {
**task_name_from_registry_dict,
task_name: (
group_name,
get_task(task_name=task_name, config=config),
),
**task_dict,
}
else:
task_name = task_element
......@@ -168,7 +260,6 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
}
elif isinstance(task_element, Task):
task_name_from_object_dict = {
**task_name_from_object_dict,
get_task_name_from_object(task_element): task_element,
......
# ASDiv
### Paper
Title: `ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers`
Abstract: https://arxiv.org/abs/2106.15772
ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
patterns and problem types) English math word problem (MWP) corpus for evaluating
the capability of various MWP solvers. Existing MWP corpora for studying AI progress
remain limited either in language usage patterns or in problem types. We thus present
a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
types taught in elementary school. Each MWP is annotated with its problem type and grade
level (for indicating the level of difficulty).
NOTE: We currently ignore formulas for answer generation.
Homepage: https://github.com/chaochun/nlu-asdiv-dataset
### Citation
```
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `asdiv`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: babi
dataset_path: Muennighoff/babi
dataset_name: null
output_type: greedy_until
output_type: generate_until
training_split: train
validation_split: valid
test_split: test
......
# BigBenchHard
## Paper
Title: `Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them`
Abstract: https://arxiv.org/abs/2210.09261
A suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH).
These are the task for which prior language model evaluations did not outperform
the average human-rater.
Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard
## Citation
```
@article{suzgun2022challenging,
title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},
author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},
journal={arXiv preprint arXiv:2210.09261},
year={2022}
}
```
### Groups and Tasks
#### Groups
- `bbh_flan_zeroshot`
- `bbh_flan_fewshot`
- `bbh_flan_cot_fewshot`
- `bbh_flan_cot_zeroshot`
#### Tasks
- ...
### Checklist
- [x] Is in Eval-harness v1.0 ?
- [ ] Has been checked for regression from v1.0?
- [ ] Has been checked for equivalence with original paper methodology?
- [ ] "Main" checked variant clearly denoted?
### Variant Wishlist
- [ ] Variant with Calculator (see https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py for example implementation)
- [ ] Using Verifiers
- [ ] Majority voting "without CoT"
"""
Take in a base YAML and generate a YAML config for every BBH subtask from it
"""
import os
import re
import yaml
import requests
import argparse
import datasets
from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="flan_zeroshot")
parser.add_argument("--cot", default=False)
parser.add_argument("--fewshot", default=False)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
base_doc_to_text = "Q: {{input}}\nA:"
answer_regex = re.compile("(?<=answer is )(.*)(?=.)")
dataset_path = "lukaemon/bbh"
for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
resp = requests.get(
f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt"
).content.decode("utf-8")
prompt = resp.split("\n-----\n")[-1]
description, *few_shot = prompt.split("\n\nQ:")
prefix_doc_to_text = ""
if args.fewshot:
if args.cot:
prefix_doc_to_text = " ".join(few_shot)
else:
for shot in few_shot:
shot = "Q:" + shot
try:
answer = answer_regex.search(shot)[0]
except Exception:
print("task", task)
print(shot)
example = shot.split("Let's think step by step.")[0]
prefix_doc_to_text += f"{example}{answer}\n\n"
doc_to_text = prefix_doc_to_text + base_doc_to_text
if args.cot:
doc_to_text = doc_to_text + " Let's think step by step.\n"
yaml_dict = {
"include": base_yaml_name,
"task": f"bbh_{args.task_prefix}_{task}",
"dataset_name": task,
"description": description + "\n\n",
"doc_to_text": doc_to_text,
}
file_save_path = args.save_prefix_path + f"/{task}.yaml"
eval_logger.info(f"Saving yaml for subset {task} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
group: bbh_flan_cot_fewshot
dataset_path: lukaemon/bbh
output_type: generate_until
test_split: test
doc_to_target: "{{target}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "(?<=the answer is )(.*)(?=.)"
- function: "take_first"
"dataset_name": "boolean_expressions"
"description": "Evaluate the result of a random Boolean expression.\n\n"
"doc_to_text": " not ( ( not not True ) ) is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = not ( ( not not True ) ) = not ( ( A ) )\" where \"A = not not True\".\nLet's evaluate A: A = not not True = not (not True) = not False = True.\nPlugging in A, we get: Z = not ( ( A ) ) = not ( ( True ) ) = not True = False. So the answer is False. True and False and not True and True is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = True and False and not True and True = A and B\" where \"A = True and False\" and \"B = not True and True\".\nLet's evaluate A: A = True and False = False.\nLet's evaluate B: B = not True and True = not (True and True) = not (True) = False.\nPlugging in A and B, we get: Z = A and B = False and False = False. So the answer is False. not not ( not ( False ) ) is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = not not ( not ( False ) ) = not not ( A )\" where \"A = not ( False )\".\nLet's evaluate A: A = not ( False ) = not False = True.\nPlugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.Q: {{input}}\nA: Let's think step by step.\n"
"include": "_flan_cot_fewshot_template_yaml"
"task": "bbh_flan_cot_fewshot_boolean_expressions"
"dataset_name": "causal_judgement"
"description": "Answer questions about causal attribution.\n\n"
"doc_to_text": " How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that \"Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild.\" A typical person would assume that this passage suggests that Frank T. had no intention of shooting and injuring someone and that the bullet accidentally hit the neighbor's body; therefore, we conclude that Frank T. did not intentionally hit his neighbor. So the answer is No. How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that the boss ordered them both to arrive at the meeting room at the same time and that the motion detector was set up to be triggered if at least one person appeared in the room at the same time.\" A typical person would assume that the person probably meant to say the detector was set up to be triggered if \"both persons\" appeared in the room at the same time, not at least one person, since otherwise the phrase \"at the same time\" would not make much sense in that sentence. Because the motion detector went off, a typical person would therefore come to the conclusion that both Suzy and Billy triggered the motion detector to go off; hence, Billy did indeed cause the motion detector to go off. So the answer is Yes. How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot. If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. 
Did George hit the low point region intentionally?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that \"He aims the dart at the low point region.\" A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.Q: {{input}}\nA: Let's think step by step.\n"
"include": "_flan_cot_fewshot_template_yaml"
"task": "bbh_flan_cot_fewshot_causal_judgement"