"src/vscode:/vscode.git/clone" did not exist on "ab6cd9d3bc9bfe31e1ec98e09c7da607b7426b6a"
Commit f7f298ee authored by lintangsutawika

removed unused files for now

parents c0d5a660 12bc8fce
@@ -4,7 +4,7 @@ The `lm-evaluation-harness` is meant to be an extensible and flexible framework
These YAML configuration files, along with the current codebase commit hash, are intended to be shareable: providing the YAML config enables one researcher to precisely replicate another's evaluation setup, in cases where the prompt or setup differs from the standard `lm-eval` task implementations.
While adding a standard evaluation task on a new dataset can be occasionally as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users.
While adding a standard evaluation task on a new dataset can be occasionally as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups also exist. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users.
If your intended task relies on features beyond what is described in this guide, we'd love to hear about it! Feel free to open an issue describing the scenario on GitHub, create a PR to the project with a proposed implementation, or ask in the `#lm-thunderdome` channel on the EleutherAI Discord.
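
To make the documentation above concrete, here is a minimal sketch of how such a shareable YAML config becomes a task object, using the `utils.load_yaml_config` and `ConfigurableTask` calls that appear later in this diff; the file name `my_task.yaml` and the import path are assumptions, not part of the commit.

```python
# Minimal sketch (not part of this commit): turning a shareable YAML config
# into a task object. "my_task.yaml" is a hypothetical file; the import path
# for ConfigurableTask is assumed.
from lm_eval import utils
from lm_eval.api.task import ConfigurableTask

config = utils.load_yaml_config("my_task.yaml")  # parse the YAML into a dict
task = ConfigurableTask(config=config)           # same constructor used below in tasks/__init__.py
```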
@@ -380,4 +380,4 @@ task:
ignore_punctuation: true
```
Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/benchmarks/`
Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/tasks/benchmarks/`
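
The relocation to `lm_eval/tasks/benchmarks/` matters because the `TaskManager` introduced by this commit builds its index by walking the `lm_eval/tasks/` directory (plus any `--include_path`), so only YAML files under that tree are discovered. A small sketch of checking that a benchmark group is visible; the group name is a placeholder.

```python
# Sketch, assuming this commit's TaskManager API; "my_benchmark" is a
# placeholder group name, not a real benchmark in the repo.
from lm_eval.tasks import TaskManager

tm = TaskManager(verbosity="INFO", include_path=None)

# Any group YAML placed under lm_eval/tasks/ (benchmarks included) shows up
# in the index and can be passed to --tasks like an ordinary task name.
print("my_benchmark" in tm.all_tasks())
```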
@@ -10,8 +10,7 @@ from typing import Union
import numpy as np
from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.tasks import include_path, initialize_tasks
from lm_eval.tasks import TaskManager
from lm_eval.utils import make_table
@@ -156,44 +155,46 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
initialize_tasks(args.verbosity)
# initialize_tasks(args.verbosity)
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
if args.limit:
eval_logger.warning(
" --limit SHOULD ONLY BE USED FOR TESTING."
"REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_path(args.include_path)
if args.tasks is None:
task_names = ALL_TASKS
eval_logger.error("Need to specify task to evaluate.")
sys.exit()
elif args.tasks == "list":
eval_logger.info(
"Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS)))
"Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks()))
)
sys.exit()
else:
if os.path.isdir(args.tasks):
import glob
task_names = []
loaded_task_list = []
yaml_path = os.path.join(args.tasks, "*.yaml")
for yaml_file in glob.glob(yaml_path):
config = utils.load_yaml_config(yaml_file)
task_names.append(config)
loaded_task_list.append(config)
else:
tasks_list = args.tasks.split(",")
task_names = utils.pattern_match(tasks_list, ALL_TASKS)
for task in [task for task in tasks_list if task not in task_names]:
input_task_list = args.tasks.split(",")
loaded_task_list = utils.pattern_match(
input_task_list, task_manager.all_tasks()
)
for task in [
task for task in input_task_list if task not in loaded_task_list
]:
if os.path.isfile(task):
config = utils.load_yaml_config(task)
task_names.append(config)
loaded_task_list.append(config)
task_missing = [
task
for task in tasks_list
if task not in task_names and "*" not in task
for task in input_task_list
if task not in loaded_task_list and "*" not in task
] # we don't want errors if a wildcard ("*") task name was used
if task_missing:
@@ -226,12 +227,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
elif args.log_samples and not args.output_path:
assert args.output_path, "Specify --output_path"
eval_logger.info(f"Selected Tasks: {task_names}")
eval_logger.info(f"Selected Tasks: {loaded_task_list}")
eval_logger.info("Loading selected tasks...")
all_tasks = task_manager.load_task_or_group(loaded_task_list)
results = evaluator.simple_evaluate(
model=args.model,
model_args=args.model_args,
tasks=task_names,
tasks=all_tasks,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
max_batch_size=args.max_batch_size,
......
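
The `__main__.py` hunks above replace the module-level `ALL_TASKS`/`initialize_tasks` machinery with a `TaskManager` instance, and `evaluator.simple_evaluate` now receives the loaded task objects rather than task names. A condensed sketch of the new flow; the model arguments and task names are placeholders.

```python
# Condensed sketch of the new CLI flow (not a verbatim excerpt); model_args
# and the requested task names are placeholders.
from lm_eval import evaluator, utils
from lm_eval.tasks import TaskManager

task_manager = TaskManager(verbosity="INFO", include_path=None)

requested = "lambada_openai,hellaswag".split(",")
matched = utils.pattern_match(requested, task_manager.all_tasks())

# Instantiate the matched tasks/groups; this replaces get_task_dict().
all_tasks = task_manager.load_task_or_group(matched)

results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model
    tasks=all_tasks,  # task objects, not names, after this change
    batch_size=1,
)
```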
@@ -111,6 +111,8 @@ class TaskConfig(dict):
}
# TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor?
# if self.dataset_kwargs is None:
# self.dataset_kwargs = {"trust_remote_code": True}
def __getitem__(self, item):
return getattr(self, item)
@@ -118,7 +120,7 @@ class TaskConfig(dict):
def __setitem__(self, item, value):
return setattr(self, item, value)
def to_dict(self):
def to_dict(self, keep_callable=False):
"""dumps the current config as a dictionary object, as a printable format.
null fields will not be printed.
Used for dumping results alongside full task configuration
@@ -134,8 +136,11 @@ class TaskConfig(dict):
if v is None:
cfg_dict.pop(k)
elif isinstance(v, Callable):
# TODO: this should handle Promptsource template objects as a separate case?
cfg_dict[k] = str(v)
if keep_callable:
cfg_dict[k] = v
else:
# TODO: this should handle Promptsource template objects as a separate case?
cfg_dict[k] = str(v)
return cfg_dict
......
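
The new `keep_callable` flag above controls whether callable config fields (for example `!function` hooks) are kept as Python objects or stringified when a config is dumped. A small sketch of the intended behavior; the import path and the `process_docs` example value are assumptions.

```python
# Sketch of keep_callable, assuming TaskConfig is importable from
# lm_eval.api.task; the lambda stands in for a !function hook.
from lm_eval.api.task import TaskConfig

cfg = TaskConfig(task="demo_task", process_docs=lambda docs: docs)

print(cfg.to_dict()["process_docs"])                    # stringified, e.g. "<function <lambda> ...>"
print(cfg.to_dict(keep_callable=True)["process_docs"])  # the callable itself
```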
@@ -38,6 +38,7 @@ def simple_evaluate(
write_out: bool = False,
log_samples: bool = True,
gen_kwargs: str = None,
weight_by_size: bool = False,
):
"""Instantiate and evaluate a model on a list of tasks.
@@ -46,8 +47,8 @@
:param model_args: Optional[str]
String arguments for each model class, see LM.create_from_arg_string.
Ignored if `model` argument is a LM object.
:param tasks: list[Union[str, Task]]
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param tasks: list[Task]
List of Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int or str, optional
@@ -119,7 +120,7 @@ def simple_evaluate(
+ ".db",
)
task_dict = lm_eval.tasks.get_task_dict(tasks)
task_dict = tasks
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if type(task_obj) == tuple:
@@ -155,6 +156,7 @@ def simple_evaluate(
decontamination_ngrams_path=decontamination_ngrams_path,
write_out=write_out,
log_samples=log_samples,
weight_by_size=weight_by_size,
)
if lm.rank == 0:
@@ -192,6 +194,7 @@ def evaluate(
decontamination_ngrams_path=None,
write_out: bool = False,
log_samples: bool = True,
weight_by_size: bool = False,
):
"""Instantiate and evaluate a model on a list of tasks.
@@ -479,12 +482,14 @@ def evaluate(
if "alias" in metrics:
metrics.pop("alias")
current_size = metrics.pop("samples")
# TODO: There should be a way for users
# to toggle between weighted and
# unweighted averaging
# For unweighted averaging, use:
# current_size = 1
if weight_by_size:
current_size = metrics.pop("samples")
else:
metrics.pop("samples")
current_size = 1
# TODO: Tasks like brier score for individual
# tasks have no stderr since the score is
......
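
The `weight_by_size` flag threaded through `simple_evaluate` and `evaluate` above decides how subtask metrics roll up into a group score: weighted by each subtask's sample count, or as a plain unweighted mean. A standalone illustration of the difference (numbers invented):

```python
# Standalone illustration (not lm_eval code); accuracies and sample counts
# are invented.
subtasks = [
    {"acc": 0.80, "samples": 1000},
    {"acc": 0.50, "samples": 100},
]

def group_mean(subtasks, weight_by_size):
    sizes = [t["samples"] if weight_by_size else 1 for t in subtasks]
    total = sum(size * t["acc"] for size, t in zip(sizes, subtasks))
    return total / sum(sizes)

print(group_mean(subtasks, weight_by_size=True))   # ~0.773: the large subtask dominates
print(group_mean(subtasks, weight_by_size=False))  # 0.65: each subtask counts equally
```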
import os
import abc
import yaml
import collections
from functools import partial, lru_cache
from typing import List, Union, Dict
from lm_eval import utils
@@ -10,7 +14,6 @@ from lm_eval.api.registry import (
register_group,
TASK_REGISTRY,
GROUP_REGISTRY,
ALL_TASKS,
)
import logging
@@ -29,272 +32,203 @@ from .scrolls.task import (
eval_logger = utils.eval_logger
def register_configurable_task(config: Dict[str, str]) -> int:
SubClass = type(
config["task"] + "ConfigurableTask",
(ConfigurableTask,),
{"CONFIG": TaskConfig(**config)},
)
class TaskManager(abc.ABC):
if "task" in config:
task_name = "{}".format(config["task"])
register_task(task_name)(SubClass)
def __init__(
self,
verbosity="INFO",
include_path=None
) -> None:
if "group" in config:
if config["group"] == config["task"]:
raise ValueError("task and group name cannot be the same")
elif type(config["group"]) == str:
group_name = [config["group"]]
else:
group_name = config["group"]
for group in group_name:
register_group(group)(SubClass)
return 0
def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -> int:
group = config["group"]
all_task_list = config["task"]
config_list = [task for task in all_task_list if type(task) != str]
task_list = [task for task in all_task_list if type(task) == str]
for task_config in config_list:
base_config = {}
task_name_config = {}
if "task" in task_config:
task_name = task_config["task"]
if task_name in ALL_TASKS:
task_obj = get_task_dict(task_name)[task_name]
if type(task_obj) == tuple:
_, task_obj = task_obj
if task_obj is not None:
base_config = task_obj._config.to_dict()
task_name_config["task"] = f"{group}_{task_name}"
task_config = utils.load_yaml_config(yaml_path, task_config)
var_configs = check_prompt_config(
{
**base_config,
**task_config,
**{"group": group},
**task_name_config,
},
yaml_path=os.path.dirname(yaml_path),
)
for config in var_configs:
register_configurable_task(config)
task_names = utils.pattern_match(task_list, ALL_TASKS)
for task in task_names:
if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
if group in GROUP_REGISTRY:
GROUP_REGISTRY[group].append(task)
else:
GROUP_REGISTRY[group] = [task]
ALL_TASKS.add(group)
return 0
def check_prompt_config(
config: Dict[str, str], yaml_path: str = None
) -> List[Dict[str, str]]:
all_configs = []
if "use_prompt" in config:
prompt_list = prompts.load_prompt_list(
use_prompt=config["use_prompt"],
dataset_name=config["dataset_path"],
subset_name=config["dataset_name"] if "dataset_name" in config else None,
yaml_path=yaml_path,
)
for idx, prompt_variation in enumerate(prompt_list):
all_configs.append(
{
**config,
**{"use_prompt": prompt_variation},
**{
"task": "_".join(
[
config["task"]
if "task" in config
else get_task_name_from_config(config),
prompt_variation.split("/")[-1]
if ".yaml" in prompt_variation
else prompt_variation,
]
)
},
**{"output_type": "generate_until"},
}
self.verbosity = verbosity
self.include_path = include_path
self.logger = eval_logger.setLevel(getattr(logging, f"{verbosity}"))
self.ALL_TASKS = self.initialize_tasks(
include_path=include_path
)
else:
all_configs.append(config)
return all_configs
def get_task_name_from_config(task_config: Dict[str, str]) -> str:
if "dataset_name" in task_config:
return "{dataset_path}_{dataset_name}".format(**task_config)
else:
return "{dataset_path}".format(**task_config)
def include_task_folder(task_dir: str, register_task: bool = True) -> None:
"""
Calling this function
"""
# Track whether any tasks failed during loading
import_fail = False
for root, subdirs, file_list in os.walk(task_dir):
# if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
try:
config = utils.load_yaml_config(yaml_path)
if "task" not in config:
continue
all_configs = check_prompt_config(
config, yaml_path=os.path.dirname(yaml_path)
)
for config in all_configs:
if register_task:
if type(config["task"]) == str:
register_configurable_task(config)
else:
if type(config["task"]) == list:
register_configurable_group(config, yaml_path)
# Log this silently and show it only when
# the user defines the appropriate verbosity.
except (ImportError, ModuleNotFoundError) as e:
import_fail = True
eval_logger.debug(
f"{yaml_path}: {e}. Config will not be added to registry."
)
except Exception as error:
import traceback
eval_logger.warning(
"Unexpected error loading config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
f" Error: {error}\n"
f" Traceback: {traceback.format_exc()}"
)
if import_fail:
eval_logger.warning(
"Some tasks could not be loaded due to missing dependencies."
" Run with `--verbosity DEBUG` for full details."
)
return 0
def include_path(task_dir):
include_task_folder(task_dir)
# Register Benchmarks after all tasks have been added
include_task_folder(task_dir, register_task=False)
return 0
def initialize_tasks(verbosity="INFO"):
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_path(task_dir)
def get_task(task_name, config):
try:
return TASK_REGISTRY[task_name](config=config)
except KeyError:
eval_logger.info("Available tasks:")
eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
raise KeyError(f"Missing task {task_name}")
def get_task_name_from_object(task_object):
for name, class_ in TASK_REGISTRY.items():
if class_ is task_object:
return name
# TODO: scrap this
# this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
return (
task_object.EVAL_HARNESS_NAME
if hasattr(task_object, "EVAL_HARNESS_NAME")
else type(task_object).__name__
)
# TODO: pass num_fewshot and other cmdline overrides in a better way
def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
config = {**kwargs}
task_name_from_registry_dict = {}
task_name_from_config_dict = {}
task_name_from_object_dict = {}
if type(task_name_list) != list:
task_name_list = [task_name_list]
for task_element in task_name_list:
if isinstance(task_element, str):
if task_element in GROUP_REGISTRY:
group_name = task_element
for task_name in GROUP_REGISTRY[task_element]:
if task_name not in task_name_from_registry_dict:
task_obj = get_task_dict(task_name)
if task_name in task_obj.keys():
task_dict = {
task_name: (group_name, task_obj[task_name]),
}
else:
task_dict = {
task_name: (group_name, None),
**task_obj,
}
task_name_from_registry_dict = {
**task_name_from_registry_dict,
**task_dict,
def initialize_tasks(self, include_path=None):
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
if include_path is not None:
if isinstance(include_path, str):
include_path = [include_path]
all_paths.extend(include_path)
ALL_TASKS = {}
for task_dir in all_paths:
tasks = self._get_task_and_group(task_dir)
ALL_TASKS = {**tasks, **ALL_TASKS}
return ALL_TASKS
def all_tasks(self):
return sorted(list(self.ALL_TASKS.keys()))
def _name_is_registered(self, name):
if name in self.ALL_TASKS:
return True
return False
def _name_is_task(self, name):
if self.ALL_TASKS[name]["type"] == "task":
return True
return False
def _config_is_task(self, config):
if list(config.keys()) == ["group", "task"]:
return False
return True
def _get_yaml_path(self, name):
assert name in self.ALL_TASKS
return self.ALL_TASKS[name]["yaml_path"]
def _get_config(self, name):
assert name in self.ALL_TASKS
yaml_path = self._get_yaml_path(name)
return utils.load_yaml_config(yaml_path)
def _get_tasklist(self, name):
assert self._name_is_task(name) == False
return self.ALL_TASKS[name]["task"]
@lru_cache(None)
def _load_individual_task_or_group(self, name_or_config: Union[str, dict] = None, parent_name: str = None) -> ConfigurableTask:
def load_task(config, task, group=None):
task_object = ConfigurableTask(config=config)
if group is not None:
task_object = (group, task_object)
return {task: task_object}
if isinstance(name_or_config, str):
if self._name_is_task(name_or_config):
task_config = self._get_config(name_or_config)
return load_task(task_config, task=name_or_config, group=parent_name)
else:
group_name = name_or_config
subtask_list = self._get_tasklist(name_or_config)
if subtask_list == -1:
subtask_list = self._get_config(name_or_config)["task"]
elif isinstance(name_or_config, dict):
if self._config_is_task(name_or_config):
task_name = name_or_config["task"]
if self._name_is_registered(task_name):
base_task_config = self._get_config(task_name)
task_config={
**base_task_config,
**name_or_config,
}
else:
task_config = name_or_config
return load_task(task_config, task=name_or_config, group=parent_name)
else:
task_name = task_element
if task_name not in task_name_from_registry_dict:
task_name_from_registry_dict = {
**task_name_from_registry_dict,
task_name: get_task(task_name=task_element, config=config),
}
elif isinstance(task_element, dict):
task_element.update(config)
task_name_from_config_dict = {
**task_name_from_config_dict,
get_task_name_from_config(task_element): ConfigurableTask(
config=task_element
),
}
elif isinstance(task_element, Task):
task_name_from_object_dict = {
**task_name_from_object_dict,
get_task_name_from_object(task_element): task_element,
}
assert set(task_name_from_registry_dict.keys()).isdisjoint(
set(task_name_from_object_dict.keys())
)
return {
**task_name_from_registry_dict,
**task_name_from_config_dict,
**task_name_from_object_dict,
}
group_name = name_or_config["group"]
subtask_list = name_or_config["task"]
if self._get_yaml_path(group_name) == -1:
all_subtasks = {group_name: (parent_name, None)}
else:
all_subtasks = {}
fn = partial(self._load_individual_task_or_group, parent_name=group_name)
all_subtasks = {**all_subtasks, **dict(collections.ChainMap(*map(fn, subtask_list)))}
return all_subtasks
def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
if isinstance(task_list, str):
task_list = [task_list]
all_loaded_tasks = dict(
collections.ChainMap(
*map(
self._load_individual_task_or_group,
task_list
)
)
)
return all_loaded_tasks
def _get_task_and_group(self, task_dir: str):
tasks_and_groups = collections.defaultdict()
for root, _, file_list in os.walk(task_dir):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
config = utils.simple_load_yaml_config(yaml_path)
if list(config.keys()) == ["group", "task"]:
# This is a group config
tasks_and_groups[config["group"]] = {
"type": "group",
"task": -1, # This signals that
# we don't need to know
# the task list for indexing
# as it can be loaded
# when called.
"yaml_path": yaml_path,
}
else:
# This is a task config
task = config["task"]
tasks_and_groups[task] = {
"type": "task",
"yaml_path": yaml_path,
}
if "group" in config:
groups = config["group"]
if isinstance(config["group"], str):
groups = [groups]
for group in groups:
if group not in tasks_and_groups:
tasks_and_groups[group] = {
"type": "group",
"task": [task],
"yaml_path": -1,
}
else:
tasks_and_groups[group]["task"].append(task)
return tasks_and_groups
# def check_prompt_config(
# config: Dict[str, str], yaml_path: str = None
# ) -> List[Dict[str, str]]:
# all_configs = []
# if "use_prompt" in config:
# prompt_list = prompts.load_prompt_list(
# use_prompt=config["use_prompt"],
# dataset_name=config["dataset_path"],
# subset_name=config["dataset_name"] if "dataset_name" in config else None,
# yaml_path=yaml_path,
# )
# for idx, prompt_variation in enumerate(prompt_list):
# all_configs.append(
# {
# **config,
# **{"use_prompt": prompt_variation},
# **{
# "task": "_".join(
# [
# config["task"]
# if "task" in config
# else get_task_name_from_config(config),
# prompt_variation.split("/")[-1]
# if ".yaml" in prompt_variation
# else prompt_variation,
# ]
# )
# },
# **{"output_type": "generate_until"},
# }
# )
# else:
# all_configs.append(config)
# return all_configs
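
Taken together, the new `TaskManager` above indexes every YAML under `lm_eval/tasks/` (plus `include_path`) at construction time and only instantiates `ConfigurableTask` objects when `load_task_or_group` is called. A short usage sketch; the task and group names are illustrative and must exist in the index.

```python
# Usage sketch for the TaskManager defined above; "boolq" and "flan_held_in"
# are illustrative names that need to exist in the index.
from lm_eval.tasks import TaskManager

tm = TaskManager(verbosity="INFO", include_path=None)
print(len(tm.all_tasks()))  # every indexed task and group name, sorted

loaded = tm.load_task_or_group(["boolq", "flan_held_in"])
for name, obj in loaded.items():
    # Group members arrive as (group_name, ConfigurableTask) tuples,
    # top-level tasks as plain ConfigurableTask instances.
    print(name, type(obj))
```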
group: flan_anli
task:
- include: yaml_templates/held_in_template_yaml
task: r1
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml
task: r2
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml
task: r3
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3
group: flan_arc
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
group: flan_boolq
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: prompt_templates/boolq.yaml:*
validation_split: validation
group: flan_rte
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: prompt_templates/rte.yaml:*
validation_split: validation
group: flan_cot
task:
- include: yaml_templates/cot_template_yaml
dataset_path: gsm8k
dataset_name: boolq
use_prompt: promptsource:*
validation_split: validation
- include: yaml_templates/cot_template_yaml
dataset_path: EleutherAI/asdiv
use_prompt: promptsource:*
validation_split: validation
group: flan_held_in
task:
- include: yaml_templates/held_in_template_yaml
task: r1
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml
task: r2
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml
task: r3
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: prompt_templates/boolq.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: prompt_templates/rte.yaml:*
validation_split: validation
group: flan_held_out
task:
# BBH
- bbh_zeroshot
- bbh_fewshot
- bbh_cot_fewshot
- bbh_cot_zeroshot
# MMLU
- mmlu
- mmlu_flan_n_shot_generative
- mmlu_flan_n_shot_loglikelihood
- mmlu_flan_cot_zeroshot
- mmlu_flan_cot_fewshot
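
A file like `flan_held_out` above has exactly the keys `group` and `task`, which is how `_get_task_and_group` in the new `TaskManager` tells group configs apart from task configs; such groups are indexed with `task: -1` so the member list is only re-read from the YAML when the group is actually loaded. A standalone mirror of that check (not lm_eval code):

```python
# Standalone mirror of the group-vs-task detection shown in
# TaskManager._get_task_and_group; the YAML text is abbreviated.
import yaml

text = """
group: flan_held_out
task:
  - bbh_zeroshot
  - mmlu
"""
config = yaml.safe_load(text)

if list(config.keys()) == ["group", "task"]:
    # Group config: indexed lazily; the member list is loaded on demand.
    entry = {"type": "group", "task": -1, "yaml_path": "<path/to/yaml>"}
else:
    entry = {"type": "task", "yaml_path": "<path/to/yaml>"}
print(entry)
```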
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-1":
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-2":
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-3":
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-4":
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-5":
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-6":
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-7":
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-8":
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-1":
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-2":
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-3":
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-4":
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-5":
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-6":
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-1":
doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-2":
doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-3":
doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-4":
doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-5":
doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-6":
doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-7":
doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-8":
# doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-9":
doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-1":
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-2":
doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-3":
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-4":
doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-5":
doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-6":
doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-7":
doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-8":
doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
group: flan-cot
output_type: generate_until
validation_split: validation
doc_to_target: "{{answer}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
generation_kwargs:
until:
- "\n\n"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first"
metadata:
version: 1.0
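
The `filter_list` in the `flan-cot` config above post-processes model generations: the `regex` filter captures the number following "The answer is", and `take_first` keeps the first match. A quick standalone check of what the pattern captures; the example generation is invented.

```python
# Standalone check of the regex_pattern above; the generation is invented.
import re

pattern = r"The answer is (\-?[0-9\.\,]+)"
generation = "She bought 3 + 4 = 7 apples. The answer is 7.\n\n"

match = re.search(pattern, generation)
print(match.group(1))  # "7." -- the character class also captures trailing punctuation
```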
group: grouptest
task:
- boolq
- group: arc_stuff
task:
- arc_challenge
- glue
- task: arc_easy
metric_list:
- metric: acc
num_fewshot: 3
# - task: mmlu
# num_fewshot: 2
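
The `grouptest` file above exercises the new loader: a bare string like `boolq` loads a registered task, the nested `group: arc_stuff` entry recurses, and a dict entry such as `arc_easy` with `metric_list` and `num_fewshot` is merged over the registered base config (the `{**base_task_config, **name_or_config}` branch of `_load_individual_task_or_group`). A standalone sketch of that merge; both dictionaries are abbreviated and invented.

```python
# Standalone sketch of the config merge for a dict entry in a group's task
# list; both dictionaries are invented stand-ins.
base_task_config = {              # roughly what _get_config("arc_easy") would return
    "task": "arc_easy",
    "num_fewshot": 0,
    "metric_list": [{"metric": "acc"}, {"metric": "acc_norm"}],
}
override = {                      # the dict entry from grouptest's task list
    "task": "arc_easy",
    "metric_list": [{"metric": "acc"}],
    "num_fewshot": 3,
}

task_config = {**base_task_config, **override}  # override wins key by key
print(task_config["num_fewshot"])   # 3
print(task_config["metric_list"])   # [{'metric': 'acc'}]
```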
dataset_path: hails/mmlu_no_train
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: !function ../../../styles.template_05
doc_to_choice: !function ../../../styles.choice_05a
......
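
The `!function ../../../styles.template_05` values above point YAML fields at Python callables (apparently `template_05` defined in a `styles.py` three directories up from the YAML), which is also why the `TaskConfig` TODO earlier in this diff notes that such configs are not trivially re-serializable. A standalone mirror of the idea behind the tag, not lm_eval's actual loader:

```python
# Standalone mirror of the idea behind the !function tag (not lm_eval's
# loader): "some/dir/styles.template_05" -> load template_05 from styles.py.
import importlib.util
import yaml

def function_constructor(loader, node):
    ref = loader.construct_scalar(node)          # e.g. "../../../styles.template_05"
    module_path, func_name = ref.rsplit(".", 1)  # file path vs. function name
    spec = importlib.util.spec_from_file_location("cfg_module", module_path + ".py")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)              # execute styles.py
    return getattr(module, func_name)

yaml.SafeLoader.add_constructor("!function", function_constructor)

# yaml.safe_load("doc_to_text: !function styles.template_05") would now return
# the template_05 callable, provided styles.py exists relative to the cwd.
```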
group: mmlu_alt_ov_05a_generative
task:
- mmlu_alt_ov_05a_stem_generative
- mmlu_alt_ov_05a_other_generative
- mmlu_alt_ov_05a_social_sciences_generative
- mmlu_alt_ov_05a_humanities_generative