"test/vscode:/vscode.git/clone" did not exist on "7bfb18877bad8b2ee3270494f043ddf6a81eda94"
Commit f7f298ee authored by lintangsutawika

removed unused files for now

parents c0d5a660 12bc8fce
@@ -4,7 +4,7 @@ The `lm-evaluation-harness` is meant to be an extensible and flexible framework

 These YAML configuration files, along with the current codebase commit hash, are intended to be shareable such that providing the YAML config enables another researcher to precisely replicate the evaluation setup used by another, in the case that the prompt or setup differs from standard `lm-eval` task implementations.

-While adding a standard evaluation task on a new dataset can be occasionally as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users.
+While adding a standard evaluation task on a new dataset can be occasionally as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups also exist. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users.

 If your intended task relies on features beyond what are described in this guide, we'd love to hear about it! Feel free to open an issue describing the scenario on Github, create a PR to the project with a proposed implementation, or ask in the `#lm-thunderdome` channel on the EleutherAI discord.
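For concreteness, a minimal task config of the kind described above can be sketched as follows. This is an illustrative example rather than a file from the repo; the field names mirror the YAML keys used elsewhere in this commit, and `ai2_arc`/`ARC-Easy` simply stand in for whichever Hugging Face dataset is being swapped in.

```
import yaml

# Hypothetical minimal task config; swapping `dataset_path`/`dataset_name`
# for another Hugging Face dataset is often the only change needed.
MINIMAL_TASK_YAML = """
task: my_arc_easy_variant
dataset_path: ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
validation_split: validation
doc_to_text: "Question: {{question}}\\nAnswer:"
doc_to_choice: "{{choices.text}}"
doc_to_target: "{{choices.label.index(answerKey)}}"
metric_list:
  - metric: acc
"""

config = yaml.safe_load(MINIMAL_TASK_YAML)
print(config["task"], config["dataset_path"])
```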
@@ -380,4 +380,4 @@ task:
       ignore_punctuation: true
 ```
-Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/benchmarks/`
+Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/tasks/benchmarks/`
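As an illustration of the sentence above, a benchmark group placed under `lm_eval/tasks/benchmarks/` is passed to `--tasks` exactly like a single task. The programmatic equivalent using the `TaskManager` introduced in this commit would look roughly like this; the model string and `pretrained=` argument are placeholders, and building the tasks will download their datasets.

```
from lm_eval import evaluator
from lm_eval.tasks import TaskManager

task_manager = TaskManager("INFO")

# "flan_held_in" is one of the benchmark groups added in this commit;
# the model name and arguments below are placeholders.
tasks = task_manager.load_task_or_group(["flan_held_in"])
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=tasks,
    batch_size=8,
)
```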
@@ -10,8 +10,7 @@ from typing import Union

 import numpy as np

 from lm_eval import evaluator, utils
-from lm_eval.api.registry import ALL_TASKS
-from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table
@@ -156,44 +155,46 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
     eval_logger.info(f"Verbosity set to {args.verbosity}")
     os.environ["TOKENIZERS_PARALLELISM"] = "false"

-    initialize_tasks(args.verbosity)
+    # initialize_tasks(args.verbosity)
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

     if args.limit:
         eval_logger.warning(
             " --limit SHOULD ONLY BE USED FOR TESTING."
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )

-    if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
-
     if args.tasks is None:
-        task_names = ALL_TASKS
+        eval_logger.error("Need to specify task to evaluate.")
+        sys.exit()
     elif args.tasks == "list":
         eval_logger.info(
-            "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS)))
+            "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks()))
         )
+        sys.exit()
     else:
         if os.path.isdir(args.tasks):
             import glob

-            task_names = []
+            loaded_task_list = []
             yaml_path = os.path.join(args.tasks, "*.yaml")
             for yaml_file in glob.glob(yaml_path):
                 config = utils.load_yaml_config(yaml_file)
-                task_names.append(config)
+                loaded_task_list.append(config)
         else:
-            tasks_list = args.tasks.split(",")
-            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
-            for task in [task for task in tasks_list if task not in task_names]:
+            input_task_list = args.tasks.split(",")
+            loaded_task_list = utils.pattern_match(
+                input_task_list, task_manager.all_tasks()
+            )
+            for task in [
+                task for task in input_task_list if task not in loaded_task_list
+            ]:
                 if os.path.isfile(task):
                     config = utils.load_yaml_config(task)
-                    task_names.append(config)
+                    loaded_task_list.append(config)
             task_missing = [
                 task
-                for task in tasks_list
-                if task not in task_names and "*" not in task
+                for task in input_task_list
+                if task not in loaded_task_list and "*" not in task
             ]  # we don't want errors if a wildcard ("*") task name was used

             if task_missing:
@@ -226,12 +227,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
     elif args.log_samples and not args.output_path:
         assert args.output_path, "Specify --output_path"

-    eval_logger.info(f"Selected Tasks: {task_names}")
+    eval_logger.info(f"Selected Tasks: {loaded_task_list}")
+    eval_logger.info("Loading selected tasks...")
+
+    all_tasks = task_manager.load_task_or_group(loaded_task_list)

     results = evaluator.simple_evaluate(
         model=args.model,
         model_args=args.model_args,
-        tasks=task_names,
+        tasks=all_tasks,
         num_fewshot=args.num_fewshot,
         batch_size=args.batch_size,
         max_batch_size=args.max_batch_size,
...
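The `utils.pattern_match` call above expands shell-style wildcards in `--tasks` against the task index. A rough stand-in for that matching, assuming simple fnmatch semantics (the real helper lives in `lm_eval.utils`):

```
import fnmatch

def pattern_match(patterns, source_list):
    # Expand each pattern (e.g. "mmlu_flan_*") against the known task names.
    matched = set()
    for pattern in patterns:
        for name in source_list:
            if fnmatch.fnmatch(name, pattern):
                matched.add(name)
    return sorted(matched)

print(pattern_match(["mmlu_flan_*"], ["mmlu", "mmlu_flan_cot_fewshot", "boolq"]))
# -> ['mmlu_flan_cot_fewshot']
```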
@@ -111,6 +111,8 @@ class TaskConfig(dict):
             }

     # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor?
+    # if self.dataset_kwargs is None:
+    #     self.dataset_kwargs = {"trust_remote_code": True}

     def __getitem__(self, item):
         return getattr(self, item)
@@ -118,7 +120,7 @@ class TaskConfig(dict):
     def __setitem__(self, item, value):
         return setattr(self, item, value)

-    def to_dict(self):
+    def to_dict(self, keep_callable=False):
         """dumps the current config as a dictionary object, as a printable format.
         null fields will not be printed.
         Used for dumping results alongside full task configuration
@@ -134,8 +136,11 @@ class TaskConfig(dict):
             if v is None:
                 cfg_dict.pop(k)
             elif isinstance(v, Callable):
-                # TODO: this should handle Promptsource template objects as a separate case?
-                cfg_dict[k] = str(v)
+                if keep_callable:
+                    cfg_dict[k] = v
+                else:
+                    # TODO: this should handle Promptsource template objects as a separate case?
+                    cfg_dict[k] = str(v)
         return cfg_dict
...
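The `keep_callable` flag added above controls whether callables (for example `!function` handles like the `doc_to_text: !function ...` entries later in this commit) survive a round trip through `to_dict`, instead of being stringified. A simplified stand-alone illustration, not the real `TaskConfig`:

```
from typing import Callable

def to_dict(cfg: dict, keep_callable: bool = False) -> dict:
    out = {}
    for k, v in cfg.items():
        if v is None:
            continue  # null fields are dropped, as in TaskConfig.to_dict
        if isinstance(v, Callable) and not keep_callable:
            out[k] = str(v)  # printable form for dumping results
        else:
            out[k] = v
    return out

cfg = {"task": "demo", "doc_to_text": lambda doc: doc["question"], "group": None}
print(to_dict(cfg))                      # doc_to_text becomes "<function ...>"
print(to_dict(cfg, keep_callable=True))  # doc_to_text stays a callable
```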
@@ -38,6 +38,7 @@ def simple_evaluate(
     write_out: bool = False,
     log_samples: bool = True,
     gen_kwargs: str = None,
+    weight_by_size: bool = False,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -46,8 +47,8 @@ def simple_evaluate(
     :param model_args: Optional[str]
         String arguments for each model class, see LM.create_from_arg_string.
         Ignored if `model` argument is a LM object.
-    :param tasks: list[Union[str, Task]]
-        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
+    :param tasks: list[Task]
+        List of Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
     :param num_fewshot: int
         Number of examples in few-shot context
     :param batch_size: int or str, optional
@@ -119,7 +120,7 @@ def simple_evaluate(
             + ".db",
         )

-    task_dict = lm_eval.tasks.get_task_dict(tasks)
+    task_dict = tasks
     for task_name in task_dict.keys():
         task_obj = task_dict[task_name]
         if type(task_obj) == tuple:
@@ -155,6 +156,7 @@ def simple_evaluate(
         decontamination_ngrams_path=decontamination_ngrams_path,
         write_out=write_out,
         log_samples=log_samples,
+        weight_by_size=weight_by_size,
     )

     if lm.rank == 0:
@@ -192,6 +194,7 @@ def evaluate(
     decontamination_ngrams_path=None,
     write_out: bool = False,
     log_samples: bool = True,
+    weight_by_size: bool = False,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -479,12 +482,14 @@ def evaluate(
                 if "alias" in metrics:
                     metrics.pop("alias")

-                current_size = metrics.pop("samples")
                 # TODO: There should be a way for users
                 # to toggle between weighted and
                 # unweighted averaging
-                # For unweighted averaging, use:
-                # current_size = 1
+                if weight_by_size:
+                    current_size = metrics.pop("samples")
+                else:
+                    metrics.pop("samples")
+                    current_size = 1

                 # TODO: Tasks like brier score for individual
                 # tasks have no stderr since the score is
...
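The new `weight_by_size` flag switches group aggregation between a sample-weighted mean and a plain mean of per-task scores. A small worked example with made-up numbers:

```
# Two subtasks in a group: (accuracy, number of samples)
subtask_scores = [(0.80, 1000), (0.50, 100)]

# weight_by_size=True: weight each task's score by its sample count
weighted = sum(s * n for s, n in subtask_scores) / sum(n for _, n in subtask_scores)

# weight_by_size=False: every task counts equally (current_size = 1)
unweighted = sum(s for s, _ in subtask_scores) / len(subtask_scores)

print(round(weighted, 4), round(unweighted, 4))  # 0.7727 0.65
```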
 import os
+import abc
 import yaml
+import collections
+from functools import partial, lru_cache
 from typing import List, Union, Dict

 from lm_eval import utils
@@ -10,7 +14,6 @@ from lm_eval.api.registry import (
     register_group,
     TASK_REGISTRY,
     GROUP_REGISTRY,
-    ALL_TASKS,
 )

 import logging
@@ -29,272 +32,203 @@ from .scrolls.task import (

 eval_logger = utils.eval_logger
-def register_configurable_task(config: Dict[str, str]) -> int:
-    SubClass = type(
-        config["task"] + "ConfigurableTask",
-        (ConfigurableTask,),
-        {"CONFIG": TaskConfig(**config)},
-    )
-
-    if "task" in config:
-        task_name = "{}".format(config["task"])
-        register_task(task_name)(SubClass)
-
-    if "group" in config:
-        if config["group"] == config["task"]:
-            raise ValueError("task and group name cannot be the same")
-        elif type(config["group"]) == str:
-            group_name = [config["group"]]
-        else:
-            group_name = config["group"]
-
-        for group in group_name:
-            register_group(group)(SubClass)
-
-    return 0
-
-def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -> int:
-    group = config["group"]
-    all_task_list = config["task"]
-    config_list = [task for task in all_task_list if type(task) != str]
-    task_list = [task for task in all_task_list if type(task) == str]
-
-    for task_config in config_list:
-        base_config = {}
-        task_name_config = {}
-        if "task" in task_config:
-            task_name = task_config["task"]
-            if task_name in ALL_TASKS:
-                task_obj = get_task_dict(task_name)[task_name]
-                if type(task_obj) == tuple:
-                    _, task_obj = task_obj
-                if task_obj is not None:
-                    base_config = task_obj._config.to_dict()
-                    task_name_config["task"] = f"{group}_{task_name}"
-
-        task_config = utils.load_yaml_config(yaml_path, task_config)
-        var_configs = check_prompt_config(
-            {
-                **base_config,
-                **task_config,
-                **{"group": group},
-                **task_name_config,
-            },
-            yaml_path=os.path.dirname(yaml_path),
-        )
-        for config in var_configs:
-            register_configurable_task(config)
-
-    task_names = utils.pattern_match(task_list, ALL_TASKS)
-    for task in task_names:
-        if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
-            if group in GROUP_REGISTRY:
-                GROUP_REGISTRY[group].append(task)
-            else:
-                GROUP_REGISTRY[group] = [task]
-                ALL_TASKS.add(group)
-
-    return 0
-
-def check_prompt_config(
-    config: Dict[str, str], yaml_path: str = None
-) -> List[Dict[str, str]]:
-    all_configs = []
-    if "use_prompt" in config:
-        prompt_list = prompts.load_prompt_list(
-            use_prompt=config["use_prompt"],
-            dataset_name=config["dataset_path"],
-            subset_name=config["dataset_name"] if "dataset_name" in config else None,
-            yaml_path=yaml_path,
-        )
-        for idx, prompt_variation in enumerate(prompt_list):
-            all_configs.append(
-                {
-                    **config,
-                    **{"use_prompt": prompt_variation},
-                    **{
-                        "task": "_".join(
-                            [
-                                config["task"]
-                                if "task" in config
-                                else get_task_name_from_config(config),
-                                prompt_variation.split("/")[-1]
-                                if ".yaml" in prompt_variation
-                                else prompt_variation,
-                            ]
-                        )
-                    },
-                    **{"output_type": "generate_until"},
-                }
-            )
-    else:
-        all_configs.append(config)
-    return all_configs
-
-def get_task_name_from_config(task_config: Dict[str, str]) -> str:
-    if "dataset_name" in task_config:
-        return "{dataset_path}_{dataset_name}".format(**task_config)
-    else:
-        return "{dataset_path}".format(**task_config)
-
-def include_task_folder(task_dir: str, register_task: bool = True) -> None:
-    """
-    Calling this function
-    """
-    # Track whether any tasks failed during loading
-    import_fail = False
-    for root, subdirs, file_list in os.walk(task_dir):
-        # if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
-        for f in file_list:
-            if f.endswith(".yaml"):
-                yaml_path = os.path.join(root, f)
-                try:
-                    config = utils.load_yaml_config(yaml_path)
-
-                    if "task" not in config:
-                        continue
-
-                    all_configs = check_prompt_config(
-                        config, yaml_path=os.path.dirname(yaml_path)
-                    )
-                    for config in all_configs:
-                        if register_task:
-                            if type(config["task"]) == str:
-                                register_configurable_task(config)
-                        else:
-                            if type(config["task"]) == list:
-                                register_configurable_group(config, yaml_path)
-
-                # Log this silently and show it only when
-                # the user defines the appropriate verbosity.
-                except (ImportError, ModuleNotFoundError) as e:
-                    import_fail = True
-                    eval_logger.debug(
-                        f"{yaml_path}: {e}. Config will not be added to registry."
-                    )
-                except Exception as error:
-                    import traceback
-
-                    eval_logger.warning(
-                        "Unexpected error loading config in\n"
-                        f" {yaml_path}\n"
-                        " Config will not be added to registry\n"
-                        f" Error: {error}\n"
-                        f" Traceback: {traceback.format_exc()}"
-                    )
-    if import_fail:
-        eval_logger.warning(
-            "Some tasks could not be loaded due to missing dependencies."
-            " Run with `--verbosity DEBUG` for full details."
-        )
-    return 0
-
-def include_path(task_dir):
-    include_task_folder(task_dir)
-    # Register Benchmarks after all tasks have been added
-    include_task_folder(task_dir, register_task=False)
-    return 0
-
-def initialize_tasks(verbosity="INFO"):
-    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
-    task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
-    include_path(task_dir)
-
-def get_task(task_name, config):
-    try:
-        return TASK_REGISTRY[task_name](config=config)
-    except KeyError:
-        eval_logger.info("Available tasks:")
-        eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
-        raise KeyError(f"Missing task {task_name}")
-
-def get_task_name_from_object(task_object):
-    for name, class_ in TASK_REGISTRY.items():
-        if class_ is task_object:
-            return name
-
-    # TODO: scrap this
-    # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
-    return (
-        task_object.EVAL_HARNESS_NAME
-        if hasattr(task_object, "EVAL_HARNESS_NAME")
-        else type(task_object).__name__
-    )
-
-# TODO: pass num_fewshot and other cmdline overrides in a better way
-def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
-    config = {**kwargs}
-
-    task_name_from_registry_dict = {}
-    task_name_from_config_dict = {}
-    task_name_from_object_dict = {}
-
-    if type(task_name_list) != list:
-        task_name_list = [task_name_list]
-
-    for task_element in task_name_list:
-        if isinstance(task_element, str):
-            if task_element in GROUP_REGISTRY:
-                group_name = task_element
-                for task_name in GROUP_REGISTRY[task_element]:
-                    if task_name not in task_name_from_registry_dict:
-                        task_obj = get_task_dict(task_name)
-                        if task_name in task_obj.keys():
-                            task_dict = {
-                                task_name: (group_name, task_obj[task_name]),
-                            }
-                        else:
-                            task_dict = {
-                                task_name: (group_name, None),
-                                **task_obj,
-                            }
-
-                        task_name_from_registry_dict = {
-                            **task_name_from_registry_dict,
-                            **task_dict,
-                        }
-            else:
-                task_name = task_element
-                if task_name not in task_name_from_registry_dict:
-                    task_name_from_registry_dict = {
-                        **task_name_from_registry_dict,
-                        task_name: get_task(task_name=task_element, config=config),
-                    }
-
-        elif isinstance(task_element, dict):
-            task_element.update(config)
-            task_name_from_config_dict = {
-                **task_name_from_config_dict,
-                get_task_name_from_config(task_element): ConfigurableTask(
-                    config=task_element
-                ),
-            }
-
-        elif isinstance(task_element, Task):
-            task_name_from_object_dict = {
-                **task_name_from_object_dict,
-                get_task_name_from_object(task_element): task_element,
-            }
-
-    assert set(task_name_from_registry_dict.keys()).isdisjoint(
-        set(task_name_from_object_dict.keys())
-    )
-    return {
-        **task_name_from_registry_dict,
-        **task_name_from_config_dict,
-        **task_name_from_object_dict,
-    }
+class TaskManager(abc.ABC):
+
+    def __init__(
+        self,
+        verbosity="INFO",
+        include_path=None
+    ) -> None:
+        self.verbosity = verbosity
+        self.include_path = include_path
+        self.logger = eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+
+        self.ALL_TASKS = self.initialize_tasks(
+            include_path=include_path
+        )
+
+    def initialize_tasks(self, include_path=None):
+        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        if include_path is not None:
+            if isinstance(include_path, str):
+                include_path = [include_path]
+            all_paths.extend(include_path)
+
+        ALL_TASKS = {}
+        for task_dir in all_paths:
+            tasks = self._get_task_and_group(task_dir)
+            ALL_TASKS = {**tasks, **ALL_TASKS}
+
+        return ALL_TASKS
+
+    def all_tasks(self):
+        return sorted(list(self.ALL_TASKS.keys()))
+
+    def _name_is_registered(self, name):
+        if name in self.ALL_TASKS:
+            return True
+        return False
+
+    def _name_is_task(self, name):
+        if self.ALL_TASKS[name]["type"] == "task":
+            return True
+        return False
+
+    def _config_is_task(self, config):
+        if list(config.keys()) == ["group", "task"]:
+            return False
+        return True
+
+    def _get_yaml_path(self, name):
+        assert name in self.ALL_TASKS
+        return self.ALL_TASKS[name]["yaml_path"]
+
+    def _get_config(self, name):
+        assert name in self.ALL_TASKS
+        yaml_path = self._get_yaml_path(name)
+        return utils.load_yaml_config(yaml_path)
+
+    def _get_tasklist(self, name):
+        assert self._name_is_task(name) == False
+        return self.ALL_TASKS[name]["task"]
+
+    @lru_cache(None)
+    def _load_individual_task_or_group(self, name_or_config: Union[str, dict] = None, parent_name: str = None) -> ConfigurableTask:
+        def load_task(config, task, group=None):
+            task_object = ConfigurableTask(config=config)
+            if group is not None:
+                task_object = (group, task_object)
+            return {task: task_object}
+
+        if isinstance(name_or_config, str):
+            if self._name_is_task(name_or_config):
+                task_config = self._get_config(name_or_config)
+                return load_task(task_config, task=name_or_config, group=parent_name)
+            else:
+                group_name = name_or_config
+                subtask_list = self._get_tasklist(name_or_config)
+                if subtask_list == -1:
+                    subtask_list = self._get_config(name_or_config)["task"]
+        elif isinstance(name_or_config, dict):
+            if self._config_is_task(name_or_config):
+                task_name = name_or_config["task"]
+                if self._name_is_registered(task_name):
+                    base_task_config = self._get_config(task_name)
+                    task_config = {
+                        **base_task_config,
+                        **name_or_config,
+                    }
+                else:
+                    task_config = name_or_config
+                return load_task(task_config, task=name_or_config, group=parent_name)
+            else:
+                group_name = name_or_config["group"]
+                subtask_list = name_or_config["task"]
+
+        if self._get_yaml_path(group_name) == -1:
+            all_subtasks = {group_name: (parent_name, None)}
+        else:
+            all_subtasks = {}
+
+        fn = partial(self._load_individual_task_or_group, parent_name=group_name)
+        all_subtasks = {**all_subtasks, **dict(collections.ChainMap(*map(fn, subtask_list)))}
+        return all_subtasks
+
+    def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
+        if isinstance(task_list, str):
+            task_list = [task_list]
+
+        all_loaded_tasks = dict(
+            collections.ChainMap(
+                *map(
+                    self._load_individual_task_or_group,
+                    task_list
+                )
+            )
+        )
+        return all_loaded_tasks
+
+    def _get_task_and_group(self, task_dir: str):
+        tasks_and_groups = collections.defaultdict()
+        for root, _, file_list in os.walk(task_dir):
+            for f in file_list:
+                if f.endswith(".yaml"):
+                    yaml_path = os.path.join(root, f)
+                    config = utils.simple_load_yaml_config(yaml_path)
+                    if list(config.keys()) == ["group", "task"]:
+                        # This is a group config
+                        tasks_and_groups[config["group"]] = {
+                            "type": "group",
+                            "task": -1,  # This signals that
+                            # we don't need to know
+                            # the task list for indexing
+                            # as it can be loaded
+                            # when called.
+                            "yaml_path": yaml_path,
+                        }
+                    else:
+                        # This is a task config
+                        task = config["task"]
+                        tasks_and_groups[task] = {
+                            "type": "task",
+                            "yaml_path": yaml_path,
+                        }
+
+                        if "group" in config:
+                            groups = config["group"]
+                            if isinstance(config["group"], str):
+                                groups = [groups]
+
+                            for group in groups:
+                                if group not in tasks_and_groups:
+                                    tasks_and_groups[group] = {
+                                        "type": "group",
+                                        "task": [task],
+                                        "yaml_path": -1,
+                                    }
+                                else:
+                                    tasks_and_groups[group]["task"].append(task)
+
+        return tasks_and_groups
# def check_prompt_config(
# config: Dict[str, str], yaml_path: str = None
# ) -> List[Dict[str, str]]:
# all_configs = []
# if "use_prompt" in config:
# prompt_list = prompts.load_prompt_list(
# use_prompt=config["use_prompt"],
# dataset_name=config["dataset_path"],
# subset_name=config["dataset_name"] if "dataset_name" in config else None,
# yaml_path=yaml_path,
# )
# for idx, prompt_variation in enumerate(prompt_list):
# all_configs.append(
# {
# **config,
# **{"use_prompt": prompt_variation},
# **{
# "task": "_".join(
# [
# config["task"]
# if "task" in config
# else get_task_name_from_config(config),
# prompt_variation.split("/")[-1]
# if ".yaml" in prompt_variation
# else prompt_variation,
# ]
# )
# },
# **{"output_type": "generate_until"},
# }
# )
# else:
# all_configs.append(config)
# return all_configs
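Taken together, the intended use of `TaskManager` as wired into `__main__.py` above is: build the index once, list or pattern-match names, then materialize `ConfigurableTask` objects on demand. A minimal sketch, assuming `boolq` resolves in your checkout and noting that constructing a task prepares its dataset:

```
from lm_eval.tasks import TaskManager

# Index every YAML under lm_eval/tasks/ plus any --include_path directories.
tm = TaskManager(verbosity="INFO", include_path=None)

print(len(tm.all_tasks()))  # all indexed task and group names

# Plain task names map to ConfigurableTask objects; group members come back
# as {task_name: (group_name, ConfigurableTask)} entries.
task_dict = tm.load_task_or_group(["boolq"])
for name, obj in task_dict.items():
    print(name, type(obj))
```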
group: flan_anli
task:
- include: yaml_templates/held_in_template_yaml
task: r1
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml
task: r2
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml
task: r3
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3
group: flan_arc
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
group: flan_boolq
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: prompt_templates/boolq.yaml:*
validation_split: validation
group: flan_rte
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: prompt_templates/rte.yaml:*
validation_split: validation
group: flan_cot
task:
- include: yaml_templates/cot_template_yaml
dataset_path: gsm8k
dataset_name: boolq
use_prompt: promptsource:*
validation_split: validation
- include: yaml_templates/cot_template_yaml
dataset_path: EleutherAI/asdiv
use_prompt: promptsource:*
validation_split: validation
group: flan_held_in
task:
- include: yaml_templates/held_in_template_yaml
task: r1
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml
task: r2
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml
task: r3
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: prompt_templates/boolq.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: prompt_templates/rte.yaml:*
validation_split: validation
group: flan_held_out
task:
# BBH
- bbh_zeroshot
- bbh_fewshot
- bbh_cot_fewshot
- bbh_cot_zeroshot
# MMLU
- mmlu
- mmlu_flan_n_shot_generative
- mmlu_flan_n_shot_loglikelihood
- mmlu_flan_cot_zeroshot
- mmlu_flan_cot_fewshot
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-1":
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-2":
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-3":
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-4":
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-5":
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-6":
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-7":
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-8":
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-1":
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-2":
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-3":
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-4":
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-5":
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-6":
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-1":
doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-2":
doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-3":
doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-4":
doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-5":
doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-6":
doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-7":
doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-8":
# doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-9":
doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-1":
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-2":
doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-3":
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-4":
doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-5":
doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-6":
doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-7":
doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-8":
doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
group: flan-cot
output_type: generate_until
validation_split: validation
doc_to_target: "{{answer}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
generation_kwargs:
until:
- "\n\n"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first"
metadata:
version: 1.0
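The `get-answer` filter above applies the regex to the generated text and then keeps the first match. Outside the harness, the extraction step behaves roughly like this:

```
import re

generation = "She has 3 apples and buys 4 more, so she has 7 apples. The answer is 7"
pattern = re.compile(r"The answer is (\-?[0-9\.\,]+)")

matches = pattern.findall(generation)
answer = matches[0] if matches else None  # "take_first"
print(answer)  # -> 7
```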
group: grouptest
task:
- boolq
- group: arc_stuff
task:
- arc_challenge
- glue
- task: arc_easy
metric_list:
- metric: acc
num_fewshot: 3
# - task: mmlu
# num_fewshot: 2
 dataset_path: hails/mmlu_no_train
 test_split: test
 fewshot_split: dev
+fewshot_config:
+  sampler: first_n
 output_type: multiple_choice
 doc_to_text: !function ../../../styles.template_05
 doc_to_choice: !function ../../../styles.choice_05a
...
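The new `fewshot_config` block selects how few-shot examples are drawn from `fewshot_split`. A `first_n` sampler reads naturally as taking the first k documents of the dev split; a minimal stand-in under that assumption, not the harness implementation:

```
def first_n_sampler(fewshot_docs, k):
    # Assumed behaviour: deterministically reuse the first k dev examples
    # as the few-shot context for every test document.
    return list(fewshot_docs)[:k]

dev_docs = [{"question": f"q{i}", "answer": f"a{i}"} for i in range(10)]
print(first_n_sampler(dev_docs, 5))
```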
group: mmlu_alt_ov_05a_generative
task:
- mmlu_alt_ov_05a_stem_generative
- mmlu_alt_ov_05a_other_generative
- mmlu_alt_ov_05a_social_sciences_generative
- mmlu_alt_ov_05a_humanities_generative