Commit 02e841ce authored by lintangsutawika

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

parents 90ad5db7 e74ec966
@@ -105,7 +105,7 @@ class OpenaiCompletionsLM(TemplateLM):
        except ModuleNotFoundError:
            raise Exception(
                "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
-                please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
+                please install these via `pip install lm-eval[openai]` or `pip install -e .\"[openai]\"`",
            )
        self.model = model
        self.base_url = base_url
@@ -231,7 +231,7 @@ class OpenaiCompletionsLM(TemplateLM):
            self.cache_hook.add_partial("loglikelihood", cache_key, answer)
        return re_ord.get_original(res)

-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        if not requests:
            return []
        res = []
@@ -258,7 +258,8 @@ class OpenaiCompletionsLM(TemplateLM):
        # todo: more intelligent batching for heterogeneous `until`
        for chunk, request_args in tqdm(
-            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
+            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
+            disable=disable_tqdm,
        ):
            inps = []
            self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
@@ -308,10 +309,12 @@ class OpenaiCompletionsLM(TemplateLM):
        # Isn't used because we override generate_until
        raise NotImplementedError()

-    def loglikelihood_rolling(self, requests) -> List[float]:
+    def loglikelihood_rolling(
+        self, requests, disable_tqdm: bool = False
+    ) -> List[float]:
        loglikelihoods = []

-        for (string,) in tqdm([req.args for req in requests]):
+        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
@@ -398,7 +401,7 @@ class OpenaiChatCompletionsLM(LM):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        res = defaultdict(list)
        re_ords = {}
@@ -412,7 +415,7 @@ class OpenaiChatCompletionsLM(LM):
                [req.args for req in reqs], lambda x: (-len(x[0]), x[0])
            )

-        pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
        for key, re_ord in re_ords.items():
            # n needs to be 1 because messages in
            # chat completion are not batch but
@@ -471,8 +474,8 @@ class OpenaiChatCompletionsLM(LM):
        return grouper.get_original(res)

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")
@@ -95,9 +95,9 @@ class TextSynthLM(LM):
        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
        raise NotImplementedError()

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        res = []
-        for context, continuation in tqdm(requests):
+        for context, continuation in tqdm(requests, disable=disable_tqdm):
            response = textsynth_completion(
                url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
                headers={"Authorization": "Bearer " + self.api_key},
@@ -119,7 +119,7 @@ class TextSynthLM(LM):
            assert False
        return res

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        # TODO: The TextSynth API does not support tokenized inputs so we cannot
        # manually partition long contexts into smaller rolling windows as
        # done for other models derived from `BaseLM`. Override this method
@@ -129,12 +129,12 @@ class TextSynthLM(LM):
            "input tokenization support from TextSynth."
        )

-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
        if not requests:
            return []

        res = []
-        for request in tqdm(requests):
+        for request in tqdm(requests, disable=disable_tqdm):
            inp = request[0]
            request_args = request[1]
            until = request_args["until"]
...
import collections
import fnmatch
import gc
+import itertools
import time
from functools import wraps
from typing import (
@@ -262,55 +263,44 @@ def stop_sequences_criteria(
    )

-def divide(iterable, n) -> List[Iterator]:
-    """Divide the elements from *iterable* into *n* parts, maintaining
-    order.
-
-    >>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
-    >>> list(group_1)
-    [1, 2, 3]
-    >>> list(group_2)
-    [4, 5, 6]
-
-    If the length of *iterable* is not evenly divisible by *n*, then the
-    length of the returned iterables will not be identical:
-
-    >>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
-    >>> [list(c) for c in children]
-    [[1, 2, 3], [4, 5], [6, 7]]
-
-    If the length of the iterable is smaller than n, then the last returned
-    iterables will be empty:
-
-    >>> children = divide([1, 2, 3], 5)
-    >>> [list(c) for c in children]
-    [[1], [2], [3], [], []]
-
-    This function will exhaust the iterable before returning and may require
-    significant storage. If order is not important, see :func:`distribute`,
-    which does not first pull the iterable into memory.
-    """
-    if n < 1:
-        raise ValueError("n must be at least 1")
-
-    try:
-        iterable[:0]
-    except TypeError:
-        seq = tuple(iterable)
-    else:
-        seq = iterable
-
-    q, r = divmod(len(seq), n)
-    ret = []
-    stop = 0
-    for i in range(1, n + 1):
-        start = stop
-        stop += q + 1 if i <= r else q
-        ret.append(iter(seq[start:stop]))
-    return ret
+def undistribute(iterable):
+    """
+    Undoes https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.distribute .
+
+    Re-interleaves results that have been split using more_itertools.distribute:
+    >>> group_1, group_2 = distribute(2, [1, 2, 3, 4, 5, 6])
+    >>> list(group_1)
+    [1, 3, 5]
+    >>> list(group_2)
+    [2, 4, 6]
+    >>> undistribute([group_1, group_2])
+    [1, 2, 3, 4, 5, 6]
+
+    Handles non-uniform component lengths:
+    >>> children = distribute(3, [1, 2, 3, 4, 5, 6, 7])
+    >>> [list(c) for c in children]
+    [[1, 4, 7], [2, 5], [3, 6]]
+    >>> undistribute(children)
+    [1, 2, 3, 4, 5, 6, 7]
+
+    Also handles when some iterables are empty:
+    >>> children = distribute(5, [1, 2, 3])
+    >>> [list(c) for c in children]
+    [[1], [2], [3], [], []]
+    >>> undistribute(children)
+    [1, 2, 3]
+    """
+    return [
+        x
+        for x in itertools.chain.from_iterable(
+            itertools.zip_longest(*[list(x) for x in iterable])
+        )
+        if x is not None
+    ]
def retry_on_specific_exceptions(
...
import copy
+from importlib.metadata import version
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union

+from more_itertools import distribute
+from packaging.version import parse as parse_version
from tqdm import tqdm

from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
-from lm_eval.models.utils import Collator, divide
+from lm_eval.models.utils import Collator, undistribute
from lm_eval.utils import (
    eval_logger,
    get_rolling_token_windows,
@@ -17,7 +20,6 @@ from lm_eval.utils import (
try:
    import ray
-    from ray.util.multiprocessing import Pool
    from vllm import LLM, SamplingParams
    from vllm.transformers_utils.tokenizer import get_tokenizer
except ModuleNotFoundError:
@@ -26,14 +28,6 @@ except ModuleNotFoundError:
    eval_logger = eval_logger

-# adapted from https://github.com/vllm-project/vllm/issues/367#issuecomment-1788341727
-def run_inference_one_model(
-    model_args: dict, sampling_params, requests: List[List[int]]
-):
-    llm = LLM(**model_args)
-    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)

@register_model("vllm")
class VLLM(TemplateLM):
    _DEFAULT_MAX_LENGTH = 2048
@@ -60,6 +54,7 @@ class VLLM(TemplateLM):
        gpu_memory_utilization: float = 0.9,
        device: str = "cuda",
        data_parallel_size: int = 1,
+        **kwargs,
    ):
        super().__init__()
@@ -92,6 +87,7 @@ class VLLM(TemplateLM):
            "quantization": quantization,
            "seed": int(seed),
        }
+        self.model_args.update(kwargs)
        self.batch_size = (
            "auto"
            if isinstance(batch_size, str) and "auto" in batch_size
@@ -100,6 +96,12 @@ class VLLM(TemplateLM):
        if self.data_parallel_size <= 1:
            self.model = LLM(**self.model_args)
        else:
+            assert parse_version(version("vllm")) < parse_version(
+                "0.3.3"
+            ), "data_parallel is only compatible with vllm < v0.3.3."
+            eval_logger.warning(
+                "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
+            )
            self.model_args["worker_use_ray"] = True
            self.batch_size = "auto"
            eval_logger.info("Manual batching is not compatible with data parallelism.")
@@ -181,15 +183,30 @@ class VLLM(TemplateLM):
            temperature=0, prompt_logprobs=1, max_tokens=1
        )
        if self.data_parallel_size > 1:
-            requests = [list(x) for x in divide(requests, self.data_parallel_size)]
-            inputs = [(self.model_args, sampling_params, req) for req in requests]
-
-            with Pool(self.data_parallel_size) as pool:
-                results = pool.starmap(run_inference_one_model, inputs)
+            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+            # also seems to only work with decorator and not with ray.remote() fn
+            # see https://github.com/vllm-project/vllm/issues/973
+            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
+            # but then tensor_parallel breaks
+            @ray.remote
+            def run_inference_one_model(
+                model_args: dict, sampling_params, requests: List[List[int]]
+            ):
+                llm = LLM(**model_args)
+                return llm.generate(
+                    prompt_token_ids=requests, sampling_params=sampling_params
+                )
+
+            # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
+            # interleaved important to balance context lengths across workers
+            requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
+            inputs = ((self.model_args, sampling_params, req) for req in requests)
+            object_refs = [run_inference_one_model.remote(*x) for x in inputs]
+            results = ray.get(object_refs)
            # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
            ray.shutdown()
            # flatten results
-            return [item for sublist in results for item in sublist]
+            return undistribute(results)
        outputs = self.model.generate(
            prompt_token_ids=requests,
@@ -198,10 +215,12 @@ class VLLM(TemplateLM):
        )
        return outputs
-    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
        loglikelihoods = []

-        for (string,) in tqdm([req.args for req in requests]):
+        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
            rolling_token_windows = list(
                map(
                    make_disjoint_window,
@@ -227,7 +246,9 @@ class VLLM(TemplateLM):
            loglikelihoods.append(string_nll)
        return loglikelihoods
-    def generate_until(self, requests: List[Instance]) -> List[str]:
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
        res = []

        # batch tokenize contexts
@@ -256,7 +277,7 @@ class VLLM(TemplateLM):
        pbar = tqdm(
            total=len(requests),
-            disable=(self.rank != 0),
+            disable=(disable_tqdm or (self.rank != 0)),
            desc="Running generate_until requests",
        )
        # for each different set of kwargs, we execute all requests, by batch.
@@ -282,8 +303,12 @@ class VLLM(TemplateLM):
                raise ValueError(
                    f"Expected `kwargs` to be of type `dict` but got {gen_kwargs}"
                )
+            # add EOS token to stop sequences
+            eos = self.tokenizer.decode(self.eot_token_id)
            if not until:
-                until = [self.tokenizer.decode(self.eot_token_id)]
+                until = [eos]
+            else:
+                until.append(eos)
            if "max_gen_toks" in kwargs.keys():
                max_gen_toks = kwargs.pop("max_gen_toks")
            else:
@@ -390,6 +415,26 @@ class VLLM(TemplateLM):
        # The first entry of prompt_logprobs is None because the model has no previous tokens to condition on.
        continuation_logprobs_dicts = outputs.prompt_logprobs

+        def coerce_logprob_to_num(logprob):
+            # vLLM changed the return type of logprobs from float
+            # to a Logprob object storing the float value + extra data
+            # (https://github.com/vllm-project/vllm/pull/3065).
+            # If we are dealing with vllm's Logprob object, return
+            # the logprob value stored as an attribute. Otherwise,
+            # return the object itself (which should be a float
+            # for older versions of vLLM).
+            return getattr(logprob, "logprob", logprob)
+
+        continuation_logprobs_dicts = [
+            {
+                token: coerce_logprob_to_num(logprob)
+                for token, logprob in logprob_dict.items()
+            }
+            if logprob_dict is not None
+            else None
+            for logprob_dict in continuation_logprobs_dicts
+        ]

        # Calculate continuation_logprobs
        # assume ctxlen always >= 1
        continuation_logprobs = sum(
...
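The `getattr` fallback lets one code path serve both vLLM generations. A tiny illustration, with `SimpleNamespace` standing in for vLLM's `Logprob` object:

```python
# getattr(obj, "logprob", obj) returns the attribute when present,
# otherwise the object itself (a plain float on older vLLM versions).
from types import SimpleNamespace


def coerce_logprob_to_num(logprob):
    return getattr(logprob, "logprob", logprob)


assert coerce_logprob_to_num(-1.25) == -1.25  # older vLLM: plain float
assert coerce_logprob_to_num(SimpleNamespace(logprob=-1.25)) == -1.25  # newer: Logprob-like
```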
-import abc
import collections
import logging
import os
from functools import partial
-from typing import Dict, List, Union
+from typing import Dict, List, Mapping, Optional, Union

from lm_eval import utils
from lm_eval.api.task import ConfigurableTask, Task
@@ -15,7 +14,7 @@ class TaskManager:
    """

-    def __init__(self, verbosity="INFO", include_path=None) -> None:
+    def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
        self.verbosity = verbosity
        self.include_path = include_path
        self.logger = utils.eval_logger
@@ -26,8 +25,8 @@ class TaskManager:
        self.task_group_map = collections.defaultdict(list)

-    def initialize_tasks(self, include_path: str = None):
-        """Creates an dictionary of tasks index.
+    def initialize_tasks(self, include_path: Optional[str] = None):
+        """Creates a dictionary of tasks index.

        :param include_path: str = None
            An additional path to be searched for tasks
@@ -59,7 +58,7 @@ class TaskManager:
    def match_tasks(self, task_list):
        return utils.pattern_match(task_list, self.all_tasks)

-    def _name_is_registered(self, name):
+    def _name_is_registered(self, name) -> bool:
        if name in self.all_tasks:
            return True
        return False
@@ -69,7 +68,7 @@ class TaskManager:
            return True
        return False

-    def _name_is_group(self, name):
+    def _name_is_group(self, name) -> bool:
        if self._name_is_registered(name) and (
            self.task_index[name]["type"] == "group"
        ):
@@ -83,27 +82,29 @@ class TaskManager:
            return True
        return False

-    def _config_is_task(self, config):
+    def _config_is_task(self, config) -> bool:
        if ("task" in config) and isinstance(config["task"], str):
            return True
        return False

-    def _config_is_group(self, config):
+    def _config_is_group(self, config) -> bool:
        if ("task" in config) and isinstance(config["task"], list):
            return True
        return False

-    def _config_is_python_task(self, config):
+    def _config_is_python_task(self, config) -> bool:
        if "class" in config:
            return True
        return False

    def _get_yaml_path(self, name):
-        assert name in self.task_index
+        if name not in self.task_index:
+            raise ValueError
        return self.task_index[name]["yaml_path"]

    def _get_config(self, name):
-        assert name in self.task_index
+        if name not in self.task_index:
+            raise ValueError
        yaml_path = self._get_yaml_path(name)
        if yaml_path == -1:
            return {}
@@ -111,7 +112,8 @@ class TaskManager:
        return utils.load_yaml_config(yaml_path, mode="full")

    def _get_tasklist(self, name):
-        assert self._name_is_task(name) is False
+        if self._name_is_task(name):
+            raise ValueError
        return self.task_index[name]["task"]

    def _process_alias(self, config, group=None):
@@ -125,14 +127,15 @@ class TaskManager:
    def _load_individual_task_or_group(
        self,
-        name_or_config: Union[str, dict] = None,
-        parent_name: str = None,
-        update_config: dict = None,
-        yaml_path: str = None,
-    ) -> ConfigurableTask:
+        name_or_config: Optional[Union[str, dict]] = None,
+        parent_name: Optional[str] = None,
+        update_config: Optional[dict] = None,
+        yaml_path: Optional[str] = None,
+    ) -> Mapping:
        def load_task(config, task, group=None, yaml_path=None):
            if "include" in config:
-                assert yaml_path is not None
+                if yaml_path is None:
+                    raise ValueError
                config.update(
                    utils.load_yaml_config(
                        yaml_path,
# This checks if we're at the root. # This checks if we're at the root.
if parent_name is None: if parent_name is None:
group_config = self._get_config(name_or_config) group_config = self._get_config(name_or_config)
if set(group_config.keys()) > set(["task", "group"]): if set(group_config.keys()) > {"task", "group"}:
update_config = { update_config = {
k: v k: v
for k, v in group_config.items() for k, v in group_config.items()
@@ -228,7 +231,7 @@ class TaskManager:
            else:
                group_name = name_or_config["group"]
                subtask_list = name_or_config["task"]
-                if set(name_or_config.keys()) > set(["task", "group"]):
+                if set(name_or_config.keys()) > {"task", "group"}:
                    update_config = {
                        k: v
                        for k, v in name_or_config.items()
@@ -251,7 +254,7 @@ class TaskManager:
            }
        return all_subtasks
-    def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
+    def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
        """Loads a dictionary of task objects from a list

        :param task_list: Union[str, list] = None
@@ -272,7 +275,7 @@ class TaskManager:
        return self._load_individual_task_or_group(config)
    def _get_task_and_group(self, task_dir: str):
-        """Creates an dictionary of tasks index with the following metadata,
+        """Creates a dictionary of tasks index with the following metadata,
        - `type`, that can be either `task`, `python_task`, or `group`.
            `task` refer to regular task configs, `python_task` are special
            yaml files that only consists of `task` and `class` parameters.
@@ -358,7 +361,8 @@ def include_path(task_dir):
    logger.setLevel(getattr(logging, "INFO"))
    logger.info(
        "To still use tasks loaded from args.include_path,"
-        "see an example of the new TaskManager API in https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
+        "see an example of the new TaskManager API in "
+        "https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
    )
    return 0
@@ -397,7 +401,8 @@ def get_task_name_from_object(task_object):
def get_task_dict(
-    task_name_list: List[Union[str, Dict, Task]], task_manager: TaskManager = None
+    task_name_list: List[Union[str, Dict, Task]],
+    task_manager: Optional[TaskManager] = None,
):
    """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
@@ -442,9 +447,10 @@ def get_task_dict(
            get_task_name_from_object(task_element): task_element,
        }

-    assert set(task_name_from_string_dict.keys()).isdisjoint(
+    if not set(task_name_from_string_dict.keys()).isdisjoint(
        set(task_name_from_object_dict.keys())
-    )
+    ):
+        raise ValueError
    return {
        **task_name_from_string_dict,
...
# Arabic EXAMS
### Paper
EXAMS is a resource specialized in multilingual high school exam questions.
The original paper: [EXAMS](https://aclanthology.org/2020.emnlp-main.438/)
The Arabic EXAMS dataset includes five subjects:
- Islamic studies
- Biology
- Physics
- Science
- Social
The original dataset: [EXAMS-QA](https://github.com/mhardalov/exams-qa)
EXAMS is a benchmark dataset for cross-lingual and multilingual question answering for high school examinations. It contains 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others. EXAMS offers a unique fine-grained evaluation framework across multiple languages and subjects.
Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/EXAMS_Arabic)
### Citation
### Groups and Tasks
#### Groups
- `aexams`: includes all five Arabic EXAMS subjects (Islamic Studies, Biology, Science, Physics, Social).
#### Tasks
The following tasks evaluate subjects in the Arabic EXAMS dataset using loglikelihood-based multiple-choice scoring (a rendering sketch follows the list):
- `aexams_IslamicStudies`
- `aexams_Biology`
- `aexams_Science`
- `aexams_Physics`
- `aexams_Social`
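For illustration, here is roughly how the shared `_default_template_yaml` prompt template renders for one document. The sample document below is invented; the real fields come from the `Hennara/aexams` dataset, and the harness's actual template engine is assumed to behave like plain Jinja2 here:

```python
# Hypothetical rendering of the doc_to_text / doc_to_target templates
# from _default_template_yaml; the sample document is made up.
from jinja2 import Template

doc = {
    "question": "What is the powerhouse of the cell?",  # placeholder; real data is Arabic
    "A": "Nucleus",
    "B": "Mitochondria",
    "C": "Ribosome",
    "D": "Golgi apparatus",
    "answer": "B",
}

prompt = Template(
    "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
).render(**doc)
target = ["A", "B", "C", "D"].index(doc["answer"])  # doc_to_target -> 1
```

Each choice letter is scored as a loglikelihood continuation of the prompt, and `acc`/`acc_norm` record whether the highest-scoring choice matches the target index.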
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aexams
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
"dataset_name": "Biology"
"description": "قم بالإجابة على مايلي في مجال العلوم الحيوية\n\n"
"include": "_default_template_yaml"
"task": "aexams_Biology"
"dataset_name": "IslamicStudies"
"description": "قم بالإجابة على مايلي في مجال العلوم الإسلامية \n\n"
"include": "_default_template_yaml"
"task": "aexams_IslamicStudies"
"dataset_name": "Physics"
"description": "قم بالإجابة على مايلي في مجال الفيزياء \n\n"
"include": "_default_template_yaml"
"task": "aexams_Physics"
"dataset_name": "Science"
"description": "قم بالإجابة على مايلي في مجال العلوم \n\n"
"include": "_default_template_yaml"
"task": "aexams_Science"
"dataset_name": "Social"
"description": "قم بالإجابة على مايلي في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "aexams_Social"
# AGIEval
### Paper
Title: AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models
Abstract: https://arxiv.org/abs/2304.06364
AGIEval is a human-centric benchmark specifically designed to evaluate the general abilities of foundation models in tasks pertinent to human cognition and problem-solving.
This benchmark is derived from 20 official, public, and high-standard admission and qualification exams intended for general human test-takers, such as general college admission tests (e.g., Chinese College Entrance Exam (Gaokao) and American SAT), law school admission tests, math competitions, lawyer qualification tests, and national civil service exams.
Homepage: https://github.com/ruixiangcui/AGIEval
### Citation
```
@misc{zhong2023agieval,
title={AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models},
author={Wanjun Zhong and Ruixiang Cui and Yiduo Guo and Yaobo Liang and Shuai Lu and Yanlin Wang and Amin Saied and Weizhu Chen and Nan Duan},
year={2023},
eprint={2304.06364},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
Please make sure to cite all the individual datasets in your paper when you use them. We provide the relevant citation information below:
```
@inproceedings{ling-etal-2017-program,
title = "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
author = "Ling, Wang and
Yogatama, Dani and
Dyer, Chris and
Blunsom, Phil",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P17-1015",
doi = "10.18653/v1/P17-1015",
pages = "158--167",
abstract = "Solving algebraic word problems requires executing a series of arithmetic operations{---}a program{---}to obtain a final answer. However, since programs can be arbitrarily complicated, inducing them directly from question-answer pairs is a formidable challenge. To make this task more feasible, we solve these problems by generating answer rationales, sequences of natural language and human-readable mathematical expressions that derive the final answer through a series of small steps. Although rationales do not explicitly specify programs, they provide a scaffolding for their structure via intermediate milestones. To evaluate our approach, we have created a new 100,000-sample dataset of questions, answers and rationales. Experimental results show that indirect supervision of program learning via answer rationales is a promising strategy for inducing arithmetic programs.",
}
@inproceedings{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
@inproceedings{Liu2020LogiQAAC,
title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
booktitle={International Joint Conference on Artificial Intelligence},
year={2020}
}
@inproceedings{zhong2019jec,
title={JEC-QA: A Legal-Domain Question Answering Dataset},
author={Zhong, Haoxi and Xiao, Chaojun and Tu, Cunchao and Zhang, Tianyang and Liu, Zhiyuan and Sun, Maosong},
booktitle={Proceedings of AAAI},
year={2020},
}
@article{Wang2021FromLT,
title={From LSAT: The Progress and Challenges of Complex Reasoning},
author={Siyuan Wang and Zhongkun Liu and Wanjun Zhong and Ming Zhou and Zhongyu Wei and Zhumin Chen and Nan Duan},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
year={2021},
volume={30},
pages={2201-2216}
}
```
### Groups and Tasks
#### Groups
- `agieval`: Evaluates all tasks listed below.
- `agieval_en`: Evaluates all English subtasks: `agieval_aqua_rat`, `agieval_gaokao_english`, `agieval_logiqa_en`, `agieval_lsat_*`, `agieval_sat_*`, `agieval_math`
- `agieval_cn`: Evaluates all Chinese subtasks:
`agieval_gaokao_biology`, `agieval_gaokao_chemistry`, `agieval_gaokao_chinese`, `agieval_gaokao_geography`,
`agieval_gaokao_history`, `agieval_gaokao_mathqa`, `agieval_gaokao_mathcloze`, `agieval_gaokao_physics`, `agieval_jec_qa_ca`, `agieval_jec_qa_kd`, `agieval_logiqa_zh`
- `agieval_nous`: Evaluates a specific subset of AGIEval tasks (multiple-choice and English-only), namely those in https://github.com/teknium1/LLM-Benchmark-Logs/blob/main/benchmark-logs/Mistral-7B-Base.md
#### Tasks
- `agieval_aqua_rat`
- `agieval_gaokao_biology`
- `agieval_gaokao_chemistry`
- `agieval_gaokao_chinese`
- `agieval_gaokao_english`
- `agieval_gaokao_geography`
- `agieval_gaokao_history`
- `agieval_gaokao_mathqa`
- `agieval_gaokao_mathcloze`
- `agieval_gaokao_physics`
- `agieval_jec_qa_ca`
- `agieval_jec_qa_kd`
- `agieval_logiqa_en`
- `agieval_logiqa_zh`
- `agieval_lsat_ar`
- `agieval_lsat_lr`
- `agieval_lsat_rc`
- `agieval_sat_en`
- `agieval_sat_en_without_passage`
- `agieval_sat_math`
- `agieval_math`
group:
- agieval
- agieval_en
- agieval_nous
task: agieval_aqua_rat
dataset_path: hails/agieval-aqua-rat
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "{{choices}}"
process_results: !function utils.process_results_mcqa
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_biology
dataset_path: hails/agieval-gaokao-biology
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_chemistry
dataset_path: hails/agieval-gaokao-chemistry
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_chinese
dataset_path: hails/agieval-gaokao-chinese
include: aqua-rat.yaml
group:
- agieval
- agieval_en # categorizing as EN because the AGIEval codebase lists this as in `english_qa_tasks`
task: agieval_gaokao_english
dataset_path: hails/agieval-gaokao-english
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_geography
dataset_path: hails/agieval-gaokao-geography
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_history
dataset_path: hails/agieval-gaokao-history