Commit 08218829 authored by lintangsutawika

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

parents 51afaca2 a97fde23
...@@ -140,10 +140,16 @@ lm_eval --model vllm \
--tasks lambada_openai \
--batch_size auto
```
For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation.
vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.
> [!Tip]
> For fastest performance, we recommend using `--batch_size auto` for vLLM whenever possible, to leverage its continuous batching functionality!
> [!Tip]
> Passing `max_model_len=4096` or some other reasonable default to vLLM through model args can speed up evaluation or prevent out-of-memory errors when using auto batch size, for example with Mistral-7B-v0.1, which defaults to a maximum length of 32k.
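Via the Python API this could look like the following. This is a minimal sketch: the model name, the `max_model_len` value, and the `gpu_memory_utilization` setting are illustrative, not prescriptive.

```python
import lm_eval

# Illustrative only: cap the model's context window so that auto batch sizing
# does not over-allocate KV-cache memory for long-default-context models.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args="pretrained=mistralai/Mistral-7B-v0.1,max_model_len=4096,gpu_memory_utilization=0.8",
    tasks=["lambada_openai"],
    batch_size="auto",
)
```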
### Model APIs and Inference Servers
Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers.
...@@ -240,9 +246,6 @@ Additionally, one can provide a directory with `--use_cache` to cache the result
For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!
> [!Tip]
> Running lm-evaluation-harness as an external library and can't find (almost) any tasks available? Run `lm_eval.tasks.initialize_tasks()` to load the library's stock tasks before calling `lm_eval.evaluate()` or `lm_eval.simple_evaluate()` !
## Visualizing Results
You can seamlessly visualize and analyze the results of your evaluation harness runs using both Weights & Biases (W&B) and Zeno.
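As a minimal sketch of the W&B path, mirroring how `cli_evaluate` constructs the logger from `--wandb_args` (the post-run logging calls are assumptions based on the library's W&B integration and may differ from the exact `WandbLogger` API):

```python
import lm_eval
from lm_eval.logging_utils import WandbLogger
from lm_eval.utils import simple_parse_args_string

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["lambada_openai"],
)

# Built the same way cli_evaluate builds it from --wandb_args.
wandb_logger = WandbLogger(**simple_parse_args_string("project=lm-eval,job_type=eval"))
# Assumed logging calls; consult the W&B integration docs for the exact API.
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
```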
...
...@@ -112,8 +112,8 @@ my_model = initialize_my_model()
# - `Your_LM.generate_until()`
lm_obj = Your_LM(model=my_model, batch_size=16)
# optional: the task_manager indexes tasks including ones
# specified by the user through `include_path`.
task_manager = lm_eval.tasks.TaskManager(
include_path="/path/to/custom/yaml"
)
...@@ -138,9 +138,9 @@ task_dict = lm_eval.tasks.get_task_dict(
# custom paths is required.
)
results = evaluate(
lm=lm_obj,
task_dict=task_dict,
...
)
```
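Once `evaluate()` (or `simple_evaluate()`) returns, the results dictionary can be summarized the same way the CLI does, for example with `make_table` (a short sketch):

```python
from lm_eval.utils import make_table

# `results` is the dictionary returned by evaluate()/simple_evaluate() above.
print(make_table(results))
```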
...@@ -13,7 +13,7 @@ import numpy as np
from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.logging_utils import WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import make_table, simple_parse_args_string
...@@ -53,13 +53,30 @@ def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
return items
def check_argument_types(parser: argparse.ArgumentParser):
"""
Check to make sure all CLI args are typed, raises error if not
"""
for action in parser._actions:
if action.dest != "help" and not action.const:
if action.type is None:
raise ValueError(
f"Argument '{action.dest}' doesn't have a type specified."
)
else:
continue
def setup_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
"--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
)
parser.add_argument(
"--tasks",
"-t",
default=None,
type=str,
metavar="task1,task2",
help="To get full list of tasks, use the command lm-eval --tasks list",
)
...@@ -67,6 +84,7 @@ def parse_eval_args() -> argparse.Namespace:
"--model_args",
"-a",
default="",
type=str,
help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`", help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
) )
parser.add_argument( parser.add_argument(
...@@ -164,6 +182,7 @@ def parse_eval_args() -> argparse.Namespace: ...@@ -164,6 +182,7 @@ def parse_eval_args() -> argparse.Namespace:
) )
parser.add_argument( parser.add_argument(
"--gen_kwargs", "--gen_kwargs",
type=str,
default=None,
help=(
"String arguments for model generation on greedy_until tasks,"
...@@ -180,6 +199,7 @@ def parse_eval_args() -> argparse.Namespace:
)
parser.add_argument(
"--wandb_args",
type=str,
default="", default="",
help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval", help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval",
) )
...@@ -209,13 +229,19 @@ def parse_eval_args() -> argparse.Namespace: ...@@ -209,13 +229,19 @@ def parse_eval_args() -> argparse.Namespace:
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub", help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
) )
return parser
def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
check_argument_types(parser)
return parser.parse_args()
def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if not args:
# we allow for args to be passed externally, else we parse them ourselves
parser = setup_parser()
args = parse_eval_args(parser)
if args.wandb_args:
wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))
...@@ -232,7 +258,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
"Specify --output_path if providing --log_samples or --predict_only"
)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
if args.limit:
...@@ -305,7 +332,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
)
eval_logger.info(f"Selected Tasks: {task_names}")
eval_logger.info("Loading selected tasks...")
request_caching_args = request_caching_arg_to_dict(
cache_requests=args.cache_requests
...
...@@ -66,11 +66,11 @@ class LM(abc.ABC):
multiple chunks, the last input will still be a full-sized context.
Example:
Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
Prefix: BOS/EOS
Max context length: 4
Resulting input/prediction pairs:
INPUT: BOS 0 1 2
PRED: 0 1 2 3
INPUT: 3 4 5 6
...@@ -90,7 +90,8 @@ class LM(abc.ABC):
:return: list[tuple[float]]
A list of tuples (logprob,)
logprob: float
The log probability of `context` conditioned on the BOS/EOS token.
Can also be overridden for custom cases by `prefix_token_id`.
""" """
pass pass
...@@ -283,6 +284,11 @@ class TemplateLM(LM): ...@@ -283,6 +284,11 @@ class TemplateLM(LM):
def eot_token_id(self): def eot_token_id(self):
pass pass
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
return self.eot_token_id
@abc.abstractmethod
def tok_encode(self, string: str, **kwargs):
pass
...@@ -317,9 +323,9 @@ class TemplateLM(LM):
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# BOS or EOS as context
context_enc, continuation_enc = (
[self.prefix_token_id],
self.tok_encode(continuation),
)
else:
...
...@@ -392,7 +392,7 @@ class Task(abc.ABC):
# used with caching
og_limit = limit
cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
cached_instances = load_from_cache(file_name=cache_key)
...
import itertools
import logging
import random
import time
from collections import defaultdict
from typing import TYPE_CHECKING, List, Optional, Union
...@@ -106,6 +107,7 @@ def simple_evaluate(
Dictionary of results
"""
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
start_date = time.time()
if delete_requests_cache:
eval_logger.info("Deleting requests cache...")
...@@ -146,9 +148,22 @@ def simple_evaluate(
if isinstance(model, str):
if model_args is None:
eval_logger.warning("model_args not specified. Using defaults.")
model_args = "" model_args = ""
if "pretrained" not in model_args and model in [
"hf-auto",
"hf",
"huggingface",
"vllm",
]:
eval_logger.warning(
"pretrained not specified. Using default pretrained=gpt2."
)
if isinstance(model_args, dict):
eval_logger.info(
f"Initializing {model} model, with arguments: {model_args}"
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
model_args,
{
...@@ -159,6 +174,9 @@ def simple_evaluate(
)
else:
eval_logger.info(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
model_args,
{
...@@ -170,6 +188,7 @@ def simple_evaluate(
else:
if not isinstance(model, lm_eval.api.model.LM):
raise TypeError
eval_logger.info("Using pre-initialized model")
lm = model
if use_cache is not None:
...@@ -187,10 +206,6 @@ def simple_evaluate(
if task_manager is None:
task_manager = TaskManager(verbosity)
eval_logger.info(
"get_task_dict has been updated to accept an optional argument, `task_manager`"
"Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
...@@ -213,6 +228,8 @@ def simple_evaluate(
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
...@@ -223,6 +240,10 @@ def simple_evaluate(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
task_obj.set_config(key="num_fewshot", value=0)
if check_integrity:
run_task_tests(task_list=tasks)
...@@ -262,6 +283,7 @@ def simple_evaluate(
"gen_kwargs": gen_kwargs,
}
results["git_hash"] = get_git_commit_hash()
results["date"] = start_date
add_env_info(results) # additional environment info to results
return results
else:
...
...@@ -397,7 +397,8 @@ class WandbLogger:
self.run.log({f"{group}_eval_results": grouped_df})
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
...@@ -415,6 +416,11 @@ def get_commit_from_path(repo_path: Path) -> Optional[str]:
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
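With the widened signature and the new `try/except`, callers can pass either a `Path` or a plain string and simply receive `None` on failure; a small sketch:

```python
from pathlib import Path

from lm_eval.logging_utils import get_commit_from_path

# Both forms are now accepted; an unreadable or missing .git yields None
# (with a debug log) instead of raising.
print(get_commit_from_path(Path.cwd()))
print(get_commit_from_path("."))
```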
...
...@@ -99,6 +99,7 @@ class HFLM(TemplateLM):
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
add_bos_token: Optional[bool] = False,
prefix_token_id: Optional[int] = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
...@@ -340,6 +341,12 @@ class HFLM(TemplateLM):
self._rank = 0
self._world_size = 1
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
)
@property
def config(self):
# return the associated transformers.AutoConfig for the given pretrained model.
...@@ -358,6 +365,15 @@ class HFLM(TemplateLM):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
...@@ -812,7 +828,7 @@ class HFLM(TemplateLM):
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.prefix_token_id,
max_seq_len=self.max_length,
context_len=1,
),
...@@ -1149,7 +1165,7 @@ class HFLM(TemplateLM):
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
...
...@@ -305,6 +305,11 @@ class NEURON_HF(TemplateLM):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
return self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
...@@ -460,7 +465,7 @@ class NEURON_HF(TemplateLM):
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.prefix_token_id,
max_seq_len=self.max_length,
context_len=1,
),
...@@ -659,7 +664,7 @@ class NEURON_HF(TemplateLM):
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
...
...@@ -281,7 +281,7 @@ class OpenaiCompletionsLM(TemplateLM):
**{
k: v
for k, v in request_args.items()
if k not in {"do_sample", "max_gen_toks", "until"}
},
)
for resp, (context, args_) in zip(response.choices, chunk):
...
...@@ -42,6 +42,7 @@ class VLLM(TemplateLM):
tokenizer_mode: Literal["auto", "slow"] = "auto",
tokenizer_revision: Optional[str] = None,
add_bos_token: Optional[bool] = False,
prefix_token_id: Optional[int] = None,
tensor_parallel_size: int = 1,
quantization: Optional[str] = None,
max_gen_toks: int = 256,
...@@ -118,6 +119,11 @@ class VLLM(TemplateLM):
tokenizer_revision=tokenizer_revision,
)
self.add_bos_token = add_bos_token
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
)
self._max_gen_toks = max_gen_toks
...@@ -126,6 +132,15 @@ class VLLM(TemplateLM):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
...
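Taken together, the `prefix_token_id` additions above (to `TemplateLM`, `HFLM`, `NEURON_HF`, and `VLLM`) let users override which token is used as the loglikelihood prefix instead of always falling back to the EOT token. A minimal, hedged sketch of opting in through model args (the token id `0` is purely illustrative, and forwarding `prefix_token_id` from the arg string is assumed based on the constructors shown above):

```python
import lm_eval

# Assumes HFLM receives `prefix_token_id` from the model_args string, as its
# __init__ signature above suggests; the token id 0 is only an example value.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m,prefix_token_id=0",
    tasks=["lambada_openai"],
)
```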
# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked if and only if a task is implemented in the refactor and tested for regression. Tasks should be struck through once they have been checked *against the implementation from the paper that originally introduced the task* (or against a popularizing implementation). (WIP) denotes that a PR or person is already working on that task.
- [x] Glue
- [x] SuperGlue
- [x] CoQA
- [x] DROP
- [x] ~~Lambada~~
- [x] Lambada (Cloze variants)
- [x] ~~Lambada (Multilingual)~~
- [x] Wikitext
- [x] PiQA
- [x] PROST
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
- [x] LogiQA
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2 (Lintang)
- [x] RACE
- [x] HeadQA
- [x] MathQA
- [x] WebQs
- [x] WSC273
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
- [x] Arithmetic
- [ ] MMMLU (Hailey)
- [x] Translation (WMT) suite
- [x] Unscramble
- [x] ~~Pile (perplexity)~~
- [x] BLiMP
- [x] ToxiGen
- [x] StoryCloze
- [ ] NaturalQs (Hailey)
- [x] CrowS-Pairs
- [x] XCopa
- [ ] BIG-Bench (Hailey)
- [x] XStoryCloze
- [x] XWinograd
- [x] PAWS-X
- [x] XNLI
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
# Task Wishlist
- [ ] TheoremQA
- [ ] Theorem Proving evaluations
- [ ] Chain of Thought
- [ ] Self-consistency ; Least-to-Most prompting, etc.
- [ ] Summarization Tasks
- [ ] Anthropic Model-Written Evals
...@@ -356,28 +356,6 @@ class TaskManager:
return tasks_and_groups
def include_path(task_dir):
logger = utils.eval_logger
logger.setLevel(getattr(logging, "INFO"))
logger.info(
"To still use tasks loaded from args.include_path,"
"see an example of the new TaskManager API in "
"https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
return 0
def initialize_tasks(verbosity="INFO"):
logger = utils.eval_logger
logger.setLevel(getattr(logging, f"{verbosity}"))
logger.info(
"lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
"It will be removed in v0.4.2 release. "
"TaskManager will instead be used."
)
return 0
def get_task_name_from_config(task_config: Dict[str, str]) -> str:
if "task" in task_config:
return task_config["task"]
...@@ -401,7 +379,7 @@ def get_task_name_from_object(task_object):
def get_task_dict(
task_name_list: Union[str, List[Union[str, Dict, Task]]],
task_manager: Optional[TaskManager] = None,
):
"""Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
...@@ -423,6 +401,15 @@ def get_task_dict(
if isinstance(task_name_list, str):
task_name_list = [task_name_list]
elif isinstance(task_name_list, list):
if not all([isinstance(task, (str, dict, Task)) for task in task_name_list]):
raise TypeError(
"Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match."
)
else:
raise TypeError(
f"Expected a 'str' or 'list' but received {type(task_name_list)}."
)
string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
others_task_name_list = [task for task in task_name_list if not isinstance(task, str)]
...
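A short sketch of the stricter input validation added to `get_task_dict` above (assuming a default-constructed `TaskManager`):

```python
import lm_eval

task_manager = lm_eval.tasks.TaskManager()

# A bare string is still accepted and wrapped into a single-element list...
task_dict = lm_eval.tasks.get_task_dict("lambada_openai", task_manager)

# ...but anything that is not a str, or a list of str/dict/Task, now raises TypeError.
try:
    lm_eval.tasks.get_task_dict(123, task_manager)
except TypeError as err:
    print(err)
```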
# ACLUE
### Paper
Can Large Language Model Comprehend Ancient Chinese? A Preliminary Test on ACLUE
https://arxiv.org/abs/2310.09550
The Ancient Chinese Language Understanding Evaluation (ACLUE) is an evaluation benchmark focused on ancient Chinese language comprehension. It aims to assess the performance of large-scale language models on understanding ancient Chinese. The benchmark comprises 15 tasks spanning various domains, including lexical, syntactic, semantic, inference, and knowledge. ACLUE's tasks are derived from a combination of manually curated questions from publicly available resources and automatically generated questions from classical Chinese language corpora. The questions span from the Xia dynasty (2070 BCE) to the Ming dynasty (1368 CE). ACLUE adopts a multiple-choice question format for all tasks.
Homepage: https://github.com/isen-zhang/ACLUE
### Citation
```bibtex
@inproceedings{zhang-li-2023-large,
title = "Can Large Langauge Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}",
author = "Zhang, Yixuan and Li, Haonan",
booktitle = "Proceedings of the Ancient Language Processing Workshop",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.alp-1.9",
pages = "80--87"
}
```
### Groups and Tasks
#### Groups
- `aclue`: All 15 subjects of the ACLUE dataset, evaluated following the methodology in CMMLU's original implementation.
#### Tasks
The following tasks evaluate subjects in the ACLUE dataset using loglikelihood-based multiple-choice scoring:
- `aclue_{subject_english}`
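For instance, the whole group (or any single subject task) can be evaluated like any other task. A hedged sketch via the Python API (the model choice here is illustrative only):

```python
import lm_eval

# Evaluate the full `aclue` group; pass e.g. ["aclue_ancient_literature"]
# instead to run a single subject. The model below is just an example.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["aclue"],
)
```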
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aclue
dataset_path: tyouisen/aclue
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
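For illustration, the template above maps a dataset row to a prompt and a gold answer index roughly as follows (a sketch with made-up field values; the real rows come from `tyouisen/aclue`):

```python
# Hypothetical ACLUE-style row; the actual fields are Question, A-D, and Answer.
doc = {"Question": "……", "A": "甲", "B": "乙", "C": "丙", "D": "丁", "Answer": "B"}

# doc_to_text: the question, the four lettered options, then the cue "答案:".
prompt = (
    f"{doc['Question'].strip()}\n"
    f"A. {doc['A']}\nB. {doc['B']}\nC. {doc['C']}\nD. {doc['D']}\n答案:"
)

# doc_to_choice / doc_to_target: the letters "A".."D" are scored as
# continuations, and the gold label is the index of the Answer letter.
choices = ["A", "B", "C", "D"]
target = choices.index(doc["Answer"])  # -> 1
```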
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.utils import eval_logger
SUBJECTS = {
"古文单字多义": "polysemy_resolution",
"诗词情感分类": "poetry_sentiment_analysis",
"古汉语命名体识别": "named_entity_recognition",
"古汉语知识": "basic_ancient_chinese",
"古诗词上下句预测": "poetry_context_prediction",
"古文断句": "sentence_segmentation",
"对联": "couplet_prediction",
"古诗词曲鉴赏": "poetry_appreciate",
"国学常识": "ancient_chinese_culture",
"古音学": "ancient_phonetics",
"通假字": "homographic_character_resolution",
"古代文学知识": "ancient_literature",
"医古文": "ancient_medical",
"古诗词质量评估": "poetry_quality_assessment",
"古文阅读理解": "reading_comprehension",
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="aclue")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
for subject_zh, subject_eng in tqdm(SUBJECTS.items()):
if args.cot_prompt_path is not None:
description = cot_file[subject_eng]
else:
description = (
f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n"
)
yaml_dict = {
"include": base_yaml_name,
"task": f"aclue_{args.task_prefix}_{subject_eng}"
if args.task_prefix != ""
else f"aclue_{subject_eng}",
"dataset_name": subject_eng,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
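Running this generator (for example with `python _generate_configs.py --base_yaml_path _default_template_yaml --save_prefix_path aclue`; the script filename is an assumption, while the flags come from `parse_args` above) writes one YAML per subject that includes `_default_template_yaml`, like the four shown next.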
"dataset_name": "ancient_chinese_culture"
"description": "以下是关于国学常识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_chinese_culture"
"dataset_name": "ancient_literature"
"description": "以下是关于古代文学知识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_literature"
"dataset_name": "ancient_medical"
"description": "以下是关于医古文的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_medical"
"dataset_name": "ancient_phonetics"
"description": "以下是关于古音学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_phonetics"