Commit 25869601 authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/hf_vlms.py
parents 56f40c53 c1d8795d
@@ -8,6 +8,7 @@ build
 dist
 *.egg-info
 venv
+.venv/
 .vscode/
 temp
 __pycache__
......
@@ -2,7 +2,7 @@
 exclude: ^tests/testdata/
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
    hooks:
      - id: check-added-large-files
      - id: check-ast
@@ -29,7 +29,7 @@ repos:
      - id: mixed-line-ending
        args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.8
+    rev: v0.6.8
    hooks:
      # Run the linter.
      - id: ruff
......
@@ -54,7 +54,7 @@ The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's pop
 To install the `lm-eval` package from the github repository, run:
 
 ```bash
-git clone https://github.com/EleutherAI/lm-evaluation-harness
+git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
 cd lm-evaluation-harness
 pip install -e .
 ```
......
This diff is collapsed.
@@ -68,6 +68,7 @@
    "source": [
     "import wandb\n",
     "\n",
+    "\n",
     "wandb.login()"
    ]
   },
@@ -130,6 +131,7 @@
     "import lm_eval\n",
     "from lm_eval.loggers import WandbLogger\n",
     "\n",
+    "\n",
     "results = lm_eval.simple_evaluate(\n",
     "    model=\"hf\",\n",
     "    model_args=\"pretrained=microsoft/phi-2,trust_remote_code=True\",\n",
......
@@ -431,7 +431,12 @@ class TemplateLM(LM):
        using_default_template = False
 
        # First, handle the cases when the model has a dict of multiple templates
-        template = self.tokenizer.chat_template or self.tokenizer.default_chat_template
+        try:
+            template = (
+                self.tokenizer.chat_template or self.tokenizer.default_chat_template
+            )
+        except AttributeError:
+            return None
 
        if isinstance(template, dict):
            using_default_dict = self.tokenizer.chat_template is None
......
@@ -57,7 +57,6 @@ class TaskConfig(dict):
    task: Optional[str] = None
    task_alias: Optional[str] = None
    tag: Optional[Union[str, list]] = None
-    group: Optional[Union[str, list]] = None
    # HF dataset options.
    # which dataset to use,
    # and what splits for what purpose
@@ -98,18 +97,6 @@ class TaskConfig(dict):
        )
 
    def __post_init__(self) -> None:
-        if self.group is not None:
-            eval_logger.warning(
-                "A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information."
-            )
-
-            if self.tag is None:
-                self.tag = self.group
-            else:
-                raise ValueError(
-                    "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4."
-                )
-
        if self.generation_kwargs is not None:
            if self.output_type != "generate_until":
                eval_logger.warning(
@@ -1511,7 +1498,7 @@ class ConfigurableTask(Task):
        # we expect multiple_targets to be a list.
        elif self.multiple_target:
            gold = list(gold)
-        elif type(gold) != type(result):
+        elif type(gold) is not type(result):
            # cast gold to the same type as result
            gold = type(result)(gold)
@@ -1594,7 +1581,7 @@ class ConfigurableTask(Task):
            f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
            f"output_type={self.OUTPUT_TYPE},"
            f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
-            f"num_samples={len(self.eval_docs)})",
+            f"num_samples={len(self.eval_docs)})"
        )
......
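The `type(gold) != type(result)` → `type(gold) is not type(result)` change in the `@@ -1511` hunk above follows the flake8/ruff E721 rule: exact type checks should compare classes by identity. A minimal, self-contained illustration of the cast this guard performs (the values here are made up for the example):

```python
gold, result = "2", 2.0
# identity check on the classes themselves, as in the updated hunk
if type(gold) is not type(result):
    gold = type(result)(gold)  # cast "2" -> 2.0 so gold matches result's type
assert gold == result
```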
@@ -157,6 +157,9 @@ def simple_evaluate(
        seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
        torch.manual_seed(torch_random_seed)
 
+    if fewshot_random_seed is not None:
+        seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
+
    if seed_message:
        eval_logger.info(" | ".join(seed_message))
@@ -276,9 +279,6 @@ def simple_evaluate(
                    task_obj.set_config(key="num_fewshot", value=0)
                # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
                task_obj.set_fewshot_seed(seed=fewshot_random_seed)
-                eval_logger.info(
-                    f"Setting fewshot random generator seed to {fewshot_random_seed}"
-                )
 
            adjusted_task_dict[task_name] = task_obj
@@ -433,10 +433,14 @@ def evaluate(
            )
    # end multimodality validation check
 
+    # Cache the limit arg.
+    limit_arg = limit
+    limits = []
    for task_output in eval_tasks:
        task: Task = task_output.task
-        limit = get_sample_size(task, limit)
+        limit = get_sample_size(task, limit_arg)
+        limits.append(limit)
        task.build_all_requests(
            limit=limit,
            rank=lm.rank,
@@ -506,7 +510,7 @@ def evaluate(
    WORLD_SIZE = lm.world_size
 
    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
-    for task_output in eval_tasks:
+    for task_output, limit in zip(eval_tasks, limits):
        task = task_output.task
        task.apply_filters()
@@ -655,7 +659,7 @@ def evaluate(
                    len(task_output.task.eval_docs),
                ),
            }
-            for task_output in eval_tasks
+            for task_output, limit in zip(eval_tasks, limits)
        },
    }
    if log_samples:
......
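The `limit_arg`/`limits` bookkeeping added to `evaluate()` above keeps the caller's original `limit` argument intact: previously `limit = get_sample_size(task, limit)` overwrote the loop variable, so the value resolved for the first task leaked into every later task, and the postprocessing loops had no record of each task's own limit. A rough sketch of the pattern with hypothetical stand-in task objects (`num_docs` and `resolve_limits` are illustrative names, not harness APIs):

```python
from types import SimpleNamespace


def resolve_limits(tasks, limit_arg=None):
    limits = []
    for task in tasks:
        # always resolve from the original argument, never from a previously resolved value
        if limit_arg is None:
            limit = None
        elif limit_arg < 1.0:
            limit = int(task.num_docs * limit_arg)  # fractional limits scale per task
        else:
            limit = int(limit_arg)
        limits.append(limit)
    return limits


tasks = [SimpleNamespace(num_docs=100), SimpleNamespace(num_docs=40)]
print(resolve_limits(tasks, limit_arg=0.5))  # [50, 20]: each task keeps its own resolved limit
```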
@@ -73,9 +73,12 @@ class TemplateAPI(TemplateLM):
        seed: int = 1234,
        max_length: Optional[int] = 2048,
        add_bos_token: bool = False,
-        custom_prefix_token_id=None,
+        custom_prefix_token_id: int = None,
        # send the requests as tokens or strings
-        tokenized_requests=True,
+        tokenized_requests: bool = True,
+        trust_remote_code: bool = False,
+        revision: Optional[str] = "main",
+        use_fast_tokenizer: bool = True,
        **kwargs,
    ) -> None:
        super().__init__()
@@ -128,7 +131,10 @@ class TemplateAPI(TemplateLM):
            import transformers
 
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                self.tokenizer if self.tokenizer else self.model
+                self.tokenizer if self.tokenizer else self.model,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                use_fast=use_fast_tokenizer,
            )
            # Not used as the API will handle padding but to mirror the behavior of the HFLM
            self.tokenizer = configure_pad_token(self.tokenizer)
@@ -153,6 +159,9 @@ class TemplateAPI(TemplateLM):
            assert isinstance(tokenizer, str), "tokenizer must be a string"
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                tokenizer,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                use_fast=use_fast_tokenizer,
            )
 
    @abc.abstractmethod
......
@@ -26,9 +26,9 @@ class DummyLM(LM):
    def generate_until(self, requests, disable_tqdm: bool = False):
        res = []
 
-        for ctx, _ in tqdm(requests, disable=disable_tqdm):
+        for request in tqdm(requests, disable=disable_tqdm):
            res.append("lol")
-            assert ctx.strip() != ""
+            assert request.arguments[0].strip() != ""
 
        return res
......
@@ -13,6 +13,7 @@ from lm_eval.api.registry import register_model
 from lm_eval.models.huggingface import HFLM
 from lm_eval.models.utils import (
    Collator,
+    flatten_image_list,
    pad_and_concat,
    replace_placeholders,
    stop_sequences_criteria,
@@ -295,6 +296,11 @@ class HFMultimodalLM(HFLM):
        images = [img[: self.max_images] for img in images]
        if self.rgb:
            images = [[img.convert("RGB") for img in sublist] for sublist in images]
+
+        # certain models like llava expect a single-level image list even for bs>1, multi-image. TODO: port this over to loglikelihoods
+        if getattr(self.config, "model_type", "") == "llava":
+            images = flatten_image_list(images)
+
        try:
            encoding = self.processor(
                images=images,
......
@@ -55,7 +55,7 @@ class HFLM(TemplateLM):
    def __init__(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
-        backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
+        backend: Literal["default", "causal", "seq2seq"] = "default",
        # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
        revision: Optional[str] = "main",
        subfolder: Optional[str] = None,
@@ -90,7 +90,6 @@ class HFLM(TemplateLM):
        **kwargs,
    ) -> None:
        super().__init__()
-
        # optionally: take in an already-initialized transformers.PreTrainedModel
        if not isinstance(pretrained, str):
            eval_logger.warning(
@@ -164,7 +163,7 @@ class HFLM(TemplateLM):
                trust_remote_code=trust_remote_code,
            )
 
-        # determine which of 'causal' and 'seq2seq' backends to use
+        # determine which of 'causal' and 'seq2seq' backends to use for HF models
        self._get_backend(
            config=self.config, backend=backend, trust_remote_code=trust_remote_code
        )
@@ -287,7 +286,7 @@ class HFLM(TemplateLM):
    def _get_accelerate_args(
        self,
-        parallelize: bool = None,
+        parallelize: Optional[bool] = None,
        device_map: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
@@ -441,31 +440,26 @@ class HFLM(TemplateLM):
    def _get_backend(
        self,
        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
-        backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
+        backend: Literal["default", "causal", "seq2seq"] = "default",
        trust_remote_code: Optional[bool] = False,
    ) -> None:
        """
        Helper method during initialization.
-        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
-        model type to be used.
+        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
        sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
+
+        **If not calling HFLM.__init__() or HFLM._get_backend() within a subclass of HFLM,
+        user must set `self.backend` to be either "causal" or "seq2seq" manually!**
        """
-        # escape hatch: if we're using a subclass that shouldn't follow
-        # the default _get_backend logic,
-        # then skip over the method.
-        # TODO: this seems very much undesirable in some cases--our code in HFLM
-        # references AutoModelForCausalLM at times to check for equality
-        if self.AUTO_MODEL_CLASS is not None:
-            return
-
        assert backend in ["default", "causal", "seq2seq"]
 
        if backend != "default":
            # if we've settled on non-default backend, use that manually
            if backend == "causal":
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+                self.backend = backend
            elif backend == "seq2seq":
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
+                self.backend = backend
            eval_logger.info(
                f"Overrode HF model backend type, and using type '{backend}'"
            )
@@ -478,26 +472,32 @@ class HFLM(TemplateLM):
                # first check if model type is listed under seq2seq models, since some
                # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
                # these special cases should be treated as seq2seq models.
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
+                self.backend = "seq2seq"
+                eval_logger.info(f"Using model type '{backend}'")
            elif (
                getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
            ):
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+                self.backend = "causal"
+                eval_logger.info(f"Using model type '{backend}'")
            else:
                if not trust_remote_code:
                    eval_logger.warning(
                        "HF model type is neither marked as CausalLM or Seq2SeqLM. \
                    This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
+                        "Setting backend to causal"
                    )
                # if model type is neither in HF transformers causal or seq2seq model registries
-                # then we default to AutoModelForCausalLM
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+                # then we default to assuming AutoModelForCausalLM
+                self.backend = "causal"
+                eval_logger.info(
+                    f"Model type cannot be determined. Using default model type '{backend}'"
+                )
 
-        assert self.AUTO_MODEL_CLASS in [
-            transformers.AutoModelForCausalLM,
-            transformers.AutoModelForSeq2SeqLM,
-        ]
-        return None
+        if self.AUTO_MODEL_CLASS is None:
+            if self.backend == "causal":
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+            elif self.backend == "seq2seq":
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
 
    def _get_config(
        self,
@@ -505,6 +505,7 @@ class HFLM(TemplateLM):
        revision: str = "main",
        trust_remote_code: bool = False,
    ) -> None:
+        """Return the model config for HuggingFace models"""
        self._config = transformers.AutoConfig.from_pretrained(
            pretrained,
            revision=revision,
@@ -703,7 +704,7 @@ class HFLM(TemplateLM):
            # if OOM, then halves batch_size and tries again
            @find_executable_batch_size(starting_batch_size=self.max_batch_size)
            def forward_batch(batch_size):
-                if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                if self.backend == "seq2seq":
                    length = max(max_context_enc, max_cont_enc)
                    batched_conts = torch.ones(
                        (batch_size, length), device=self.device
@@ -754,7 +755,7 @@ class HFLM(TemplateLM):
        # by default for CausalLM - false or self.add_bos_token is set
        if add_special_tokens is None:
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            if self.backend == "causal":
                special_tokens_kwargs = {
                    "add_special_tokens": False or self.add_bos_token
                }
@@ -782,7 +783,7 @@ class HFLM(TemplateLM):
        self.tokenizer.padding_side = padding_side
 
        add_special_tokens = {}
-        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+        if self.backend == "causal":
            add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
 
        encoding = self.tokenizer(
@@ -860,14 +861,14 @@ class HFLM(TemplateLM):
    def _select_cont_toks(
        self, logits: torch.Tensor, contlen: int = None, inplen: int = None
    ) -> torch.Tensor:
-        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+        if self.backend == "causal":
            assert (
                contlen and inplen
            ), "Must pass input len and cont. len to select scored logits for causal LM"
            # discard right-padding.
            # also discard the input/context tokens. we'll only score continuations.
            logits = logits[inplen - contlen : inplen]
-        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+        elif self.backend == "seq2seq":
            assert (
                contlen and not inplen
            ), "Selecting scored logits for Seq2SeqLM requires only cont. len"
@@ -990,8 +991,7 @@ class HFLM(TemplateLM):
            requests,
            sort_fn=_collate,
            group_by="contexts"
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
-            and self.logits_cache
+            if self.backend == "causal" and self.logits_cache
            else None,
            group_fn=_lookup_one_token_cont,
        )
@@ -1048,14 +1048,14 @@ class HFLM(TemplateLM):
            # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
 
            # when too long to fit in context, truncate from the left
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            if self.backend == "causal":
                inp = torch.tensor(
                    (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
                    dtype=torch.long,
                    device=self.device,
                )
                (inplen,) = inp.shape
-            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+            elif self.backend == "seq2seq":
                inp = torch.tensor(
                    (context_enc)[-self.max_length :],
                    dtype=torch.long,
@@ -1095,11 +1095,11 @@ class HFLM(TemplateLM):
        # create encoder attn mask and batched conts, if seq2seq
        call_kwargs = {}
-        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+        if self.backend == "causal":
            batched_inps = pad_and_concat(
                padding_len_inp, inps, padding_side="right"
            )  # [batch, padding_len_inp]
-        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+        elif self.backend == "seq2seq":
            # TODO: left-pad encoder inps and mask?
            batched_inps = pad_and_concat(
                padding_len_inp, inps
@@ -1130,7 +1130,7 @@ class HFLM(TemplateLM):
            # from prompt/prefix tuning tokens, if applicable
            ctx_len = (
                inplen + (logits.shape[0] - padding_len_inp)
-                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+                if self.backend == "causal"
                else None
            )
            logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
@@ -1265,10 +1265,10 @@ class HFLM(TemplateLM):
            max_gen_toks = self.max_gen_toks
 
        # set the max length in tokens of inputs ("context_enc")
-        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+        if self.backend == "causal":
            # max len for inputs = max length, minus room to generate the max new tokens
            max_ctx_len = self.max_length - max_gen_toks
-        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+        elif self.backend == "seq2seq":
            # max len for inputs = encoder's whole max_length
            max_ctx_len = self.max_length
@@ -1295,7 +1295,7 @@ class HFLM(TemplateLM):
            cont_toks_list = cont.tolist()
 
            for cont_toks, context in zip(cont_toks_list, contexts):
                # discard context + left-padding toks if using causal decoder-only LM
-                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                if self.backend == "causal":
                    cont_toks = cont_toks[context_enc.shape[1] :]
 
                s = self.tok_decode(cont_toks)
......
 import copy
-import json
 import logging
-import subprocess
 from collections import defaultdict
 from typing import List, Optional, Union
@@ -33,54 +31,6 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-def get_nc_count() -> Union[int, None]:
-    """Returns the number of neuron cores on the current instance."""
-    try:
-        cmd = "neuron-ls --json-output"
-        result = subprocess.run(cmd, shell=True, capture_output=True)
-        print(f"inferring nc_count from `neuron-ls` {result.stdout}")
-        json_output = json.loads(result.stdout)
-        count = sum([x["nc_count"] for x in json_output])
-        print(f"nc_count={count}")
-        return count
-    except Exception:
-        return None
-
-
-def wrap_constant_batch_size(func):
-    def _decorator(self, input_ids):
-        """input_ids a 2D array with batch_size on dim=0
-        makes sure the func runs with self.batch_size
-        """
-        # access a from TestSample
-        batch_size = input_ids.shape[0]
-
-        if batch_size < self.batch_size:
-            # handle the event of input_ids.shape[0] != batch_size
-            # Neuron cores expect constant batch_size
-            input_ids = torch.concat(
-                (
-                    input_ids,
-                    # add missing_batch_size dummy
-                    torch.zeros(
-                        [self.batch_size - batch_size, *input_ids.size()[1:]],
-                        dtype=input_ids.dtype,
-                        device=input_ids.device,
-                    ),
-                ),
-                dim=0,
-            )
-        elif batch_size > self.batch_size:
-            raise ValueError(
-                f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
-            )
-        # return the forward pass that requires constant batch size
-        return func(self, input_ids)[:batch_size]
-
-    return _decorator
-
-
 class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
    """NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
@@ -146,7 +96,7 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
            raise ValueError(
                f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
            )
-        elif batch_size < self.batch_size:
+        elif batch_size < self.batch_size and not self.continuous_batching:
            logger.warning(
                "Inputs will be padded to match the model static batch size. This will increase latency."
            )
@@ -158,8 +108,6 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
            if attention_mask is not None:
                padding = torch.zeros(padding_shape, dtype=torch.int64)
                padded_attention_mask = torch.cat([attention_mask, padding])
-        # Drop the current generation context and clear the Key/Value cache
-        self.reset_generation()
 
        output_ids = self.generate_tokens(
            padded_input_ids,
@@ -179,8 +127,6 @@ class NEURON_HF(TemplateLM):
    Tested with neuron 2.17.0
    """
 
-    _DEFAULT_MAX_LENGTH = 2048
-
    def __init__(
        self,
        pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -203,7 +149,7 @@ class NEURON_HF(TemplateLM):
                "please install neuron via pip install transformers-neuron ",
                "also make sure you are running on an AWS inf2 instance",
            )
-        if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
+        if version.parse(optimum_neuron_version) != version.parse("0.0.24"):
            logger.warning(
                '`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
                "preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
@@ -217,35 +163,16 @@ class NEURON_HF(TemplateLM):
        self.batch_size_per_gpu = int(batch_size)
        batch_size = int(batch_size)
 
-        if tp_degree is None:
-            # execute `neuron-ls --json-output | jq '.[0].nc_count'``
-            # to get the number of neuron cores on your instance
-            tp_degree = get_nc_count()
-
-        assert isinstance(tp_degree, int), (
-            f"model_args must include tp_degree. tp_degree must be set to an integer,"
-            f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
-            "Set it to number of neuron cores on your instance."
-            " For inf2.xlarge and inf2.8xlarge, set it to `2`."
-            " For inf2.24xlarge, set it to `12`."
-            " For inf2.48xlarge, set it to `24`."
-        )
-
-        revision = str(revision)  # cast to string if not already one
-        # TODO: update this to be less of a hack once subfolder is fixed in HF
-        revision = revision + ("/" + subfolder if subfolder is not None else "")
-
        self._config = transformers.AutoConfig.from_pretrained(
            pretrained,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
-        torch_dtype = lm_eval.models.utils.get_dtype(dtype)
 
-        assert torch_dtype in [
-            torch.float16,
-            torch.bfloat16,
-        ], "Only float16 and bfloat16 are supported"
+        revision = str(revision)  # cast to string if not already one
+        # TODO: update this to be less of a hack once subfolder is fixed in HF
+        revision = revision + ("/" + subfolder if subfolder is not None else "")
 
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
@@ -254,36 +181,58 @@ class NEURON_HF(TemplateLM):
            use_fast=use_fast_tokenizer,
        )
 
-        # Neuron specific code
-        if torch_dtype == torch.float16:
-            self.amp_dtype = "f16"
-        elif torch_dtype == torch.bfloat16:
-            self.amp_dtype = "bf16"
-        elif torch_dtype == torch.float32:
-            self.amp_dtype = "f32"
-        else:
-            raise NotImplementedError("Only float16 and bfloat16 are implemented.")
-
-        compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
-        input_shapes = {
-            "batch_size": batch_size,
-            "sequence_length": self._DEFAULT_MAX_LENGTH,
-        }
-
-        print(
-            f"{'='*20} \n loading model to neuron with"
-            f" {compiler_args}, {input_shapes}..."
-        )
-        self.model = CustomNeuronModelForCausalLM.from_pretrained(
-            pretrained,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            low_cpu_mem_usage=low_cpu_mem_usage,
-            export=True,
-            **compiler_args,
-            **input_shapes,
-        )
-        print(f"SUCCESS: neuron model compiled. \n {'='*20}")
+        neuron_config = getattr(self._config, "neuron", None)
+        if neuron_config is None:
+            # Check export parameters
+            if tp_degree is not None:
+                assert isinstance(tp_degree, int), (
+                    f"tp_degree must be set to an integer,"
+                    f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
+                    "Set it to a number lower than the number of neuron cores on your instance."
+                    " For inf2.xlarge and inf2.8xlarge, set it to `2`."
+                    " For inf2.24xlarge, set it <= `12`."
+                    " For inf2.48xlarge, set it <= `24`."
+                )
+            torch_dtype = lm_eval.models.utils.get_dtype(dtype)
+
+            if torch_dtype == torch.float16:
+                self.amp_dtype = "f16"
+            elif torch_dtype == torch.bfloat16:
+                self.amp_dtype = "bf16"
+            elif torch_dtype == torch.float32:
+                self.amp_dtype = "f32"
+            else:
+                raise NotImplementedError(
+                    "Only float16/bfloat16/float32 are supported."
+                )
+
+            print(f"{'='*20} \n exporting model to neuron")
+            self.model = CustomNeuronModelForCausalLM.from_pretrained(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+                export=True,
+                batch_size=batch_size,
+                num_cores=tp_degree,
+                auto_cast_type=self.amp_dtype,
+                sequence_length=max_length,
+            )
+            neuron_config = self.model.config.neuron
+            print(
+                f"SUCCESS: neuron model exported with config {neuron_config}. \n {'='*20}"
+            )
+        else:
+            print(
+                f"{'='*20} \n loading neuron model with config" f" {neuron_config}..."
+            )
+            self.model = CustomNeuronModelForCausalLM.from_pretrained(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+            )
+            print(f"SUCCESS: neuron model loaded. \n {'='*20}")
 
        self.truncation = truncation
@@ -291,8 +240,6 @@ class NEURON_HF(TemplateLM):
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
 
        self.add_bos_token = add_bos_token
 
-        self._max_length = max_length
-
        self.batch_schedule = 1
        self.batch_sizes = {}
@@ -313,17 +260,7 @@ class NEURON_HF(TemplateLM):
    @property
    def max_length(self):
-        if self._max_length:  # if max length manually set, return it
-            return self._max_length
-        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
-        for attr in seqlen_config_attrs:
-            if hasattr(self.model.config, attr):
-                return getattr(self.model.config, attr)
-        if hasattr(self.tokenizer, "model_max_length"):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
+        return self.model.max_length
 
    @property
    def max_gen_toks(self) -> int:
@@ -391,34 +328,6 @@ class NEURON_HF(TemplateLM):
    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)
 
-    @wrap_constant_batch_size
-    def _model_call(self, input_ids: torch.Tensor):
-        """
-        get logits for the entire sequence
-
-        :param input_ids: torch.Tensor
-            A torch tensor of shape [batch, sequence_cont]
-            the size of sequence may vary from call to call
-
-        :return
-            A torch tensor of shape [batch, sequence, vocab] with the
-            logits returned from the model's decoder-lm head
-        """
-        _, sequence_length = input_ids.shape
-
-        with torch.inference_mode():
-            cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
-            input_ids_split = input_ids.split(1, dim=1)
-
-            return torch.concat(
-                [
-                    self.model.forward(
-                        input_ids=input_id, cache_ids=cache_id, return_dict=False
-                    )[0]
-                    for input_id, cache_id in zip(input_ids_split, cache_ids)
-                ],
-                dim=1,
-            )
-
    def _model_generate(self, context, max_length, stop, **generation_kwargs):
        # we require users to pass do_sample=True explicitly
        # for non-greedy gen. This should be reevaluated when considering beam search.
@@ -580,15 +489,41 @@ class NEURON_HF(TemplateLM):
                cont_toks_list.append(continuation_enc)
                inplens.append(inplen)
 
-            # create encoder attn mask and batched conts, if seq2seq
-            call_kwargs = {}
+            # Add dummy inputs up to the model static batch size
+            if len(inps) < self.batch_size:
+                inps = inps + [
+                    torch.zeros_like(inps[0]),
+                ] * (self.batch_size - len(inps))
+            masks = [torch.ones_like(inp) for inp in inps]
+
            batched_inps = lm_eval.models.utils.pad_and_concat(
                padding_len_inp, inps, padding_side="right"
            )  # [batch, padding_len_inp]
-
-            multi_logits = F.log_softmax(
-                self._model_call(batched_inps, **call_kwargs), dim=-1
-            )  # [batch, padding_length (inp or cont), vocab]
+            batched_masks = lm_eval.models.utils.pad_and_concat(
+                padding_len_inp, masks, padding_side="right"
+            )
+
+            if self.model.model.neuron_config.output_all_logits:
+                inputs = self.model.prepare_inputs_for_prefill(
+                    batched_inps, batched_masks
+                )
+                multi_logits = F.log_softmax(
+                    self.model.forward(**inputs).logits, dim=-1
+                )  # [batch, padding_length (inp or cont), vocab]
+            else:
+                # The model will only return the logits for the last input token, so we need
+                # to iterate over inputs to accumulate logits.
+                # To speed things up we use the KV cache as we would do when generating.
+                inputs = self.model.prepare_inputs_for_prefill(
+                    batched_inps[:, :1], batched_masks[:, :1]
+                )
+                outputs = [self.model.forward(**inputs).logits]
+                for i in range(1, padding_len_inp):
+                    inputs = self.model.prepare_inputs_for_decode(
+                        batched_inps[:, : i + 1], batched_masks[:, : i + 1]
+                    )
+                    outputs.append(self.model.forward(**inputs).logits)
+                multi_logits = F.log_softmax(torch.concat(outputs, dim=1), dim=-1)
 
            for (cache_key, _, _), logits, inplen, cont_toks in zip(
                chunk, multi_logits, inplens, cont_toks_list
......
@@ -69,11 +69,11 @@ class LocalCompletionsAPI(TemplateAPI):
            for choice, ctxlen in zip(out["choices"], ctxlens):
                assert ctxlen > 0, "Context length must be greater than 0"
                logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
-                tokens = choice["logprobs"]["token_logprobs"][ctxlen:-1]
+                tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
                top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1]
                is_greedy = True
-                for tok, top in zip(tokens, top_logprobs):
-                    if tok != max(top, key=top.get):
+                for tok, top in zip(tokens_logprobs, top_logprobs):
+                    if tok != max(top.values()):
                        is_greedy = False
                        break
                res.append((logprobs, is_greedy))
@@ -190,14 +190,18 @@ class OpenAICompletionsAPI(LocalCompletionsAPI):
        key = os.environ.get("OPENAI_API_KEY", None)
        if key is None:
            raise ValueError(
-                "API key not found. Please set the OPENAI_API_KEY environment variable."
+                "API key not found. Please set the `OPENAI_API_KEY` environment variable."
            )
        return key
 
    def loglikelihood(self, requests, **kwargs):
        assert (
-            self.model != "gpt-3.5-turbo"
-        ), "Loglikelihood is not supported for gpt-3.5-turbo"
+            self.model
+            in [
+                "babbage-002",
+                "davinci-002",
+            ]
+        ), f"Prompt loglikelihoods are only supported by OpenAI's API for {['babbage-002', 'davinci-002']}."
        return super().loglikelihood(requests, **kwargs)
 
    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
@@ -226,6 +230,11 @@ class OpenAIChatCompletion(LocalChatCompletion):
        key = os.environ.get("OPENAI_API_KEY", None)
        if key is None:
            raise ValueError(
-                "API key not found. Please set the OPENAI_API_KEY environment variable."
+                "API key not found. Please set the `OPENAI_API_KEY` environment variable."
            )
        return key
+
+    def loglikelihood(self, requests, **kwargs):
+        raise NotImplementedError(
+            "Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
+        )
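The greedy-decoding check rewritten in the `@@ -69` hunk above now compares the scored token's logprob against the largest value in that position's `top_logprobs` dict; the old code compared the float against the dict's argmax key (a token string), which could never be equal. A toy illustration with made-up numbers:

```python
# toy values standing in for one position of an API response, not real output
tok_logprob = -0.105                       # logprob of the token actually in the prompt
top = {" Paris": -0.105, " London": -2.3}  # top_logprobs for that position
is_greedy = tok_logprob == max(top.values())
assert is_greedy  # the prompt token was also the model's argmax at this position
```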
@@ -698,3 +698,14 @@ def replace_placeholders(
    # Add the last part of the string
    result.append(parts[-1])
    return "".join(result)
+
+
+def flatten_image_list(images: List[List]):
+    """
+    Takes in a list of lists of images, and returns a single list of all images in order.
+    Used for some multimodal models like Llava-1.5 which expects this flattened-list format for its image processor.
+
+    :param images: A list of lists of PIL images.
+    :return: a list of PIL images, via concatenating all the sub-lists in order.
+    """
+    return [image for image_list in images for image in image_list]
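As the docstring above says, the new helper is a plain one-level flatten over the per-request image lists; a quick sketch of the expected behaviour, using stand-in objects in place of PIL images:

```python
# stand-ins for PIL images; only the nesting shape matters here
batch = [["img_0_a", "img_0_b"], ["img_1_a"]]
flat = [image for image_list in batch for image in image_list]  # what flatten_image_list returns
assert flat == ["img_0_a", "img_0_b", "img_1_a"]
```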
@@ -7,9 +7,9 @@ from tqdm import tqdm
 
 from lm_eval.api.instance import Instance
 from lm_eval.api.registry import register_model
-from lm_eval.models.utils import Collator, undistribute
+from lm_eval.models.utils import Collator, replace_placeholders, undistribute
 from lm_eval.models.vllm_causallms import VLLM
-from lm_eval.utils import simple_parse_args_string
+from lm_eval.utils import eval_logger
 
 try:
@@ -36,10 +36,11 @@ class VLLM_VLM(VLLM):
        interleave: bool = True,
        # TODO<baber>: handle max_images and limit_mm_per_prompt better
        max_images: int = 999,
-        limit_mm_per_prompt: str = "image=1",
        **kwargs,
    ):
-        kwargs["limit_mm_per_prompt"] = simple_parse_args_string(limit_mm_per_prompt)
+        if max_images != 999:
+            kwargs["limit_mm_per_prompt"] = {"image": max_images}
+            eval_logger.info(f"Setting limit_mm_per_prompt[image] to {max_images}")
        super().__init__(
            pretrained=pretrained,
            trust_remote_code=trust_remote_code,
@@ -63,6 +64,17 @@ class VLLM_VLM(VLLM):
        truncation: bool = False,
    ):
        images = [img[: self.max_images] for img in images]
+        # TODO<baber>: is the default placeholder always <image>?
+        if self.chat_applied is False:
+            strings = [
+                replace_placeholders(
+                    string,
+                    DEFAULT_IMAGE_PLACEHOLDER,
+                    DEFAULT_IMAGE_PLACEHOLDER,
+                    self.max_images,
+                )
+                for string in strings
+            ]
 
        outputs = []
        for x, i in zip(strings, images):
......
@@ -18,6 +18,7 @@
 | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
 | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
 | [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
+| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
 | [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
 | [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
 | [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
@@ -25,6 +26,7 @@
 | [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
 | [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
 | [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
+| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
 | [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
 | [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
 | code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
@@ -42,6 +44,7 @@
 | [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
 | [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
 | [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French|
+| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
 | [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
 | [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
 | [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
@@ -86,6 +89,7 @@
 | [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
 | [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
 | [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
+| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
 | [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
 | [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
 | [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
@@ -95,6 +99,7 @@
 | [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
 | [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
 | [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
+| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
 | [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
 | [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
 | [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
@@ -107,6 +112,7 @@
 | [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
 | [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
 | [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
+| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
 | [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
 | [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
 | [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
......
@@ -40,7 +40,11 @@ class TaskManager:
            [x for x in self._all_tasks if self._task_index[x]["type"] == "group"]
        )
        self._all_subtasks = sorted(
-            [x for x in self._all_tasks if self._task_index[x]["type"] == "task"]
+            [
+                x
+                for x in self._all_tasks
+                if self._task_index[x]["type"] in ["task", "python_task"]
+            ]
        )
        self._all_tags = sorted(
            [x for x in self._all_tasks if self._task_index[x]["type"] == "tag"]
@@ -271,7 +275,7 @@ class TaskManager:
            task_object = config["class"]()
            if isinstance(task_object, ConfigurableTask):
                # very scuffed: set task name here. TODO: fixme?
-                task_object.config.task = config["task"]
+                task_object.config.task = task
        else:
            task_object = ConfigurableTask(config=config)
@@ -436,6 +440,30 @@ class TaskManager:
        :return
            Dictionary of task names as key and task metadata
        """
+
+        def _populate_tags_and_groups(config, task, tasks_and_groups, print_info):
+            # TODO: remove group in next release
+            if "tag" in config:
+                attr_list = config["tag"]
+                if isinstance(attr_list, str):
+                    attr_list = [attr_list]
+
+                for tag in attr_list:
+                    if tag not in tasks_and_groups:
+                        tasks_and_groups[tag] = {
+                            "type": "tag",
+                            "task": [task],
+                            "yaml_path": -1,
+                        }
+                    elif tasks_and_groups[tag]["type"] != "tag":
+                        self.logger.info(
+                            f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
+                            "This may affect tasks you want to call."
+                        )
+                        break
+                    else:
+                        tasks_and_groups[tag]["task"].append(task)
+
        # TODO: remove group in next release
        print_info = True
        ignore_dirs = [
...@@ -451,10 +479,14 @@ class TaskManager: ...@@ -451,10 +479,14 @@ class TaskManager:
config = utils.load_yaml_config(yaml_path, mode="simple") config = utils.load_yaml_config(yaml_path, mode="simple")
if self._config_is_python_task(config): if self._config_is_python_task(config):
# This is a python class config # This is a python class config
tasks_and_groups[config["task"]] = { task = config["task"]
tasks_and_groups[task] = {
"type": "python_task", "type": "python_task",
"yaml_path": yaml_path, "yaml_path": yaml_path,
} }
_populate_tags_and_groups(
config, task, tasks_and_groups, print_info
)
elif self._config_is_group(config): elif self._config_is_group(config):
# This is a group config # This is a group config
tasks_and_groups[config["group"]] = { tasks_and_groups[config["group"]] = {
...@@ -483,41 +515,9 @@ class TaskManager: ...@@ -483,41 +515,9 @@ class TaskManager:
"type": "task", "type": "task",
"yaml_path": yaml_path, "yaml_path": yaml_path,
} }
_populate_tags_and_groups(
# TODO: remove group in next release config, task, tasks_and_groups, print_info
for attr in ["tag", "group"]: )
if attr in config:
if attr == "group" and print_info:
self.logger.info(
"`group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. "
"The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. "
"`group`s which aggregate across subtasks must be only defined in a separate group config file, "
"which will be the official way to create groups that support cross-task aggregation as in `mmlu`. "
"Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs "
"for more information."
)
print_info = False
# attr = "tag"
attr_list = config[attr]
if isinstance(attr_list, str):
attr_list = [attr_list]
for tag in attr_list:
if tag not in tasks_and_groups:
tasks_and_groups[tag] = {
"type": "tag",
"task": [task],
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
self.logger.info(
f"The tag {tag} is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
)
break
else:
tasks_and_groups[tag]["task"].append(task)
else: else:
self.logger.debug(f"File {f} in {root} could not be loaded") self.logger.debug(f"File {f} in {root} could not be loaded")
......
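For context on the refactor above: tag registration that previously lived inline in the YAML-indexing loop is factored into `_populate_tags_and_groups`, so both python-task and plain-task configs share it. A minimal sketch of the index shape it produces (the task and tag names below are hypothetical, not taken from this diff):

```python
# Hypothetical entries; the structure mirrors the dict built in _populate_tags_and_groups above.
tasks_and_groups = {
    "my_task": {"type": "task", "yaml_path": "/configs/my_task.yaml"},
    "my_tag": {"type": "tag", "task": ["my_task"], "yaml_path": -1},  # tags have no yaml file of their own
}
```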
# BasqueBench
### Paper
BasqueBench is a benchmark for evaluating language models on Basque tasks. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.
The new evaluation datasets included in BasqueBench are:
| Task | Category | Homepage |
|:-------------:|:-----:|:-----:|
| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu |
| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
The datasets included in BasqueBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| Belebele_eu | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
| EusExams | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusExams |
| EusProficiency | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusProficiency |
| EusReading | Reading Comprehension | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusReading |
| EusTrivia | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusTrivia |
| FLORES_eu | Translation | [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) | https://huggingface.co/datasets/facebook/flores |
| QNLIeu | Natural Language Inference | [BasqueGLUE: A Natural Language Understanding Benchmark for Basque](https://aclanthology.org/2022.lrec-1.172/) | https://huggingface.co/datasets/orai-nlp/basqueGLUE |
| XNLIeu | Natural Language Inference | [XNLIeu: a dataset for cross-lingual NLI in Basque](https://arxiv.org/abs/2404.06996) | https://huggingface.co/datasets/HiTZ/xnli-eu |
| XStoryCloze_eu | Commonsense Reasoning | [Few-shot Learning with Multilingual Generative Language Models](https://aclanthology.org/2022.emnlp-main.616/) | https://huggingface.co/datasets/juletxara/xstory_cloze |
### Citation
Paper for BasqueBench coming soon.
### Groups and Tasks
#### Groups
- `basque_bench`: All tasks included in BasqueBench.
- `flores_eu`: All FLORES translation tasks from or to Basque.
#### Tasks
The following tasks evaluate models on the BasqueBench datasets using various scoring methods; a minimal usage sketch follows the task lists below.
- `belebele_eus_Latn`
- `eus_exams_eu`
- `eus_proficiency`
- `eus_reading`
- `eus_trivia`
- `flores_eu`
- `flores_eu-ca`
- `flores_eu-de`
- `flores_eu-en`
- `flores_eu-es`
- `flores_eu-fr`
- `flores_eu-gl`
- `flores_eu-it`
- `flores_eu-pt`
- `flores_ca-eu`
- `flores_de-eu`
- `flores_en-eu`
- `flores_es-eu`
- `flores_fr-eu`
- `flores_gl-eu`
- `flores_it-eu`
- `flores_pt-eu`
- `mgsm_direct_eu`
- `mgsm_native_cot_eu`
- `qnlieu`
- `wnli_eu`
- `xcopa_eu`
- `xnli_eu`
- `xnli_eu_native`
- `xstorycloze_eu`
Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. These are:
- `belebele_eus_Latn`: Belebele Basque
- `qnlieu`: From BasqueGLUE
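As a rough usage sketch (not part of the official benchmark description), the whole group can be run through the harness's Python API; the checkpoint below is only a small example model, not a recommendation:

```python
import lm_eval

# Evaluate every BasqueBench subtask by requesting the group name.
# EleutherAI/pythia-70m is used purely as a small example checkpoint.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-70m",
    tasks=["basque_bench"],
)

# Aggregated and per-task scores are returned under the "results" key.
print(results["results"])
```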
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: basque_bench
task:
- belebele_eus_Latn
- xstorycloze_eu
- flores_eu
- eus_reading
- eus_proficiency
- eus_trivia
- eus_exams_eu
- qnlieu
- xnli_eu
- xnli_eu_native
- wnli_eu
- xcopa_eu
- mgsm_direct_eu
- mgsm_native_cot_eu
metadata:
version: 1.0
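As a quick sanity check (a sketch assuming the public `TaskManager` interface touched in the diff above), one can confirm the new group is picked up once this config is indexed:

```python
from lm_eval.tasks import TaskManager

# Build the task index from the configs shipped with the harness.
task_manager = TaskManager()

# The group and its subtasks should now be resolvable by name.
print("basque_bench" in task_manager.all_tasks)
print(task_manager.match_tasks(["basque_bench"]))
```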